Dataset Exploration Using pandas / NumPy / Plotly

In [28]:
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objects as go
import sklearn
from sklearn.datasets import load_iris, load_diabetes

# Iris

In [29]:
# Load the Iris dataset bundled with scikit-learn; the next cell reads its
# 'data', 'target' and 'feature_names' entries.
iris = load_iris()

In [30]:
# Pull the feature matrix, the class labels and the column names out of the dataset
X = iris['data']
y = iris['target']
names = iris['feature_names']

In [31]:
# Scatter-plot matrix of the four Iris measurements, coloured by class label.
axis_labels = ['sepal length', 'sepal width', 'petal length', 'petal width']
splom_dimensions = [
    dict(label=axis_label, values=X[:, column])
    for column, axis_label in enumerate(axis_labels)
]

splom_trace = go.Splom(
    dimensions=splom_dimensions,
    text=y,
    marker=dict(
        color=y,
        showscale=False,  # colors encode categorical variables
        line=dict(color='white', width=0.5),
    ),
)
fig = go.Figure(data=splom_trace)

fig.update_layout(
    title='Iris Data set',
    dragmode='select',
    width=950,
    height=800,
    hovermode='closest',
)

fig.show()

In [57]:
# Combine the Iris features and the class labels into a single DataFrame.
# The label column must be called 'classes': the crosstab and pivot_table cells
# further down index df['classes'], so any other name raises a KeyError when the
# notebook is run top to bottom.
# Note: concatenating with the float feature matrix stores the labels as floats
# (0.0, 1.0, 2.0).
df = pd.DataFrame(
    np.concatenate((X, y.reshape(len(y), 1)), axis=1),
    columns=names + ['classes'],
)

In [74]:
def df_from_xy(X, y, feature_names):
    """Build one DataFrame holding the features and the target side by side.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of length n_samples
        Target values; appended as the last column, named ``'target'``.
    feature_names : sequence of str
        One column name per feature. Any sequence works (list, tuple,
        NumPy array), not only a list.

    Returns
    -------
    pandas.DataFrame
        ``n_features + 1`` columns: the features followed by ``'target'``.
        Because everything is concatenated into a single array, all columns
        share one dtype (an integer target becomes float next to float features).
    """
    features = np.asarray(X)
    target = np.asarray(y)
    # Turn the target into a column vector so it can be appended along axis 1.
    target_column = target.reshape(len(target), 1)
    # list(...) keeps the concatenation well-defined for tuples and NumPy arrays,
    # where `+ ['target']` would fail or broadcast instead of appending.
    columns = list(feature_names) + ['target']
    return pd.DataFrame(np.concatenate((features, target_column), axis=1), columns=columns)

In [62]:
# Column dtypes and non-null counts of the Iris frame.
# NOTE(review): the recorded output already lists the two '... Bins' columns,
# which are only created by the binning cell further down; re-run the notebook
# in order to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 7 columns):
sepal length (cm)    150 non-null float64
sepal width (cm)     150 non-null float64
petal length (cm)    150 non-null float64
petal width (cm)     150 non-null float64
classes              150 non-null float64
Petal Width Bins     150 non-null int64
Sepal Width Bins     150 non-null int64
dtypes: float64(5), int64(2)
memory usage: 8.3 KB


In [63]:
# Summary statistics (count, mean, std, quartiles, min/max) for every numeric column.
# NOTE(review): the recorded output includes the '... Bins' columns that are
# only created by the binning cell further down.
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),classes,Petal Width Bins,Sepal Width Bins
count,150.0,150.0,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0,0.973333,0.786667
std,0.828066,0.435866,1.765298,0.762238,0.819232,0.802234,0.608515
min,4.3,2.0,1.0,0.1,0.0,0.0,0.0
25%,5.1,2.8,1.6,0.3,0.0,0.0,0.0
50%,5.8,3.0,4.35,1.3,1.0,1.0,1.0
75%,6.4,3.3,5.1,1.8,2.0,2.0,1.0
max,7.9,4.4,6.9,2.5,2.0,2.0,2.0


In [59]:
# Create discrete data from continuous data:
# split each width measurement into 3 equal-width intervals and store the
# integer bin index (labels=False) in a new column.
for source_column, bin_column in (
    ('petal width (cm)', 'Petal Width Bins'),
    ('sepal width (cm)', 'Sepal Width Bins'),
):
    df[bin_column] = pd.cut(df[source_column], 3, labels=False)

In [60]:
# Display the Iris frame, now including the two bin columns.
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),classes,Petal Width Bins,Sepal Width Bins
0,5.1,3.5,1.4,0.2,0.0,0,1
1,4.9,3.0,1.4,0.2,0.0,0,1
2,4.7,3.2,1.3,0.2,0.0,0,1
3,4.6,3.1,1.5,0.2,0.0,0,1
4,5.0,3.6,1.4,0.2,0.0,0,1
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0,2,1
146,6.3,2.5,5.0,1.9,2.0,2,0
147,6.5,3.0,5.2,2.0,2.0,2,1
148,6.2,3.4,5.4,2.3,2.0,2,1


In [64]:
# Categorical analysis: number of rows that fall into each petal-width bin
df['Petal Width Bins'].value_counts()

1    54
0    50
2    46
Name: Petal Width Bins, dtype: int64

In [67]:
# Summary tables: how the width bins line up with the class labels.
petal_bins_by_class = pd.crosstab(df['Petal Width Bins'], df['classes'])
# margins=True appends the 'All' row/column totals
sepal_bins_by_class = pd.crosstab(df['Sepal Width Bins'], df['classes'], margins=True)

print(petal_bins_by_class)
print('-------------------------------')
print(sepal_bins_by_class)

classes           0.0  1.0  2.0
Petal Width Bins               
0                  50    0    0
1                   0   49    5
2                   0    1   45
-------------------------------
classes           0.0  1.0  2.0  All
Sepal Width Bins                    
0                   1   27   19   47
1                  36   23   29   88
2                  13    0    2   15
All                50   50   50  150


In [66]:
# Mean of each measurement within each class
measurement_columns = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
df.pivot_table(values=measurement_columns, index=['classes'], aggfunc='mean')

Unnamed: 0_level_0,petal length (cm),petal width (cm),sepal length (cm),sepal width (cm)
classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,1.462,0.246,5.006,3.428
1.0,4.26,1.326,5.936,2.77
2.0,5.552,2.026,6.588,2.974


# Diabetes

In [68]:
# Load the diabetes dataset bundled with scikit-learn; the next cell reads its
# 'data', 'target' and 'feature_names' entries.
diabetes = load_diabetes()

In [69]:
# Rebind X, y and names to the diabetes data (they previously held the Iris data)
X = diabetes['data']
y = diabetes['target']
names = diabetes['feature_names']

In [70]:
# Dimensions of the diabetes feature matrix
X.shape

(442, 10)

In [75]:
# Rebuild df from the diabetes features and target (replaces the Iris frame)
df = df_from_xy(X=X, y=y, feature_names=names)

In [76]:
# Display the diabetes frame: the feature columns followed by 'target'.
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930,220.0


In [86]:
# Summary statistics, rounded to four decimals so the near-zero means are readable
diabetes_summary = df.describe()
diabetes_summary.round(decimals=4)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,152.1335
std,0.0476,0.0476,0.0476,0.0476,0.0476,0.0476,0.0476,0.0476,0.0476,0.0476,77.093
min,-0.1072,-0.0446,-0.0903,-0.1124,-0.1268,-0.1156,-0.1023,-0.0764,-0.1261,-0.1378,25.0
25%,-0.0373,-0.0446,-0.0342,-0.0367,-0.0342,-0.0304,-0.0351,-0.0395,-0.0332,-0.0332,87.0
50%,0.0054,-0.0446,-0.0073,-0.0057,-0.0043,-0.0038,-0.0066,-0.0026,-0.0019,-0.0011,140.5
75%,0.0381,0.0507,0.0312,0.0356,0.0284,0.0298,0.0293,0.0343,0.0324,0.0279,211.5
max,0.1107,0.0507,0.1706,0.132,0.1539,0.1988,0.1812,0.1852,0.1336,0.1356,346.0


In [89]:
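# The features are already mean-centered and rescaled: in the summary above every
# feature has mean ~0 and the same std (~0.0476), so they are not unit-variance
# standardized. The target is left on its original scale.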

In [90]:
# Pairwise Pearson correlation between all columns (features and target)
df.corr(method='pearson')

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
age,1.0,0.173737,0.185085,0.335427,0.260061,0.219243,-0.075181,0.203841,0.270777,0.301731,0.187889
sex,0.173737,1.0,0.088161,0.241013,0.035277,0.142637,-0.37909,0.332115,0.149918,0.208133,0.043062
bmi,0.185085,0.088161,1.0,0.395415,0.249777,0.26117,-0.366811,0.413807,0.446159,0.38868,0.58645
bp,0.335427,0.241013,0.395415,1.0,0.24247,0.185558,-0.178761,0.257653,0.393478,0.390429,0.441484
s1,0.260061,0.035277,0.249777,0.24247,1.0,0.896663,0.051519,0.542207,0.515501,0.325717,0.212022
s2,0.219243,0.142637,0.26117,0.185558,0.896663,1.0,-0.196455,0.659817,0.318353,0.2906,0.174054
s3,-0.075181,-0.37909,-0.366811,-0.178761,0.051519,-0.196455,1.0,-0.738493,-0.398577,-0.273697,-0.394789
s4,0.203841,0.332115,0.413807,0.257653,0.542207,0.659817,-0.738493,1.0,0.617857,0.417212,0.430453
s5,0.270777,0.149918,0.446159,0.393478,0.515501,0.318353,-0.398577,0.617857,1.0,0.46467,0.565883
s6,0.301731,0.208133,0.38868,0.390429,0.325717,0.2906,-0.273697,0.417212,0.46467,1.0,0.382483


In [84]:
# Count the NA values per feature: flag missing cells, then sum down each column
missing_mask = df.isna()
missing_mask.sum(axis=0)

age       0
sex       0
bmi       0
bp        0
s1        0
s2        0
s3        0
s4        0
s5        0
s6        0
target    0
dtype: int64

In [95]:
def histogram(feature, title=None):
    """Render an interactive Plotly histogram of ``feature``.

    Parameters
    ----------
    feature : array-like
        Values to bin; passed unchanged as ``x`` to ``go.Histogram``.
    title : str, optional
        Figure title. No title is set when omitted.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Notes
    -----
    When ``feature`` has a non-None ``name`` attribute (for example a pandas
    Series selected from a DataFrame), that name is used as the x-axis label
    so the figure can be read on its own.
    """
    fig = go.Figure(data=go.Histogram(x=feature))

    # Only set layout entries we actually have a value for.
    layout_options = {'yaxis_title_text': 'count'}
    axis_label = getattr(feature, 'name', None)
    if axis_label is not None:
        layout_options['xaxis_title_text'] = str(axis_label)
    if title is not None:
        layout_options['title_text'] = str(title)
    fig.update_layout(**layout_options)

    fig.show()

In [96]:
# Distribution of the 'age' column of the current df
histogram(df['age'])