In [6]:
import pandas as pnd
import numpy as np
from sklearn import preprocessing
from sklearn import neighbors, datasets
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import operator

from sklearn.cross_validation import KFold

import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.metrics import confusion_matrix

from sklearn import tree
import seaborn as sns

from IPython.display import Image

%matplotlib inline

In [7]:
# function for confusion matrix building
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
    

def visualize_tree(tr, feature_names):
    """Create tree png using graphviz.

    Args
    ----
    tree -- scikit-learn DecsisionTree.
    feature_names -- list of feature names.
    """
    with open("dt.dot", 'w') as f:
        tree.export_graphviz(tr, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]

In [20]:
#  add the rows names
header_row = ['age','sex','chest_pain','blood pressure','serum_cholestoral','fasting_blood_sugar',\
               'electrocardiographic','max_heart_rate','induced_angina','ST_depression','slope','vessels','thal','diagnosis']

# read csv file with Cleveland heart diseases data
heart = pnd.read_csv('../data/tubes2_HeartDisease_train.csv', skiprows=1,names=header_row)
heart[:5]

Unnamed: 0,age,sex,chest_pain,blood pressure,serum_cholestoral,fasting_blood_sugar,electrocardiographic,max_heart_rate,induced_angina,ST_depression,slope,vessels,thal,diagnosis
0,54,1,4,125,216,0,0,140,0,0.0,?,?,?,1
1,55,1,4,158,217,0,0,110,1,2.5,2,?,?,1
2,54,0,3,135,304,1,0,170,0,0.0,1,0,3,0
3,48,0,3,120,195,0,0,125,0,0.0,?,?,?,0
4,50,1,4,120,0,0,1,156,1,0.0,1,?,6,3


In [21]:
len(heart)

779

In [22]:
# get statistics values
heart.describe()

Unnamed: 0,age,sex,chest_pain,diagnosis
count,779.0,779.0,779.0,779.0
mean,53.509628,0.779204,3.264442,0.98973
std,9.505017,0.41505,0.926284,1.138211
min,28.0,0.0,1.0,0.0
25%,47.0,1.0,3.0,0.0
50%,54.0,1.0,4.0,1.0
75%,60.0,1.0,4.0,2.0
max,77.0,1.0,4.0,4.0


In [23]:
# get number of people with symptom X_k
names_descr = dict()
categorical_columns = ["sex", "chest_pain", "fasting_blood_sugar", "electrocardiographic", "induced_angina", "slope", "vessels", \
                       "thal", "diagnosis"]
for c in categorical_columns:
    print  heart.groupby([c])["age"].count()

sex
0    172
1    607
Name: age, dtype: int64
chest_pain
1     38
2    144
3    171
4    426
Name: age, dtype: int64
fasting_blood_sugar
0    590
1    111
?     78
Name: age, dtype: int64
electrocardiographic
0    469
1    147
2    161
?      1
Name: age, dtype: int64
induced_angina
0    447
1    288
?     44
Name: age, dtype: int64
slope
1    176
2    288
3     53
?    262
Name: age, dtype: int64
vessels
0    153
1     59
2     36
3     17
?    514
Name: age, dtype: int64
thal
3    173
6     39
7    159
?    408
Name: age, dtype: int64
diagnosis
0    349
1    225
2     92
3     90
4     23
Name: age, dtype: int64


In [24]:
# we have unknown values '?'
# change unrecognized value '?' into mean value through the column
for c in heart.columns[:-1]:
    heart[c] = heart[c].apply(lambda x: heart[heart[c]!='?'][c].astype(float).mean() if x == "?" else x)
    heart[c] = heart[c].astype(float)

In [25]:
# print the types of heart disease
set(heart.loc[:, "diagnosis"].values)

{0, 1, 2, 3, 4}

In [27]:
#  let's calculate similarities between 1,2,3,4 levels 
# we will use euclidean distance 
vecs_1 = heart[heart["diagnosis"] == 1 ].median().values[:-2]
vecs_2 = heart[heart["diagnosis"] == 2 ].median().values[:-2]
vecs_3 = heart[heart["diagnosis"] == 3 ].median().values[:-2]
vecs_4 = heart[heart["diagnosis"] == 4 ].median().values[:-2]

In [28]:
vecs_1

array([ 54.        ,   1.        ,   4.        , 130.        ,
       229.        ,   0.        ,   0.        , 131.        ,
         1.        ,   1.        ,   2.        ,   0.68679245])

In [30]:
print "Similarity between type 1 and type 2 is ", np.linalg.norm(vecs_1-vecs_2)
print "Similarity between type 1 and type 3 is ", np.linalg.norm(vecs_1-vecs_3)
print "Similarity between type 1 and type 4 is ", np.linalg.norm(vecs_1-vecs_4)
print "Similarity between type 2 and type 3 is ", np.linalg.norm(vecs_2-vecs_3)
print "Similarity between type 2 and type 4 is ", np.linalg.norm(vecs_2-vecs_4)
print "Similarity between type 3 and type 4 is ", np.linalg.norm(vecs_3-vecs_4)

Similarity between type 1 and type 2 is  32.844300767565855
Similarity between type 1 and type 3 is  21.13757142758485
Similarity between type 1 and type 4 is  10.228797713547035
Similarity between type 2 and type 3 is  14.364536497079554
Similarity between type 2 and type 4 is  33.73320874446489
Similarity between type 3 and type 4 is  20.646746448736206


In [31]:
sim = {"(1,2)": np.linalg.norm(vecs_1-vecs_2), \
       "(1,3)": np.linalg.norm(vecs_1-vecs_3),\
       "(1,4)": np.linalg.norm(vecs_1-vecs_4),\
       "(2,3)": np.linalg.norm(vecs_2-vecs_3),\
       "(2,4)": np.linalg.norm(vecs_2-vecs_4),\
       "(3,4)": np.linalg.norm(vecs_3-vecs_4)    
      }

In [32]:
# sort by the closest types
sorted_sim = sorted(sim.items(), key=operator.itemgetter(1))
sorted_sim

[('(1,4)', 10.228797713547035),
 ('(2,3)', 14.364536497079554),
 ('(3,4)', 20.646746448736206),
 ('(1,3)', 21.13757142758485),
 ('(1,2)', 32.844300767565855),
 ('(2,4)', 33.73320874446489)]

In [33]:
# we can compare the types of heart disease by using values of each feature separately 

heart_d = heart[heart["diagnosis"] >= 1 ]
heart_d[:5]

Unnamed: 0,age,sex,chest_pain,blood pressure,serum_cholestoral,fasting_blood_sugar,electrocardiographic,max_heart_rate,induced_angina,ST_depression,slope,vessels,thal,diagnosis
0,54.0,1.0,4.0,125.0,216.0,0.0,0.0,140.0,0.0,0.0,1.762089,0.686792,5.02965,1
1,55.0,1.0,4.0,158.0,217.0,0.0,0.0,110.0,1.0,2.5,2.0,0.686792,5.02965,1
4,50.0,1.0,4.0,120.0,0.0,0.0,1.0,156.0,1.0,0.0,1.0,0.686792,6.0,3
6,63.0,1.0,4.0,130.0,308.0,0.0,0.0,138.0,1.0,2.0,2.0,0.686792,5.02965,2
10,58.0,1.0,4.0,115.0,0.0,0.158345,0.0,138.0,0.0,0.5,1.0,0.686792,5.02965,1


In [34]:
print "Minimum age to Maximum age per disease type"

heart_d.groupby(["diagnosis", ])["age"].min().astype(str) + ', ' +  heart_d.groupby(["diagnosis", ])["age"].max().astype(str)

Minimum age to Maximum age per disease type


diagnosis
1    31.0, 75.0
2    38.0, 74.0
3    35.0, 77.0
4    38.0, 77.0
Name: age, dtype: object

In [35]:
print "Mean age per disease type"
heart_d.groupby(["diagnosis", ])["age"].mean()

Mean age per disease type


diagnosis
1    53.435556
2    57.695652
3    59.500000
4    60.217391
Name: age, dtype: float64

In [36]:
print "Count each sex per heart disease type"
heart_d.groupby(["diagnosis", "sex"])["age"].count()

Count each sex per heart disease type


diagnosis  sex
1          0.0     27
           1.0    198
2          0.0      8
           1.0     84
3          0.0      8
           1.0     82
4          0.0      2
           1.0     21
Name: age, dtype: int64

In [37]:
print "Count each chest pain value per heart disease type"
heart_d.groupby(["diagnosis", "chest_pain"])["age"].count()

Count each chest pain value per heart disease type


diagnosis  chest_pain
1          1.0             9
           2.0            16
           3.0            29
           4.0           171
2          1.0             3
           2.0             1
           3.0            12
           4.0            76
3          1.0             3
           2.0             3
           3.0            14
           4.0            70
4          1.0             1
           3.0             4
           4.0            18
Name: age, dtype: int64

In [38]:
print "Minimum blood pressure to Maximum  blood pressure per disease type"

heart_d.groupby(["diagnosis"])["blood pressure"].min().astype(str) + ', ' +  heart_d.groupby(["diagnosis"])["blood pressure"].max().astype(str)


Minimum blood pressure to Maximum  blood pressure per disease type


diagnosis
1     92.0, 200.0
2     95.0, 180.0
3      0.0, 200.0
4    104.0, 190.0
Name: blood pressure, dtype: object

In [39]:
print "Mean blood pressure per disease type"
heart_d.groupby(["diagnosis", ])["blood pressure"].mean()

Mean blood pressure per disease type


diagnosis
1    133.001166
2    134.497565
3    134.684639
4    139.683060
Name: blood pressure, dtype: float64

In [40]:
print "Minimum serum_cholestoral to Maximum serum_cholestoral per disease type"

heart_d.groupby(["diagnosis"])["serum_cholestoral"].min().astype(str) + ', ' +  heart_d.groupby(["diagnosis"])["serum_cholestoral"].max().astype(str)

Minimum serum_cholestoral to Maximum serum_cholestoral per disease type


diagnosis
1    0.0, 529.0
2    0.0, 384.0
3    0.0, 369.0
4    0.0, 407.0
Name: serum_cholestoral, dtype: object

In [41]:
print "Mean serum_cholestoral per disease type"
heart_d.groupby(["diagnosis", ])["serum_cholestoral"].mean()

Mean serum_cholestoral per disease type


diagnosis
1    196.615776
2    149.398042
3    155.925666
4    205.361301
Name: serum_cholestoral, dtype: float64

In [42]:
print "Count each fasting_blood_sugar per heart disease type"
heart_d.groupby(["diagnosis", "fasting_blood_sugar"])["age"].count()

Count each fasting_blood_sugar per heart disease type


diagnosis  fasting_blood_sugar
1          0.000000               163
           0.158345                34
           1.000000                28
2          0.000000                53
           0.158345                19
           1.000000                20
3          0.000000                57
           0.158345                11
           1.000000                22
4          0.000000                18
           0.158345                 1
           1.000000                 4
Name: age, dtype: int64

In [43]:
print "Count each electrocardiographic per heart disease type"
heart_d.groupby(["diagnosis", "electrocardiographic"])["age"].count()

Count each electrocardiographic per heart disease type


diagnosis  electrocardiographic
1          0.000000                146
           0.603604                  1
           1.000000                 41
           2.000000                 36
2          0.000000                 47
           1.000000                 25
           2.000000                 20
3          0.000000                 42
           1.000000                 25
           2.000000                 23
4          0.000000                  6
           1.000000                  6
           2.000000                 11
Name: age, dtype: int64

In [44]:
print "Minimum max_heart_rate to Maximum max_heart_rate per disease type"

heart_d.groupby(["diagnosis"])["max_heart_rate"].min().astype(str) + ', ' +  heart_d.groupby(["diagnosis"])["max_heart_rate"].max().astype(str)

Minimum max_heart_rate to Maximum max_heart_rate per disease type


diagnosis
1    72.0, 195.0
2    60.0, 180.0
3    63.0, 173.0
4    84.0, 182.0
Name: max_heart_rate, dtype: object

In [45]:
print "Mean max_heart_rate per disease type"
heart_d.groupby(["diagnosis", ])["max_heart_rate"].mean()

Mean max_heart_rate per disease type


diagnosis
1    132.103020
2    129.036883
3    123.609237
4    129.899852
Name: max_heart_rate, dtype: float64

In [46]:
print "Count  induced_angina per heart disease type"
heart_d.groupby(["diagnosis", "induced_angina"])["age"].count()

Count  induced_angina per heart disease type


diagnosis  induced_angina
1          0.000000           88
           0.391837           12
           1.000000          125
2          0.000000           37
           0.391837            4
           1.000000           51
3          0.000000           25
           0.391837           11
           1.000000           54
4          0.000000            9
           0.391837            2
           1.000000           12
Name: age, dtype: int64

In [47]:
print "Count  mean ST_depression per heart disease type"
heart_d.groupby(["diagnosis"])["ST_depression"].mean()

Count  mean ST_depression per heart disease type


diagnosis
1     3.754327
2     6.107061
3     5.802068
4    10.529339
Name: ST_depression, dtype: float64

In [48]:
heart_d[:2]

Unnamed: 0,age,sex,chest_pain,blood pressure,serum_cholestoral,fasting_blood_sugar,electrocardiographic,max_heart_rate,induced_angina,ST_depression,slope,vessels,thal,diagnosis
0,54.0,1.0,4.0,125.0,216.0,0.0,0.0,140.0,0.0,0.0,1.762089,0.686792,5.02965,1
1,55.0,1.0,4.0,158.0,217.0,0.0,0.0,110.0,1.0,2.5,2.0,0.686792,5.02965,1


In [49]:
print "Count slope per heart disease type"
heart_d.groupby(["diagnosis", "slope"])["age"].count()

Count slope per heart disease type


diagnosis  slope   
1          1.000000     33
           1.762089     60
           2.000000    122
           3.000000     10
2          1.000000     15
           1.762089     15
           2.000000     48
           3.000000     14
3          1.000000     16
           1.762089     23
           2.000000     40
           3.000000     11
4          1.000000      3
           1.762089      3
           2.000000     10
           3.000000      7
Name: age, dtype: int64

In [50]:
print "Count  mean vessels per heart disease type"
heart_d.groupby(["diagnosis"])["vessels"].mean()

Count  mean vessels per heart disease type


diagnosis
1    0.711950
2    0.879286
3    0.946751
4    1.097457
Name: vessels, dtype: float64

In [51]:
print "Count  mean thal per heart disease type"
heart_d.groupby(["diagnosis"])["thal"].mean()

Count  mean thal per heart disease type


diagnosis
1    5.320144
2    5.544855
3    5.568074
4    5.665065
Name: thal, dtype: float64

In [52]:
# if "diagnosis" == 0, member does not have disease A - we put 0
# if "diagnosis" >= 1, member possess the disease A - we put 1
heart.loc[:, "diag_int"] = heart.loc[:, "diagnosis"].apply(lambda x: 1 if x >= 1 else 0)

In [53]:
# create the normalizer and fit it 
preprocessing.Normalizer().fit_transform(heart)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').