In [141]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle

In [142]:
cardio_data = pd.read_csv('cardio_train.csv', sep=';')

In [143]:
cardio_data = cardio_data.drop(columns = ['id'])
cardio_data = cardio_data.drop(columns = ['height'])

In [144]:
cardio_data['age'] = cardio_data['age'].map(lambda x: round(x/365))
cardio_data.describe()

Unnamed: 0,age,gender,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,53.338686,1.349571,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,6.765294,0.476838,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,30.0,1.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,48.0,1.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,58.0,2.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,65.0,2.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [145]:
def outliers_iqr(ys):
    quartile1, quartile3 = np.percentile(ys, [25,75])
    iqr = quartile3 - quartile1
    lower_bound = quartile1 - (iqr*3)
    upper_bound = quartile3 + (iqr*3)

    print(f'Q1:{quartile1}, Q3:{quartile3}, IQR:{iqr}')
    print(f'Lower Bound:{lower_bound}, Upper Bound:{upper_bound}')

    result = np.where((ys > upper_bound) | (ys < lower_bound))
    boundary = (lower_bound, upper_bound)

    print(f'Number of outliers: {len(result[0])}')

    return result, boundary
def handel_outliers(dataframe, column):
    lower_bound = outliers_iqr(dataframe[column])[1][0]
    upper_bound = outliers_iqr(dataframe[column])[1][1]
    dataframe.loc[(dataframe[column] < lower_bound), column] = lower_bound
    dataframe.loc[(dataframe[column] > upper_bound), column] = upper_bound

In [146]:
handel_outliers(cardio_data, 'weight')
handel_outliers(cardio_data, 'ap_hi')
handel_outliers(cardio_data, 'ap_lo')

Q1:65.0, Q3:82.0, IQR:17.0
Lower Bound:14.0, Upper Bound:133.0
Number of outliers: 171
Q1:65.0, Q3:82.0, IQR:17.0
Lower Bound:14.0, Upper Bound:133.0
Number of outliers: 171
Q1:120.0, Q3:140.0, IQR:20.0
Lower Bound:60.0, Upper Bound:200.0
Number of outliers: 288
Q1:120.0, Q3:140.0, IQR:20.0
Lower Bound:60.0, Upper Bound:200.0
Number of outliers: 288
Q1:80.0, Q3:90.0, IQR:10.0
Lower Bound:50.0, Upper Bound:120.0
Number of outliers: 1136
Q1:80.0, Q3:90.0, IQR:10.0
Lower Bound:50.0, Upper Bound:120.0
Number of outliers: 1136


In [147]:
cardio_data.describe()

Unnamed: 0,age,gender,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,53.338686,1.349571,74.166433,126.848314,81.8528,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,6.765294,0.476838,14.192985,17.430948,10.46668,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,30.0,1.0,14.0,60.0,50.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,48.0,1.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,58.0,2.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,65.0,2.0,133.0,200.0,120.0,3.0,3.0,1.0,1.0,1.0,1.0


In [148]:
x=cardio_data.drop('cardio',axis=1)
y=cardio_data.cardio
x_test, x_train, y_test, y_train= train_test_split(x,y,test_size = .33,random_state=42)

In [149]:
from sklearn import tree

In [150]:
model = tree.DecisionTreeClassifier(max_depth=12)
model.fit(x_train,y_train)

In [151]:
model.score(x_train,y_train)

0.7841125541125541

In [152]:
model.score(x_test,y_test)

0.7107889125799574

In [153]:
y_pred = model.predict(x_test)

In [154]:
filename = 'model.sav'
pickle.dump(model, open(filename, 'wb'))