In [1]:
#Dependencies
from sklearn import tree
import pandas as pd
import os
import joblib

In [2]:
# Load the cleaned dataset from the project-relative Datasets folder.
df = pd.read_csv(
    'Datasets/cleaned_data.csv',
)
# Show the leading rows as a quick sanity check of the parsed columns.
df.head()

Unnamed: 0,ID,Gender,Age,Caffeine (mg),Alcohol (gm),Weight (kg),Standing Height (cm),Pulse regular or irregular,Systolic: Blood pressure mm Hg,Direct HDL-Cholesterol (mg/dL),Avg Drinks per Day,Smoker?
0,73557,0,69,203.0,0.0,78.3,171.3,0,122.0,65.0,1.0,0.0
1,73558,0,54,240.0,119.0,89.5,176.8,0,156.0,50.0,4.0,1.0
2,73562,0,56,144.0,0.0,105.0,158.7,0,160.0,38.0,1.0,0.0
3,73566,1,56,266.0,22.3,61.8,152.8,0,128.0,59.0,1.0,1.0
4,73567,0,65,43.0,39.1,65.3,172.4,0,140.0,79.0,3.0,1.0


In [3]:
# Human-readable class labels.
# NOTE(review): presumably label 0 = 'regular' and 1 = 'irregular' — confirm
# against the data-cleaning step; this list is not used by the cells shown here.
target_names = ['regular', 'irregular']

# Label series the classifiers below are trained to predict.
target = df['Pulse regular or irregular']

In [4]:
# Feature frame: every column except the label and the 'ID' column.
data = df.drop(columns=['ID', 'Pulse regular or irregular'])

# Keep the column labels so feature importances can be paired with names later.
feature_names = data.columns

data.head()

Unnamed: 0,Gender,Age,Caffeine (mg),Alcohol (gm),Weight (kg),Standing Height (cm),Systolic: Blood pressure mm Hg,Direct HDL-Cholesterol (mg/dL),Avg Drinks per Day,Smoker?
0,0,69,203.0,0.0,78.3,171.3,122.0,65.0,1.0,0.0
1,0,54,240.0,119.0,89.5,176.8,156.0,50.0,4.0,1.0
2,0,56,144.0,0.0,105.0,158.7,160.0,38.0,1.0,0.0
3,1,56,266.0,22.3,61.8,152.8,128.0,59.0,1.0,1.0
4,0,65,43.0,39.1,65.3,172.4,140.0,79.0,3.0,1.0


In [5]:
from sklearn.model_selection import train_test_split
# Partition features and labels into train/test subsets using the library's
# default split proportions; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=74,
)

In [6]:
# Baseline model: a single decision tree trained on the training split.
# random_state is fixed (same seed as the train/test split) because the tree
# builder permutes features at each split; without a seed the fitted tree and
# the score below can change on every Restart & Run All.
clf = tree.DecisionTreeClassifier(random_state=74)
clf = clf.fit(X_train, y_train)

# Mean accuracy on the held-out test split.
clf.score(X_test, y_test)

0.9410112359550562

In [7]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 200 trees trained on the training split.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and per-split feature draws — and therefore the score, importances
# and saved model — are reproducible across kernel restarts.
rf = RandomForestClassifier(n_estimators=200, random_state=74)
rf = rf.fit(X_train, y_train)

# Mean accuracy on the held-out test split.
rf.score(X_test, y_test)

0.9662921348314607

In [8]:
# `rf` has already been fitted and scored in the previous cell, so just alias
# it. Calling rf.fit(...) again here would retrain the forest in place: that
# discards the estimator whose score was just reported (and, when no seed is
# fixed, replaces it with a different one) and doubles the training time.
model = rf

In [9]:
# Rank features by the forest's importance scores, highest first.
# Each element is an (importance, column name) tuple.
sorted(
    zip(rf.feature_importances_, feature_names),
    reverse=True,
)

[(0.1730966429522541, 'Standing Height (cm)'),
 (0.16921384616625348, 'Weight (kg)'),
 (0.15570386253498864, 'Age'),
 (0.14073439280827985, 'Caffeine (mg)'),
 (0.11748793274277008, 'Systolic: Blood pressure mm Hg'),
 (0.10327707227823883, 'Direct HDL-Cholesterol (mg/dL)'),
 (0.05962155081439755, 'Alcohol (gm)'),
 (0.04960992836291953, 'Avg Drinks per Day'),
 (0.018003692018943248, 'Smoker?'),
 (0.013251079320954753, 'Gender')]

In [10]:
# Predicted class for every row of the held-out test split.
predictions = rf.predict(X_test)

# Side-by-side table of predicted vs. true labels, indexed like y_test.
# NOTE(review): the displayed rows include actual-1 cases predicted as 0;
# accuracy alone may hide weak recall on the minority class — worth checking
# with a confusion matrix.
pd.DataFrame(
    {
        "Prediction": predictions,
        "Actual": y_test,
    }
)

Unnamed: 0,Prediction,Actual
1216,0,1
572,0,0
740,0,0
349,0,1
12,0,0
...,...,...
466,0,0
822,0,0
399,0,0
475,0,0


In [11]:
# Serialize the fitted estimator bound to `model` into the working directory.
# joblib.dump returns the list of file names it wrote, which is the cell output.
filename = 'finalized_model.sav'
joblib.dump(value=model, filename=filename)

['finalized_model.sav']

In [12]:
# Load the reduced-column variant of the cleaned dataset.
df_reduced = pd.read_csv(
    'Datasets/cleaned_data_reduced.csv',
)
# Show the leading rows as a quick sanity check of the parsed columns.
df_reduced.head()

Unnamed: 0,ID,Gender,Age,Caffeine (mg),Alcohol (gm),Weight (kg),Standing Height (cm),Pulse regular or irregular,Systolic: Blood pressure mm Hg,Direct HDL-Cholesterol (mg/dL)
0,73557,0,69,203.0,0.0,78.3,171.3,0,122.0,65.0
1,73558,0,54,240.0,119.0,89.5,176.8,0,156.0,50.0
2,73559,0,72,45.0,0.0,88.9,175.3,0,140.0,60.0
3,73560,0,9,0.0,0.0,32.2,137.3,0,108.0,61.0
4,73561,1,73,24.0,0.0,52.0,162.4,0,136.0,85.0


In [13]:
# Human-readable class labels for the reduced dataset.
# NOTE(review): presumably label 0 = 'regular' and 1 = 'irregular' — confirm
# against the data-cleaning step; this list is not used by the cells shown here.
target_names2 = ['regular', 'irregular']

# Label series for the reduced dataset.
target2 = df_reduced['Pulse regular or irregular']

In [14]:
# Feature frame for the reduced dataset: drop the label and the 'ID' column.
data2 = df_reduced.drop(columns=['ID', 'Pulse regular or irregular'])

# Column labels of the reduced feature frame, for pairing with importances.
feature_names2 = data2.columns

data2.head()

Unnamed: 0,Gender,Age,Caffeine (mg),Alcohol (gm),Weight (kg),Standing Height (cm),Systolic: Blood pressure mm Hg,Direct HDL-Cholesterol (mg/dL)
0,0,69,203.0,0.0,78.3,171.3,122.0,65.0
1,0,54,240.0,119.0,89.5,176.8,156.0,50.0
2,0,72,45.0,0.0,88.9,175.3,140.0,60.0
3,0,9,0.0,0.0,32.2,137.3,108.0,61.0
4,1,73,24.0,0.0,52.0,162.4,136.0,85.0


In [15]:
# Partition the reduced features and labels into train/test subsets using the
# library's default split proportions; the fixed seed keeps it repeatable.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    data2,
    target2,
    random_state=74,
)

In [16]:
# Baseline decision tree on the reduced dataset's training split.
# random_state is fixed (same seed as the train/test split) because the tree
# builder permutes features at each split; without a seed the fitted tree and
# the score below can change on every Restart & Run All.
clf2 = tree.DecisionTreeClassifier(random_state=74)
clf2 = clf2.fit(X_train2, y_train2)

# Mean accuracy on the reduced dataset's held-out test split.
clf2.score(X_test2, y_test2)

0.9673842139595564

In [17]:
# Random forest of 200 trees on the reduced dataset's training split.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and per-split feature draws — and therefore the score, importances
# and predictions below — are reproducible across kernel restarts.
rf2 = RandomForestClassifier(n_estimators=200, random_state=74)
rf2 = rf2.fit(X_train2, y_train2)

# Mean accuracy on the reduced dataset's held-out test split.
rf2.score(X_test2, y_test2)

0.9804305283757339

In [18]:
# Rank the reduced model's features by importance, highest first.
# Pair with feature_names2 (the reduced frame's own columns): feature_names
# belongs to the first dataset and has extra columns, so zip would silently
# truncate it and any difference in column order would mislabel importances.
sorted(zip(rf2.feature_importances_, feature_names2), reverse=True)

[(0.1805430091753232, 'Standing Height (cm)'),
 (0.1700588217057279, 'Weight (kg)'),
 (0.15787461471325434, 'Caffeine (mg)'),
 (0.14774455863525274, 'Age'),
 (0.14198165770561458, 'Direct HDL-Cholesterol (mg/dL)'),
 (0.1223436906635673, 'Systolic: Blood pressure mm Hg'),
 (0.06291005978469727, 'Alcohol (gm)'),
 (0.01654358761656265, 'Gender')]

In [19]:
# Predicted class for every row of the reduced dataset's test split.
predictions2 = rf2.predict(X_test2)

# Side-by-side table of predicted vs. true labels, indexed like y_test2.
pd.DataFrame(
    {
        "Prediction": predictions2,
        "Actual": y_test2,
    }
)

Unnamed: 0,Prediction,Actual
3762,0,0
4291,0,0
3485,0,0
4006,0,0
3498,0,0
...,...,...
4751,0,0
1600,0,0
1287,0,0
3021,0,0
