In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

# Read the training and the test CSV in one pass; the two resulting frames
# are unpacked in the same order as the paths are listed.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "random_tree_data/train_data.csv",
        "random_tree_data/test_data.csv",
    )
)

### Defining used columns and target classes

In [None]:
# Columns of the input frames that are handed to the models as predictors.
features = [
    'gender',
    'age',
    'height(cm)',
    'weight(kg)',
    'waist(cm)',
    'eyesight(left)',
    'eyesight(right)',
    'hearing(left)',
    'hearing(right)',
    'systolic',
    'relaxation',
    'fasting_blood_sugar',
    'cholesterol',
    'triglyceride',
    'hdl',
    'ldl',
    'hemoglobin',
    'urine_protein',
    'serum_creatinine',
    'ast',
    'alt',
    'gtp',
    'dental_caries',
    'tartar',
    'bmi',
    'healthy_weight',
    'eyesight_total',
    'vision',
    'hearing_impairment',
]

# Display labels for the target classes (used when the tree is plotted).
# NOTE(review): assumes the "smoking" column encodes not-smoking as the lower
# class value and smoking as the higher one - confirm against the data.
classNames = ["not smoking", "smoking"]

# Predictor matrix and target vector for training.
X_train = train[features]
Y_train = train["smoking"]

# Bare last expression so the notebook renders the training matrix.
X_train

### Fitting the model on our training data
**We use `random_state=1337` so that the results are reproducible.**

In [None]:
# Random forest with 100 trees; the fixed seed makes repeated fits identical.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=1337,
)
# Fit on the training split; as the last expression, the fitted estimator is
# also what the cell displays.
model.fit(X_train, Y_train)

### Applying the model to our test data

In [None]:
# Build the test matrix with exactly the same column selection as X_train.
# The training frame is passed to the model without one-hot encoding, so the
# test frame must not be encoded either; otherwise the columns seen at
# prediction time could differ from the ones seen at fit time.
# `features` is deliberately left untouched (a plain list of column names)
# instead of being rebound to a pandas Index.
X_test = test[features]

# Fail early if the two matrices are not aligned column-for-column.
assert list(X_test.columns) == list(X_train.columns), \
    "train/test feature columns differ"

# Non-null count per column; kept as the last expression of the cell so the
# sanity check is actually displayed.
X_test.count()

In [None]:
# Predict the class of every test row with the fitted forest.
predictions = model.predict(X=X_test)
# Fraction of test rows whose predicted class equals the true "smoking" value;
# shown as the cell output.
accuracy_score(y_true=test["smoking"], y_pred=predictions)

In [None]:
# Text summary of per-class precision, recall and F1 for the forest.
report = classification_report(
    y_true=test["smoking"],
    y_pred=predictions,
)
print(report)

In [None]:
# Confusion matrix of the forest: rows are true classes, columns are
# predicted classes.
confmatrix = confusion_matrix(
    y_true=test["smoking"],
    y_pred=predictions,
)
print(confmatrix)

In [None]:
# Repeat the random-forest experiment with a much smaller ensemble
# (10 trees instead of 100), using the same seed for reproducibility.
model = RandomForestClassifier(random_state=1337, n_estimators=10)
model.fit(X_train, Y_train)

# X_test built in the earlier cell is reused as-is: rebuilding it here (and
# re-deriving `features`) added nothing, and one-hot encoding only the test
# frame risks a column mismatch with the frame the model was fitted on.
predictions = model.predict(X_test)

# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but never displayed, so the accuracy would otherwise be silently dropped.
print(accuracy_score(test["smoking"], predictions))

report = classification_report(test["smoking"], predictions)
confmatrix = confusion_matrix(test["smoking"], predictions)
print(report)
print(confmatrix)

In [None]:
# Single decision tree limited to depth 3 (this is the tree that is plotted
# further down in the notebook).
# random_state is fixed, as for the forests: scikit-learn permutes the
# features at every split, so ties between equally good splits can be broken
# differently from run to run unless the seed is pinned.
decision_tree = DecisionTreeClassifier(max_depth=3, random_state=1337)
# fit() returns the estimator itself, so `treemodel` and `decision_tree`
# refer to the same fitted object.
treemodel = decision_tree.fit(X_train, Y_train)

In [None]:
# Class predictions of the fitted decision tree for every test row.
predictions_tree = treemodel.predict(X=X_test)

In [None]:
# Accuracy of the decision tree on the test set (shown as the cell output).
accuracy_score(y_true=test["smoking"], y_pred=predictions_tree)

In [None]:
# Draw the fitted tree on an explicit 18x18 inch figure and save it to disk.
# `plt` (matplotlib.pyplot) is imported explicitly at the top of the notebook:
# `import matplotlib` alone does not guarantee that the `pyplot` submodule is
# loaded, so `matplotlib.pyplot.figure(...)` can fail on a fresh kernel.
fig, ax = plt.subplots(figsize=(18, 18))
plot_tree(
    treemodel,
    # Passed as a plain list: `features` may have been rebound to a pandas
    # Index, which some scikit-learn releases reject for this parameter.
    feature_names=list(features),
    class_names=classNames,
    filled=True,
    fontsize=10,
    ax=ax,
)
fig.savefig('decisionTree.png')

In [None]:
# Decision tree without a depth limit, for comparison with the depth-3 tree.
# random_state is fixed so the fitted tree, and therefore the accuracy and the
# ROC curve derived from it, is the same on every run.
decision_tree = DecisionTreeClassifier(random_state=1337)
treemodel = decision_tree.fit(X_train, Y_train)
predictions_tree = treemodel.predict(X_test)
# Accuracy on the test set; last expression, so it is displayed.
accuracy_score(test["smoking"], predictions_tree)

In [None]:
# Keep probabilities for the positive outcome only (second column of
# predict_proba, i.e. the class stored in treemodel.classes_[1]).
# X_test is passed as a DataFrame rather than a bare array so that its column
# names match the ones the tree was fitted with.
y_pred_dt_prob = treemodel.predict_proba(X_test)[:, 1]

# The true labels come straight from the test frame; no separate `y_test`
# variable is defined anywhere in the notebook.
fpr, tpr, _thresholds = roc_curve(
    test["smoking"],
    y_pred_dt_prob,
    pos_label=treemodel.classes_[1],
)

# Plot the ROC curve (false-positive rate vs. true-positive rate).
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()