Test conventional Machine Learning Methods

In [1]:
import pandas as pd
import numpy as np

RANDOM_SEED = 42

np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}".format(x)})

train_df = pd.read_csv('../data/train.csv')
valid_df = pd.read_csv('../data/valid.csv')
test_df = pd.read_csv('../data/test.csv')

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  207 non-null    int64  
 1   age         207 non-null    float64
 2   sex         207 non-null    float64
 3   cp          207 non-null    float64
 4   trestbps    207 non-null    float64
 5   chol        207 non-null    float64
 6   fbs         207 non-null    float64
 7   restecg     207 non-null    float64
 8   thalach     207 non-null    float64
 9   exang       207 non-null    float64
 10  oldpeak     207 non-null    float64
 11  slope       207 non-null    float64
 12  ca          207 non-null    float64
 13  thal        207 non-null    float64
 14  num         207 non-null    int64  
dtypes: float64(13), int64(2)
memory usage: 24.4 KB


Decision Trees

In [2]:
from sklearn import tree

# criterion 'entropy' and max_depth=3 give the best results
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=RANDOM_SEED)
clf.fit(train_df.iloc[:, :13].to_numpy(), train_df.iloc[:, 14].to_numpy())

train_score = clf.score(train_df.iloc[:, :13].to_numpy(), train_df.iloc[:, 14].to_numpy())
valid_score = clf.score(valid_df.iloc[:, :13].to_numpy(), valid_df.iloc[:, 14].to_numpy())
test_score = clf.score(test_df.iloc[:, :13].to_numpy(), test_df.iloc[:, 14].to_numpy())

In [3]:
print(clf.get_depth())

print(clf.feature_importances_)

print(train_score)
print(valid_score)
print(test_score)

3
[0.00 0.08 0.00 0.39 0.00 0.08 0.00 0.00 0.00 0.00 0.30 0.06 0.09]
0.6666666666666666
0.4666666666666667
0.5333333333333333


Random Forests

In [4]:
from sklearn import ensemble

clf = ensemble.RandomForestClassifier(criterion='entropy', max_depth=5, random_state=RANDOM_SEED)
clf.fit(train_df.iloc[:, :13].to_numpy(), train_df.iloc[:, 14].to_numpy())

train_score = clf.score(train_df.iloc[:, :13].to_numpy(), train_df.iloc[:, 14].to_numpy())
valid_score = clf.score(valid_df.iloc[:, :13].to_numpy(), valid_df.iloc[:, 14].to_numpy())
test_score = clf.score(test_df.iloc[:, :13].to_numpy(), test_df.iloc[:, 14].to_numpy())

print(clf.feature_importances_)

print(train_score)
print(valid_score)
print(test_score)


[0.07 0.09 0.03 0.15 0.08 0.10 0.02 0.02 0.12 0.04 0.14 0.04 0.11]
0.8985507246376812
0.5555555555555556
0.6


AdaBoost Classifier

In [5]:
clf = ensemble.AdaBoostClassifier(learning_rate=0.03, random_state=RANDOM_SEED)
clf.fit(train_df.iloc[:, :13].to_numpy(), train_df.iloc[:, 14].to_numpy())

train_score = clf.score(train_df.iloc[:, :13].to_numpy(), train_df.iloc[:, 14].to_numpy())
valid_score = clf.score(valid_df.iloc[:, :13].to_numpy(), valid_df.iloc[:, 14].to_numpy())
test_score = clf.score(test_df.iloc[:, :13].to_numpy(), test_df.iloc[:, 14].to_numpy())

print(clf.feature_importances_)

print(train_score)
print(valid_score)
print(test_score)

[0.00 0.00 0.00 0.44 0.00 0.00 0.00 0.00 0.04 0.00 0.32 0.00 0.20]
0.6280193236714976
0.5333333333333333
0.6


SVM

In [6]:
from sklearn import svm

clf = svm.SVC(kernel='linear', probability=True, random_state=RANDOM_SEED)
clf.fit(train_df.iloc[:, :13].to_numpy(), train_df.iloc[:, 14].to_numpy())

train_score = clf.score(train_df.iloc[:, :13].to_numpy(), train_df.iloc[:, 14].to_numpy())
valid_score = clf.score(valid_df.iloc[:, :13].to_numpy(), valid_df.iloc[:, 14].to_numpy())
test_score = clf.score(test_df.iloc[:, :13].to_numpy(), test_df.iloc[:, 14].to_numpy())

print(train_score)
print(valid_score)
print(test_score)

0.7053140096618358
0.5555555555555556
0.6222222222222222
