In [222]:
from configparser import ConfigParser
import psycopg2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from  sklearn import metrics
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score
from imblearn.over_sampling import SMOTE
import time


Import Data

In [223]:
x_train = pd.read_csv("x_train.csv")#read input data
y_train = pd.read_csv("y_train.csv")#read input data
x_test = pd.read_csv("x_test.csv")#read input data
y_test = pd.read_csv("y_test.csv")#read input data

In [224]:
x_train.drop(columns=x_train.columns[0], axis=1, inplace=True)
y_train.drop(columns=y_train.columns[0], axis=1, inplace=True)
x_test.drop(columns=x_test.columns[0], axis=1, inplace=True)
y_test.drop(columns=y_test.columns[0], axis=1, inplace=True)

Build Model

In [225]:
tic = time.perf_counter()
RandomForest = RandomForestClassifier(n_estimators=1, max_depth=3)
RandomForest.fit(x_train, y_train.values.ravel())
toc = time.perf_counter()
print(f"Model Ran in {toc - tic:0.4f} seconds")
prediction = RandomForest.predict(x_test)

Model Ran in 0.0110 seconds


Precision Accuracy Recall

In [226]:
print("Accuracy:", metrics.accuracy_score(y_test, prediction))
print("Recall - Macro:", metrics.recall_score(y_test, prediction, average="macro"))
print("Precision - Macro:", metrics.precision_score(y_test, prediction, average="macro"))
print("Recall - Micro:", metrics.recall_score(y_test, prediction, average="micro"))
print("Precision - Micro:", metrics.precision_score(y_test, prediction, average="micro"))

Accuracy: 0.8716475095785441
Recall - Macro: 0.7054296257248287
Precision - Macro: 0.708843537414966
Recall - Micro: 0.8716475095785441
Precision - Micro: 0.8716475095785441


  _warn_prf(average, modifier, msg_start, len(result))


View Class Distribution

In [227]:
print(y_train.QOL_Measure.value_counts())
print(y_test.QOL_Measure.value_counts())

5    632
1    262
4    186
3    119
2     17
Name: QOL_Measure, dtype: int64
5    271
1    112
4     80
3     52
2      7
Name: QOL_Measure, dtype: int64


Implement SMOTE on Training Data

In [228]:
sm = SMOTE(random_state=42)
x_res, y_res = sm.fit_resample(x_train, y_train)

View Class Distribution After SMOTE

In [229]:
print(y_res.QOL_Measure.value_counts())
print(y_test.QOL_Measure.value_counts())

5    632
3    632
4    632
2    632
1    632
Name: QOL_Measure, dtype: int64
5    271
1    112
4     80
3     52
2      7
Name: QOL_Measure, dtype: int64


Implement Model

In [230]:
tic = time.perf_counter()
RandomForestSmote = RandomForestClassifier(n_estimators=3, max_depth=3)
RandomForestSmote.fit(x_res, y_res.values.ravel())
toc = time.perf_counter()
print(f"Model Ran in {toc - tic:0.4f} seconds")
predictionSmote = RandomForestSmote.predict(x_test)

Model Ran in 0.0243 seconds


Accuracy Precision Recall

In [231]:
print("Accuracy:", metrics.accuracy_score(y_test, predictionSmote))
print("Recall - Macro:", metrics.recall_score(y_test, predictionSmote, average="macro"))
print("Precision - Macro:", metrics.precision_score(y_test, predictionSmote, average="macro"))
print("Recall - Micro:", metrics.recall_score(y_test, predictionSmote, average="micro"))
print("Precision - Micro:", metrics.precision_score(y_test, predictionSmote, average="micro"))

Accuracy: 0.9176245210727969
Recall - Macro: 0.9209290985767001
Precision - Macro: 0.7923214285714286
Recall - Micro: 0.9176245210727969
Precision - Micro: 0.9176245210727969


Import Reduced Data Set and Apply Smote

In [232]:
x_train_reduced = pd.read_csv("x_train_reduced.csv")#read input data
y_train_reduced = pd.read_csv("y_train_reduced.csv")#read input data
x_test_reduced = pd.read_csv("x_test_reduced.csv")#read input data
y_test_reduced = pd.read_csv("y_test_reduced.csv")#read input data

x_train_reduced.drop(columns=x_train_reduced.columns[0], axis=1, inplace=True)
y_train_reduced.drop(columns=y_train_reduced.columns[0], axis=1, inplace=True)
x_test_reduced.drop(columns=x_test_reduced.columns[0], axis=1, inplace=True)
y_test_reduced.drop(columns=y_test_reduced.columns[0], axis=1, inplace=True)

sm_reduced = SMOTE(random_state=42)
x_res_reduced, y_res_reduced = sm_reduced.fit_resample(x_train_reduced, y_train_reduced)

In [233]:
tic = time.perf_counter()
RandomForestSmoteReduced = RandomForestClassifier(n_estimators=3, max_depth=3)
RandomForestSmoteReduced.fit(x_res_reduced, y_res_reduced.values.ravel())
toc = time.perf_counter()
print(f"Model Ran in {toc - tic:0.4f} seconds")
predictionSmoteReduced = RandomForestSmoteReduced.predict(x_test_reduced)

Model Ran in 0.0159 seconds


Precision Accuracy Recall

In [234]:
print("Accuracy:", metrics.accuracy_score(y_test_reduced, predictionSmoteReduced))
print("Recall - Macro:", metrics.recall_score(y_test_reduced, predictionSmoteReduced, average="macro"))
print("Precision - Macro:", metrics.precision_score(y_test_reduced, predictionSmoteReduced, average="macro"))
print("Recall - Micro:", metrics.recall_score(y_test_reduced, predictionSmoteReduced, average="micro"))
print("Precision - Micro:", metrics.precision_score(y_test_reduced, predictionSmoteReduced, average="micro"))

Accuracy: 0.9272030651340997
Recall - Macro: 0.9395730100158144
Precision - Macro: 0.8366060606060607
Recall - Micro: 0.9272030651340997
Precision - Micro: 0.9272030651340997
