## Base Models

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
AKF_final_test = pd.read_csv('./AKF_final_test.csv.gz', compression='gzip')
AKF_final_test_KNN = pd.read_csv('./AKF_KNN_final_test.csv.gz', compression='gzip')


In [3]:
hadm_features = AKF_final_test.loc[:, '(\'min\', 50861)' : '(\'above_max\', 51498)']
hadm_target = AKF_final_test.loc[:, 'AKF']

In [4]:
# Creatinine - 50912

hadm_features = hadm_features.loc[:, ['(\'max\', 50912)',  '(\'above_max\', 50912)', '(\'mean\', 50912)',  '(\'abn_percent\', 50912)']]
# hadm_features = AKF_final_test.loc[:, '(\'min\', 50861)' : '(\'above_max\', 51498)']
# hadm_target = AKF_final_test.loc[:, 'AKF']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(hadm_features, hadm_target, stratify=hadm_target, random_state=0)


In [6]:
# # # Optional cell for NearMiss
# # from imblearn.under_sampling import NearMiss
# # from imblearn.over_sampling import SMOTE
# # sm = SMOTE()
# # nm = NearMiss()
# # X, y = nm.transform(hadm_features, hadm_target)
# # hadm_features, hadm_target = X, y

# # Simulate NearMiss
# train_data = pd.concat([X_train, y_train], axis=1)
# test_data = pd.concat([X_test, y_test], axis=1)

# AKF = train_data[train_data.AKF==1]
# non_AKF = train_data[train_data.AKF==0]

# non_AKF = non_AKF.sample(n=len(AKF))

# undersampled_train_data = pd.concat([AKF, non_AKF], axis=0)
# undersampled_train_data = undersampled_train_data.sample(frac=1)

# X_train = undersampled_train_data.drop('AKF', axis=1)
# y_train = undersampled_train_data['AKF']

# X_test = test_data.drop('AKF', axis=1)
# y_test = test_data['AKF']


## Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [8]:
%%time

# L2
params_log = [ {'C':np.logspace(-4, 3, 20)} ]
logReg = LogisticRegression(max_iter = 2000, class_weight = "balanced")

# # L1
# params_log = [ {'C':np.logspace(-4, -1, 20)} ]
# logReg = LogisticRegression(penalty = 'l1', max_iter = 2000, class_weight = "balanced", solver = 'liblinear')

grid_log = GridSearchCV(estimator = logReg, param_grid = params_log, cv = 5)
grid_log.fit(X_train, y_train)
grid_log.best_score_

CPU times: user 5.04 s, sys: 831 ms, total: 5.87 s
Wall time: 1.14 s


0.901967213114754

In [9]:
grid_log.best_params_

{'C': 0.4832930238571752}

In [10]:
list(zip(X_train.columns, grid_log.best_estimator_.coef_[0]))


[("('max', 50912)", 5.22565703560579),
 ("('above_max', 50912)", -3.717912049584424),
 ("('mean', 50912)", -1.3959909911841113),
 ("('abn_percent', 50912)", 1.243555740041327)]

In [11]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [20]:
pred = grid_log.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Accuracy Score: 90.07%
_______________________________________________
Classification Report:	Precision Score: 61.58%
			Recall Score: 76.76%
			F1 score: 68.34%
_______________________________________________
Confusion Matrix: 
 [[807  68]
 [ 33 109]]



## Random Forest

In [13]:
from sklearn import ensemble
import sklearn.model_selection as ms

randomForest = ensemble.RandomForestClassifier(class_weight = "balanced")

In [14]:
%%time

grid_para_forest = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(1, 20),
    'n_estimators': range(10, 110, 10)
}

grid_search_forest = ms.GridSearchCV(randomForest, grid_para_forest, scoring='accuracy', cv=5, n_jobs=-1)
grid_search_forest.fit(X_train, y_train)

CPU times: user 7.98 s, sys: 764 ms, total: 8.74 s
Wall time: 44.9 s


GridSearchCV(cv=5, estimator=RandomForestClassifier(class_weight='balanced'),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 20),
                         'n_estimators': range(10, 110, 10)},
             scoring='accuracy')

In [15]:
# Get most important features
tree_final = grid_search_forest.best_estimator_
feature_importance = tree_final.feature_importances_
feature_importance = list(zip(X_train.columns, feature_importance))

feature_importance.sort(key = lambda x: x[1], reverse = True)
feature_importance

[("('max', 50912)", 0.3945927536480366),
 ("('above_max', 50912)", 0.3942217221682857),
 ("('abn_percent', 50912)", 0.15528583495965484),
 ("('mean', 50912)", 0.05589968922402293)]

In [16]:
grid_search_forest.best_params_

{'criterion': 'gini', 'max_depth': 3, 'n_estimators': 30}

In [17]:
grid_search_forest.best_score_

0.9059016393442623

In [18]:
print("The training error is: %.5f" % (grid_search_forest.score(X_train, y_train)))
print("The test     error is: %.5f" % (grid_search_forest.score(X_test, y_test)))

The training error is: 0.90689
The test     error is: 0.90855


In [19]:
rf_predictions = tree_final.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, rf_predictions) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, rf_predictions) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, rf_predictions) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, rf_predictions) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, rf_predictions)}\n")

Accuracy Score: 90.86%
_______________________________________________
Classification Report:	Precision Score: 63.10%
			Recall Score: 83.10%
			F1 score: 71.73%
_______________________________________________
Confusion Matrix: 
 [[806  69]
 [ 24 118]]

