In [1]:
import datetime
from sklearn_utils.utils import SkUtilsIO,filter_by_label
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np
import pandas as pd
from sklearn import preprocessing
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [2]:
X, y = SkUtilsIO('BC.csv').from_csv(
    label_column='stage')
y = ['healthy' if i == 'h' else 'bc' for i in y]

In [3]:
new_y = []
for item in y:
    if item == "bc":
        new_y.append(1)
    else:
        new_y.append(0)
np_y_new = np.array(new_y)

In [4]:
X_df = pd.read_excel('new_analysis.xlsx')

In [5]:
X_train, X_test, y_train, y_test = train_test_split(np.nan_to_num(X_df), y, test_size=0.1, random_state=42)

Logistic Regression

In [6]:
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}
logreg = LogisticRegression(C=0.3e-6)
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)
logreg_cv.fit(X_train,y_train)
# logreg.fit(X_train, y_train)
y_pred = logreg_cv.predict(X_test)
kf = StratifiedKFold(n_splits=10)
# scores = cross_val_score(logreg, np.nan_to_num(X_df), np_y_new, cv=kf, scoring='f1')
log_scores = cross_val_score(logreg_cv, np.nan_to_num(X_df), np_y_new, cv=kf, scoring='f1_micro')
print('kfold test: %s' % log_scores)
print('mean: %s' % log_scores.mean().round(3))
print('std: %s' % log_scores.std().round(3))

kfold test: [0.77272727 0.77272727 0.72727273 0.81818182 0.95454545 0.77272727
 0.95238095 0.85714286 0.85       0.95      ]
mean: 0.843
std: 0.081


Random Forest

In [7]:
rf = RandomForestClassifier(max_depth=4, random_state=0)
rf.fit(X_train, y_train)
kf = StratifiedKFold(n_splits=10)
rf_scores = cross_val_score(rf, np.nan_to_num(X_df), np_y_new, cv=kf, scoring='f1_micro')
print('kfold test: %s' % rf_scores)
print('mean: %s' % rf_scores.mean().round(3))
print('std: %s' % rf_scores.std().round(3))

kfold test: [0.77272727 0.72727273 0.77272727 0.81818182 0.77272727 0.77272727
 0.9047619  0.85714286 0.85       0.9       ]
mean: 0.815
std: 0.058


SVM

In [8]:
svm = SVC(gamma='auto')
svm.fit(X_train, y_train)
kf = StratifiedKFold(n_splits=10)
svm_scores = cross_val_score(svm, np.nan_to_num(X_df), np_y_new, cv=kf, scoring='f1_micro')
print('kfold test: %s' % svm_scores)
print('mean: %s' % svm_scores.mean().round(3))
print('std: %s' % svm_scores.std().round(3))

kfold test: [0.63636364 0.63636364 0.63636364 0.63636364 0.63636364 0.63636364
 0.66666667 0.66666667 0.65       0.65      ]
mean: 0.645
std: 0.012


In [9]:
kf = StratifiedKFold(n_splits=10, random_state=43)
logreg = LogisticRegression(C=0.3e-6)
logreg_cv_split = GridSearchCV(logreg, param_grid, cv=5)
logreg_results = []
for train_index, test_index in kf.split(np.nan_to_num(X_df), np_y_new):
        X_train, y_train= np.nan_to_num(X_df)[train_index], np_y_new[train_index]
        X_test, y_test= np.nan_to_num(X_df)[test_index], np_y_new[test_index]
        
        clf = logreg_cv_split.fit(X_train, y_train)
        # print(classification_report(clf.predict(X_test), y_test))
        logreg_results.append(classification_report(clf.predict(X_test), y_test))

In [10]:
rf_results = []
kf = StratifiedKFold(n_splits=10, random_state=43)
randomforest_split = RandomForestClassifier(max_depth=4, random_state=0)
for train_index, test_index in kf.split(np.nan_to_num(X_df), np_y_new):
        X_train, y_train= np.nan_to_num(X_df)[train_index], np_y_new[train_index]
        X_test, y_test= np.nan_to_num(X_df)[test_index], np_y_new[test_index]
        
        clf = randomforest_split.fit(X_train, y_train)
        # print(classification_report(clf.predict(X_test), y_test))
        rf_results.append(classification_report(clf.predict(X_test), y_test))

In [11]:
svm_results = []
kf = StratifiedKFold(n_splits=10, random_state=43)
svm_split = SVC(gamma='auto')
for train_index, test_index in kf.split(np.nan_to_num(X_df), np_y_new):
        X_train, y_train= np.nan_to_num(X_df)[train_index], np_y_new[train_index]
        X_test, y_test= np.nan_to_num(X_df)[test_index], np_y_new[test_index]
        
        clf = svm_split.fit(X_train, y_train)
        # print(classification_report(clf.predict(X_test), y_test))
        svm_results.append(classification_report(clf.predict(X_test), y_test))

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


# Logistic Regression Fold Based Scores

In [12]:
for item in logreg_results:
    print(item)

              precision    recall  f1-score   support

           0       0.50      0.80      0.62         5
           1       0.93      0.76      0.84        17

    accuracy                           0.77        22
   macro avg       0.71      0.78      0.73        22
weighted avg       0.83      0.77      0.79        22

              precision    recall  f1-score   support

           0       0.38      1.00      0.55         3
           1       1.00      0.74      0.85        19

    accuracy                           0.77        22
   macro avg       0.69      0.87      0.70        22
weighted avg       0.91      0.77      0.81        22

              precision    recall  f1-score   support

           0       0.25      1.00      0.40         2
           1       1.00      0.70      0.82        20

    accuracy                           0.73        22
   macro avg       0.62      0.85      0.61        22
weighted avg       0.93      0.73      0.79        22

              preci

# Random Forest Fold Based Scores

In [13]:
for item in rf_results:
    print(item)

              precision    recall  f1-score   support

           0       0.50      0.80      0.62         5
           1       0.93      0.76      0.84        17

    accuracy                           0.77        22
   macro avg       0.71      0.78      0.73        22
weighted avg       0.83      0.77      0.79        22

              precision    recall  f1-score   support

           0       0.25      1.00      0.40         2
           1       1.00      0.70      0.82        20

    accuracy                           0.73        22
   macro avg       0.62      0.85      0.61        22
weighted avg       0.93      0.73      0.79        22

              precision    recall  f1-score   support

           0       0.38      1.00      0.55         3
           1       1.00      0.74      0.85        19

    accuracy                           0.77        22
   macro avg       0.69      0.87      0.70        22
weighted avg       0.91      0.77      0.81        22

              preci

# SVM Fold Based Scores

In [14]:
for item in svm_results:
    print(item)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.64      0.78        22

    accuracy                           0.64        22
   macro avg       0.50      0.32      0.39        22
weighted avg       1.00      0.64      0.78        22

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.64      0.78        22

    accuracy                           0.64        22
   macro avg       0.50      0.32      0.39        22
weighted avg       1.00      0.64      0.78        22

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.64      0.78        22

    accuracy                           0.64        22
   macro avg       0.50      0.32      0.39        22
weighted avg       1.00      0.64      0.78        22

              preci

# GENERAL OVERVIEW

In [15]:
df = pd.DataFrame({'classifiers': ['logreg', 'rf', 'svm'], 'mean': [log_scores.mean().round(3), rf_scores.mean().round(3),svm_scores.mean().round(3)]})
ax = df.plot.bar(x='classifiers', y='mean', rot=0)
df.sort_values('mean', ascending=False)
# df.to_excel('new_test.xlsx')

Unnamed: 0,classifiers,mean
0,logreg,0.843
1,rf,0.815
2,svm,0.645
