In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from joblib import load

In [2]:
# Load the preprocessed BS1 table and preview its first rows.
data_bs1 = pd.read_csv('../../data/preprocessed_data_BS1.csv')
data_bs1.head()

Unnamed: 0,hg38_ASF1A,hg38_CCT7,hg38_EPOR,hg38_NECAP2,hg38_LRRC8A,hg38_NDUFS8,hg38_MRPL22,hg38_HIST1H2BC,hg38_LY96,hg38_UBE2E2-AS1,...,hg38_ASPSCR1,hg38_CDC20,hg38_DNAJB9,hg38_OSTM1,hg38_LCNL1,hg38_PHB,hg38_LRRC29,hg38_ZFP36,Group,Subgroups
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,B,Bmemory
1,0,0,0,3,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,B,Bmemory
2,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,2,0,3,B,Bmemory
3,0,0,0,0,0,3,0,0,0,0,...,0,0,3,0,0,3,0,0,B,Bmemory
4,0,0,0,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,B,Bmemory


In [3]:
# Features are every column except the two label columns; the target is the
# coarse 'Group' label.
X = data_bs1.drop(columns=['Group', 'Subgroups'])
y = data_bs1['Group']

In [4]:
# Show the distinct class labels present in the target column.
y.unique()

array(['B', 'D', 'M', 'NK', 'T'], dtype=object)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the BS1 rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified, and the confusion matrices printed
# further down show one class with far fewer rows than the others - consider
# passing stratify=y so that class keeps its share in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=55)

In [6]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [7]:
def model_info(y_real, y_pred):
    """Print the confusion matrix and summary classification metrics.

    Precision, recall and F1 are averaged with ``average='weighted'``.

    Parameters
    ----------
    y_real : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_real``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # zero_division=0 is passed to all three averaged metrics so they treat an
    # undefined per-class score (e.g. a class that is never predicted) the same
    # way: it counts as 0 and no UndefinedMetricWarning is emitted.
    print('Confusion matrix: \n', confusion_matrix(y_real, y_pred))
    print('Accuracy score: ', accuracy_score(y_real, y_pred))
    print('Precision score: ', precision_score(y_real, y_pred, average='weighted', zero_division=0))
    print('Recall score: ', recall_score(y_real, y_pred, average='weighted', zero_division=0))
    print('F1 score: ', f1_score(y_real, y_pred, average='weighted', zero_division=0))

In [8]:
def build_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and report metrics on both splits.

    The estimator is fitted in place; the very same object is returned, so the
    caller's reference and the return value point to one fitted model.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    X_train, y_train : training features and labels
    X_test, y_test : held-out features and labels

    Returns
    -------
    The fitted ``model`` (same object that was passed in).
    """
    separator = '\n--------------------------------------\n'
    model.fit(X_train, y_train)

    report_plan = (
        ('Train data:\n', X_train, y_train),
        ('Test data:\n', X_test, y_test),
    )
    for position, (heading, features, labels) in enumerate(report_plan):
        # The separator goes between the two reports, not before the first one.
        if position > 0:
            print(separator)
        print(heading)
        model_info(labels, model.predict(features))

    return model

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: 100 trees, seeded so repeated runs build the same forest.
model_bs1 = RandomForestClassifier(n_estimators=100, random_state=55)

In [10]:
# Fit the baseline forest on the BS1 training split and print train/test metrics.
model_bs1 = build_model(model_bs1, X_train, X_test, y_train, y_test)

Train data:

Confusion matrix: 
 [[1142    0    0    0    0]
 [   0  101    0    0    0]
 [   0    0 1165    0    0]
 [   0    0    0 1002    0]
 [   0    0    0    0 5818]]
Accuracy score:  1.0
Precision score:  1.0
Recall score:  1.0
F1 score:  1.0

--------------------------------------

Test data:

Confusion matrix: 
 [[ 516    1    1    0    0]
 [   0   36    5    0    0]
 [   0    0  494    0    2]
 [   1    0    0  319   72]
 [   2    0    1   71 2434]]
Accuracy score:  0.9605562579013907
Precision score:  0.9604559873461571
Recall score:  0.9605562579013907
F1 score:  0.9604771148235505


In [11]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 3 x 2 x 3 = 18 candidates, each scored by 5-fold
# cross-validated accuracy on the BS1 training split.
params = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 15, 30],
}

model = RandomForestClassifier(random_state=55)
grid_search = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=5, verbose=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Rebuild the winning configuration with the same seed that was used during the
# search. Without random_state the refitted forest differs from the one that
# was cross-validated and changes on every run.
best_model = RandomForestClassifier(**best_params, random_state=55)
best_model.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   9.4s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   9.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   9.1s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   8.9s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   9.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=  17.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=  17.4s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=  17.3s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=  17.3s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=  17.3s
[CV] END ...criterion=gini, max_depth=None, n_estimators=300; total time=  25.4s
[CV] END ...criterion=gini, max_depth=None, n_es

In [12]:
# Show the winning hyperparameters and their mean cross-validated accuracy.
best_params, best_score

({'criterion': 'entropy', 'max_depth': 30, 'n_estimators': 300},
 0.9557866272053837)

In [14]:
from sklearn.base import clone

data_bs2 = pd.read_csv('../../data/preprocessed_data_BS2.csv')

X = data_bs2.drop(['Group', 'Subgroups'], axis=1)
y = data_bs2['Group']

X_train_bs2, X_test_bs2, y_train_bs2, y_test_bs2 = train_test_split(X, y, test_size=0.3, random_state=55)

# Within-dataset evaluation. build_model fits and returns the object it is
# given, so pass an unfitted clone: model_bs2 then stays a BS2-trained forest
# instead of aliasing best_model and being overwritten by the refit below.
model_bs2 = build_model(clone(best_model), X_train_bs2, X_test_bs2, y_train_bs2, y_test_bs2)
print('\n--------------------------------------\n')

# Cross-dataset evaluation: X_train / y_train still hold the BS1 training split
# from the earlier cell (only X and y were reassigned above), so this trains on
# BS1 and scores on the complete BS2 table.
X_test_data = data_bs2.drop(['Group', 'Subgroups'], axis=1)
y_test_data = data_bs2['Group']

test_model = clone(best_model).fit(X_train, y_train)
y_pred = test_model.predict(X_test_data)
model_info(y_test_data, y_pred)

Train data:

Confusion matrix: 
 [[1286    0    0    0    0]
 [   0  190    0    0    0]
 [   0    0 1410    0    0]
 [   0    0    0  572    0]
 [   0    0    0    0 5044]]
Accuracy score:  1.0
Precision score:  1.0
Recall score:  1.0
F1 score:  1.0

--------------------------------------

Test data:

Confusion matrix: 
 [[ 583    0    0    0    8]
 [   3   60   14    0    3]
 [   0    0  596    0    0]
 [   0    0    0  118  152]
 [   1    0    1    0 2105]]
Accuracy score:  0.9500548847420417
Precision score:  0.9533238142663613
Recall score:  0.9500548847420417
F1 score:  0.9423455174196795

--------------------------------------

Confusion matrix: 
 [[1528    0    0    0  349]
 [ 146    0   34    0   90]
 [  17    0 1311    0  678]
 [   0    0    0   30  812]
 [   1    0    0    0 7150]]
Accuracy score:  0.8248806191338712
Precision score:  0.8335252932043407
Recall score:  0.8248806191338712
F1 score:  0.7850630151613864


In [15]:
from sklearn.base import clone

data_geo = pd.read_csv('../../data/preprocessed_data_GEO.csv')

X = data_geo.drop(['Group', 'Subgroups'], axis=1)
y = data_geo['Group']

X_train_geo, X_test_geo, y_train_geo, y_test_geo = train_test_split(X, y, test_size=0.3, random_state=55)

# Within-dataset evaluation. build_model fits and returns the object it is
# given, so pass an unfitted clone: model_geo then stays a GEO-trained forest
# instead of aliasing best_model and being overwritten by the refit below.
model_geo = build_model(clone(best_model), X_train_geo, X_test_geo, y_train_geo, y_test_geo)
print('\n--------------------------------------\n')

# Cross-dataset evaluation: X_train / y_train still hold the BS1 training split
# from the earlier cell (only X and y were reassigned above), so this trains on
# BS1 and scores on the complete GEO table.
X_test_data = data_geo.drop(['Group', 'Subgroups'], axis=1)
y_test_data = data_geo['Group']

test_model = clone(best_model).fit(X_train, y_train)
y_pred = test_model.predict(X_test_data)
model_info(y_test_data, y_pred)

Train data:

Confusion matrix: 
 [[1240    0    0    0]
 [   0  593    0    0]
 [   0    0  230    0]
 [   0    0    0 9420]]
Accuracy score:  1.0
Precision score:  1.0
Recall score:  1.0
F1 score:  1.0

--------------------------------------

Test data:

Confusion matrix: 
 [[ 455    7    0   54]
 [   0  258    0    5]
 [   0    0   33   46]
 [   1   18    0 4045]]
Accuracy score:  0.97338480292564
Precision score:  0.9741591038438411
Recall score:  0.97338480292564
F1 score:  0.9713203123431974

--------------------------------------

Confusion matrix: 
 [[ 1359   104    27   266]
 [    3   837     0    16]
 [    0     0    38   271]
 [  170   111    24 13179]]
Accuracy score:  0.9395306309052118
Precision score:  0.9333545725417481
Recall score:  0.9395306309052118
F1 score:  0.9338931426840497


In [16]:
from sklearn.base import clone

data_10x = pd.read_csv('../../data/preprocessed_data_10x.csv')

X = data_10x.drop(['Group', 'Subgroups'], axis=1)
y = data_10x['Group']

X_train_10x, X_test_10x, y_train_10x, y_test_10x = train_test_split(X, y, test_size=0.3, random_state=55)

# Within-dataset evaluation. build_model fits and returns the object it is
# given, so pass an unfitted clone: model_10x then stays a 10x-trained forest
# instead of aliasing best_model and being overwritten by the refit below.
model_10x = build_model(clone(best_model), X_train_10x, X_test_10x, y_train_10x, y_test_10x)
print('\n--------------------------------------\n')

# Cross-dataset evaluation: X_train / y_train still hold the BS1 training split
# from the earlier cell (only X and y were reassigned above), so this trains on
# BS1 and scores on the complete 10x table.
X_test_data = data_10x.drop(['Group', 'Subgroups'], axis=1)
y_test_data = data_10x['Group']

test_model = clone(best_model).fit(X_train, y_train)
y_pred = test_model.predict(X_test_data)
model_info(y_test_data, y_pred)

Train data:

Confusion matrix: 
 [[ 6774     0     0     0]
 [    0  1311     0     0]
 [    0     0  5762     0]
 [    0     0     0 43852]]
Accuracy score:  1.0
Precision score:  1.0
Recall score:  1.0
F1 score:  1.0

--------------------------------------

Test data:

Confusion matrix: 
 [[ 2951     0     0     0]
 [    2   530     1     4]
 [    0     1  2401    15]
 [    3     2     2 18817]]
Accuracy score:  0.9987868494480165
Precision score:  0.9987861144737994
Recall score:  0.9987868494480165
F1 score:  0.9987853107597882

--------------------------------------

Confusion matrix: 
 [[ 9234     0     0   491]
 [   77  1644     0   127]
 [    0     0  1458  6721]
 [   29     1     1 62645]]
Accuracy score:  0.9096544863395933
Precision score:  0.9188415112804965
Recall score:  0.9096544863395933
F1 score:  0.8835418936348355
