In [56]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from joblib import load

In [57]:
# Read the preprocessed BS1 table; the last line previews its first five rows.
data_bs1 = pd.read_csv(
    '../../data/preprocessed_data_BS1.csv'
)
data_bs1.head(n=5)

Unnamed: 0,hg38_ASF1A,hg38_CCT7,hg38_EPOR,hg38_NECAP2,hg38_LRRC8A,hg38_NDUFS8,hg38_MRPL22,hg38_HIST1H2BC,hg38_LY96,hg38_UBE2E2-AS1,...,hg38_ASPSCR1,hg38_CDC20,hg38_DNAJB9,hg38_OSTM1,hg38_LCNL1,hg38_PHB,hg38_LRRC29,hg38_ZFP36,Group,Subgroups
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,B,Bmemory
1,0,0,0,3,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,B,Bmemory
2,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,2,0,3,B,Bmemory
3,0,0,0,0,0,3,0,0,0,0,...,0,0,3,0,0,3,0,0,B,Bmemory
4,0,0,0,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,B,Bmemory


In [58]:
# Target: the 'Group' label column.
# Features: every column except the two label columns ('Group', 'Subgroups').
X = data_bs1.drop(columns=['Group', 'Subgroups'])
y = data_bs1.loc[:, 'Group']

In [59]:
# Distinct class labels present in the target, in order of first appearance.
pd.unique(y)

array(['B', 'D', 'M', 'NK', 'T'], dtype=object)

In [60]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=55,
)

In [61]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [62]:
def model_info(y_real, y_pred):
    """Print the confusion matrix and summary classification metrics.

    Precision, recall and F1 are computed with ``average='weighted'``.
    ``zero_division=0`` is passed to all three so that an ill-defined
    per-class score is reported as 0 without emitting a warning, and the
    three metrics treat that case the same way.

    Parameters
    ----------
    y_real : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_real``.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    print('Confusion matrix: \n', confusion_matrix(y_real, y_pred))
    print('Accuracy score: ', accuracy_score(y_real, y_pred))
    print('Precision score: ', precision_score(y_real, y_pred, average='weighted', zero_division=0))
    print('Recall score: ', recall_score(y_real, y_pred, average='weighted', zero_division=0))
    print('F1 score: ', f1_score(y_real, y_pred, average='weighted', zero_division=0))

In [63]:
def build_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and report metrics on both splits.

    ``model.fit`` is called on the object that was passed in and that same
    object is returned; no copy is made.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    X_train, y_train : training features and labels
    X_test, y_test : held-out features and labels

    Returns
    -------
    The fitted ``model`` (the same object that was passed in).
    """
    model.fit(X_train, y_train)

    splits = (
        ('Train data:\n', X_train, y_train),
        ('Test data:\n', X_test, y_test),
    )
    for position, (heading, features, labels) in enumerate(splits):
        # A separator line goes between the two reports, not before the first.
        if position > 0:
            print('\n--------------------------------------\n')
        print(heading)
        model_info(labels, model.predict(features))

    return model

In [64]:
def test_model(model, file):
    """Evaluate an already-fitted ``model`` on every row of a CSV file.

    The file must contain the label columns 'Group' and 'Subgroups'; all
    remaining columns are passed to ``model.predict`` as features.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    file : path (or buffer) accepted by ``pandas.read_csv``

    Returns
    -------
    None
        The distinct true labels and the metrics are printed to stdout.
    """
    frame = pd.read_csv(file)

    features = frame.drop(columns=['Group', 'Subgroups'])
    target = frame['Group']

    # Show which classes are present in this dataset before scoring.
    print(target.unique())

    model_info(target, model.predict(features))

In [65]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: 100 trees, with a fixed seed so repeated fits match.
model_bs1 = RandomForestClassifier(
    random_state=55,
    n_estimators=100,
)

In [66]:
# Fit the baseline forest on the BS1 split and print train/test metrics.
model_bs1 = build_model(
    model=model_bs1,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Train data:

Confusion matrix: 
 [[1142    0    0    0    0]
 [   0  101    0    0    0]
 [   0    0 1165    0    0]
 [   0    0    0 1002    0]
 [   0    0    0    0 5818]]
Accuracy score:  1.0
Precision score:  1.0
Recall score:  1.0
F1 score:  1.0

--------------------------------------

Test data:

Confusion matrix: 
 [[ 516    1    1    0    0]
 [   0   36    5    0    0]
 [   0    0  494    0    2]
 [   1    0    0  319   72]
 [   2    0    1   71 2434]]
Accuracy score:  0.9605562579013907
Precision score:  0.9604559873461571
Recall score:  0.9605562579013907
F1 score:  0.9604771148235505


In [67]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 x 2 x 3 = 18 candidate forests, each scored with 5-fold CV.
params = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 15, 30],
}

model = RandomForestClassifier(random_state=55)
grid_search = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=5, verbose=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

# GridSearchCV refits the winning configuration on the whole training split
# (refit=True is the default). Re-using that estimator keeps random_state=55,
# which RandomForestClassifier(**best_params) would drop, and avoids training
# the same forest a second time.
best_model = grid_search.best_estimator_
best_model

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=  10.7s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   9.9s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   9.6s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   9.5s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   9.4s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=  17.9s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=  18.4s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=  18.6s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=  18.3s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=  18.5s
[CV] END ...criterion=gini, max_depth=None, n_estimators=300; total time=  27.0s
[CV] END ...criterion=gini, max_depth=None, n_es

In [71]:
# Winning hyper-parameters and their mean cross-validated accuracy.
(best_params, best_score)

({'criterion': 'entropy', 'max_depth': 30, 'n_estimators': 300},
 0.9557866272053837)

In [72]:
from sklearn.base import clone

data_bs2 = pd.read_csv('../../data/preprocessed_data_BS2.csv')

X = data_bs2.drop(['Group', 'Subgroups'], axis=1)
y = data_bs2['Group']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=55)

# build_model fits the estimator it receives in place and returns that same
# object. Passing best_model directly would overwrite its existing fit and make
# model_bs2 an alias of best_model. clone() yields a fresh, unfitted estimator
# with identical hyper-parameters, so best_model is left untouched.
model_bs2 = build_model(clone(best_model), X_train, X_test, y_train, y_test)

Train data:

Confusion matrix: 
 [[1286    0    0    0    0]
 [   0  190    0    0    0]
 [   0    0 1410    0    0]
 [   0    0    0  572    0]
 [   0    0    0    0 5044]]
Accuracy score:  1.0
Precision score:  1.0
Recall score:  1.0
F1 score:  1.0

--------------------------------------

Test data:

Confusion matrix: 
 [[ 583    0    0    0    8]
 [   3   58   15    0    4]
 [   0    0  596    0    0]
 [   0    0    0  118  152]
 [   2    0    1    0 2104]]
Accuracy score:  0.9492316136114161
Precision score:  0.95253423868226
Recall score:  0.9492316136114161
F1 score:  0.9414476953238208


In [73]:
from sklearn.base import clone

data_geo = pd.read_csv('../../data/preprocessed_data_GEO.csv')

X = data_geo.drop(['Group', 'Subgroups'], axis=1)
y = data_geo['Group']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=55)

# build_model fits the estimator it receives in place and returns that same
# object. Passing best_model directly would overwrite its existing fit and make
# model_geo an alias of best_model. clone() yields a fresh, unfitted estimator
# with identical hyper-parameters, so best_model is left untouched.
model_geo = build_model(clone(best_model), X_train, X_test, y_train, y_test)

Train data:

Confusion matrix: 
 [[1240    0    0    0]
 [   0  593    0    0]
 [   0    0  230    0]
 [   0    0    0 9420]]
Accuracy score:  1.0
Precision score:  1.0
Recall score:  1.0
F1 score:  1.0

--------------------------------------

Test data:

Confusion matrix: 
 [[ 457    7    0   52]
 [   0  257    0    6]
 [   0    0   35   44]
 [   1   19    0 4044]]
Accuracy score:  0.9737911418122714
Precision score:  0.9745485949638031
Recall score:  0.9737911418122714
F1 score:  0.9719458882326201


In [74]:
from sklearn.base import clone

data_10x = pd.read_csv('../../data/preprocessed_data_10x.csv')

X = data_10x.drop(['Group', 'Subgroups'], axis=1)
y = data_10x['Group']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=55)

# build_model fits the estimator it receives in place and returns that same
# object. Passing best_model directly would overwrite its existing fit and make
# model_10x an alias of best_model. clone() yields a fresh, unfitted estimator
# with identical hyper-parameters, so best_model is left untouched.
model_10x = build_model(clone(best_model), X_train, X_test, y_train, y_test)

Train data:

Confusion matrix: 
 [[ 6774     0     0     0]
 [    0  1311     0     0]
 [    0     0  5762     0]
 [    0     0     0 43852]]
Accuracy score:  1.0
Precision score:  1.0
Recall score:  1.0
F1 score:  1.0

--------------------------------------

Test data:

Confusion matrix: 
 [[ 2951     0     0     0]
 [    1   531     0     5]
 [    0     1  2399    17]
 [    3     2     2 18817]]
Accuracy score:  0.9987464110962837
Precision score:  0.9987461179057895
Recall score:  0.9987464110962837
F1 score:  0.9987448467633459


In [68]:
# Score best_model on every row of the BS2 file.
# NOTE(review): this cell's execution count (In [68]) is lower than that of the
# cells above it, so it was run out of notebook order; confirm which fit of
# best_model is intended to be evaluated here.
test_model(
    model=best_model,
    file='../../data/preprocessed_data_BS2.csv',
)

['B' 'D' 'M' 'NK' 'T']
Confusion matrix: 
 [[1703    0    0    0  174]
 [ 132    0   72    0   66]
 [   9    0 1698    0  299]
 [   0    0    0  145  697]
 [   4    0    2    4 7141]]
Accuracy score:  0.8798781491849168
Precision score:  0.8700181932175319
Recall score:  0.8798781491849168
F1 score:  0.851557236803468


In [69]:
# Score best_model on every row of the GEO file.
# NOTE(review): this cell's execution count (In [69]) is lower than that of the
# cells above it, so it was run out of notebook order; confirm which fit of
# best_model is intended to be evaluated here.
test_model(
    model=best_model,
    file='../../data/preprocessed_data_GEO.csv',
)

['B' 'M' 'NK' 'T']
Confusion matrix: 
 [[ 1360   103    41   252]
 [    2   837     0    17]
 [    0     0    58   251]
 [  172   111    30 13171]]
Accuracy score:  0.940323072234075
Precision score:  0.9356486527893719
Recall score:  0.940323072234075
F1 score:  0.9360371781154433


In [70]:
# Score best_model on every row of the 10x file.
# NOTE(review): this cell's execution count (In [70]) is lower than that of the
# cells above it, so it was run out of notebook order; confirm which fit of
# best_model is intended to be evaluated here.
test_model(
    model=best_model,
    file='../../data/preprocessed_data_10x.csv',
)

['B' 'M' 'NK' 'T']
Confusion matrix: 
 [[ 9454     0     0   271]
 [   81  1659     0   108]
 [    0     0  1318  6861]
 [   37     0     1 62638]]
Accuracy score:  0.9107220847284904
Precision score:  0.919688771332379
Recall score:  0.9107220847284904
F1 score:  0.8829768578076652
