In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from joblib import load
import gc

In [2]:
# Read the preprocessed BS1 table from disk; the cell's last expression
# previews its first five rows.
data_bs1 = pd.read_csv(filepath_or_buffer='../data/preprocessed_data_BS1.csv')
data_bs1.head(n=5)

Unnamed: 0,hg38_ASF1A,hg38_CCT7,hg38_EPOR,hg38_NECAP2,hg38_LRRC8A,hg38_NDUFS8,hg38_MRPL22,hg38_HIST1H2BC,hg38_LY96,hg38_UBE2E2-AS1,...,hg38_TMEM128,hg38_ASPSCR1,hg38_CDC20,hg38_DNAJB9,hg38_OSTM1,hg38_LCNL1,hg38_PHB,hg38_LRRC29,hg38_ZFP36,Group
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B
1,0,0,0,3,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,2,0,3,B
3,0,0,0,0,0,3,0,0,0,0,...,0,0,0,3,0,0,3,0,0,B
4,0,0,0,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B


In [3]:
# Load the three remaining preprocessed tables (BS2, GEO, 10x), in that order.
data_bs2, data_geo, data_10x = map(
    pd.read_csv,
    (
        '../data/preprocessed_data_BS2.csv',
        '../data/preprocessed_data_GEO.csv',
        '../data/preprocessed_data_10x.csv',
    ),
)

In [4]:
# Separate the target from the predictors: 'Group' becomes the label vector
# and every other column of data_bs1 is kept as a feature.
# NOTE(review): the preview of data_bs1 lists an 'Unnamed: 0' column, which
# therefore stays in X as a feature - confirm that this is intended.
y = data_bs1['Group']
X = data_bs1.drop(columns=['Group'])

In [5]:
def replace_values(y):
    """Encode cell-type labels as integer class codes.

    The mapping is B -> 0, D -> 1, M -> 2, NK -> 3, T -> 4.

    Because the replacement runs with ``regex=True``, each key is used as a
    regular expression that is searched for inside every string value, and a
    matching value is replaced as a whole by the code. A label merely
    *containing* one of the keys is therefore encoded too; keys are tried in
    the order listed above. Values that match no key are returned unchanged.

    Parameters
    ----------
    y : pandas.Series
        Series of label strings.

    Returns
    -------
    pandas.Series
        Series with the same index as ``y``. When every value was replaced
        the result has an integer dtype; otherwise it stays object dtype.
    """
    label_codes = {'B': 0, 'D': 1, 'M': 2, 'NK': 3, 'T': 4}
    encoded = y.replace(label_codes, regex=True)
    # Swapping strings for ints yields an object-dtype result; pandas used to
    # downcast it to integers implicitly, but that behaviour is deprecated.
    # Infer the dtype explicitly so downstream estimators keep receiving a
    # numeric target regardless of the pandas version.
    return encoded.infer_objects()

In [6]:
# Convert the 'Group' labels of BS1 into integer class codes
# (B=0, D=1, M=2, NK=3, T=4, as defined in replace_values).
encoded_y = replace_values(y)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the BS1 rows as a test set; the fixed seed keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_y,
    test_size=0.3,
    random_state=55,
)

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [9]:
def model_info(y_real, y_pred, labels=['B', 'D', 'M', 'NK', 'T']):
    """Plot a row-normalised confusion matrix and print summary metrics.

    The heatmap shows, for each true class (row), the fraction of its samples
    assigned to each predicted class (column). The raw confusion matrix,
    accuracy, and support-weighted precision, recall and F1 are then printed.

    Parameters
    ----------
    y_real : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels.
    labels : list of str, optional
        Tick labels for the heatmap axes. The list is only read, never
        modified. NOTE(review): the confusion matrix is computed without an
        explicit label list, so ``labels`` is assumed to match the classes
        present in ``y_real``/``y_pred`` in sorted order - confirm when
        evaluating on data that lacks some classes.

    Returns
    -------
    None
    """
    cm = confusion_matrix(y_real, y_pred)

    # Normalise each row by the number of true samples of that class. A class
    # that occurs only in the predictions has a zero row total; leave such a
    # row at 0 instead of dividing by zero and filling the heatmap with NaN.
    row_totals = cm.sum(axis=1, keepdims=True)
    normalized_conf_matrix = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    sns.heatmap(normalized_conf_matrix, annot=True, cmap='Blues', xticklabels=labels, yticklabels=labels, fmt='.2f')
    plt.xlabel('Predvidjene vrednosti')
    plt.ylabel('Stvarne vrednosti')
    plt.title('Matrica konfuzije')
    plt.show()

    print('Matrica konfuzije: \n', cm)
    print('Accuracy score: ', accuracy_score(y_real, y_pred))
    # zero_division=0 is passed to all three metrics so that undefined
    # per-class scores are handled the same way (counted as 0, no warning).
    print('Precision score: ', precision_score(y_real, y_pred, average='weighted', zero_division=0))
    print('Recall score: ', recall_score(y_real, y_pred, average='weighted', zero_division=0))
    print('F1 score: ', f1_score(y_real, y_pred, average='weighted', zero_division=0))

In [10]:
def build_model(model, X_train, X_test, y_train, y_test, classes=['B', 'D', 'M', 'NK', 'T']):
    """Fit ``model`` on the training data and report its test-set performance.

    The estimator is trained in place on ``(X_train, y_train)``; its
    predictions for ``X_test`` are then handed to ``model_info`` together
    with ``y_test`` and ``classes``.

    Returns the same ``model`` object that was passed in.
    """
    model.fit(X_train, y_train)
    model_info(y_test, model.predict(X_test), classes)
    return model

In [11]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [13]:
# Fixed hyperparameter sets, one per classifier. Each mapping is later
# unpacked as keyword arguments into the matching estimator constructor.

# Random forest
best_params_rf = dict(
    max_depth=15,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=300,
)

# XGBoost
best_params_xgb = dict(
    learning_rate=0.2,
    max_depth=4,
    n_estimators=300,
)

# Logistic regression
best_params_lr = dict(
    C=0.001,
    penalty='l2',
    solver='sag',
    class_weight='balanced',
)

# Multi-layer perceptron
best_params_mlp = dict(
    alpha=1e-05,
    batch_size=32,
    hidden_layer_sizes=(50,),
    learning_rate_init=0.001,
    max_iter=1000,
)

In [14]:
# Build the four classifiers from their hyperparameter sets; all of them
# share the same seed (55).
model_rf = RandomForestClassifier(random_state=55, **best_params_rf)
model_xgb = XGBClassifier(random_state=55, **best_params_xgb)
model_lr = LogisticRegression(random_state=55, **best_params_lr)
model_mlp = MLPClassifier(random_state=55, **best_params_mlp)

In [15]:
# Train the random forest on the BS1 training split.
model_rf.fit(X_train, y_train)

In [16]:
# Train the XGBoost classifier on the BS1 training split.
model_xgb.fit(X_train, y_train)

In [17]:
# Train the logistic regression on the BS1 training split.
model_lr.fit(X_train, y_train)



In [18]:
# Train the multi-layer perceptron on the BS1 training split.
model_mlp.fit(X_train, y_train)