In [1]:
#Importing needed libraries
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, roc_auc_score, precision_recall_curve, classification_report

# Run Algorithm on Both Datasets 

In [2]:
# Reading the existing data into dataframes 
df_dichek = pd.read_csv('all_data_matrix_logNorm.csv', index_col=0,header=0).T
df_pedroza = pd.read_csv('pedroza_data_matrix_logNorm.csv',index_col=0,header=0).T

df_dichek_metadata = pd.read_csv('all_metadata.csv',index_col=0,header=0)
df_pedroza_metadata = pd.read_csv('pedroza_metadata.csv',index_col=0,header=0)

In [3]:
#checking shape of the dichek metdata
print(df_dichek.shape)
print(df_dichek_metadata.shape)

(15431, 20238)
(15431, 21)


In [4]:
#checking shape of the pedroza metadata and data
print(df_pedroza.shape)
print(df_pedroza_metadata.shape)

(9745, 16912)
(9745, 17)


In [5]:
#Changing indices so that they match and the two data sets can be joined together 
df_dichek.index = df_dichek.index.str.replace('.','-')
df_pedroza.index = df_pedroza.index.str.replace('.', '-')

In [7]:
intersecting_genes = df_dichek.columns.intersection(df_pedroza.columns)
X_dichek = df_dichek[intersecting_genes]
X_pedroza = df_pedroza[intersecting_genes]

In [8]:
# Split metadata into the SHF = 0, CNC = 1 
y_dichek = (df_dichek_metadata['lineage']=='CNC').astype('int')
y_pedroza  = (df_pedroza_metadata['lineage'] == 'CNC').astype('int')
print("y_dichek shape: ",y_dichek.shape)
print("y_pedroza shape: ", y_pedroza.shape)
print(y_pedroza.head(5))
print(y_dichek.head(5))

y_dichek shape:  (15431,)
y_pedroza shape:  (9745,)
pos_M_wt_AAACCCACATCTCGTC-1    0
pos_M_wt_AAACCCAGTGACACGA-1    0
pos_M_wt_AAACGAAGTATGAAGT-1    0
pos_M_wt_AAACGCTAGAGGTTTA-1    0
pos_M_wt_AAACGCTGTGGCTCTG-1    0
Name: lineage, dtype: int64
MY506pos_AAACCCATCGCCATAA-1    0
MY506pos_AAACGAAAGTGGTTGG-1    0
MY506pos_AAACGCTAGACAGTCG-1    0
MY506pos_AAAGGGCGTCGCAACC-1    0
MY506pos_AAAGTCCCAACAGTGG-1    0
Name: lineage, dtype: int64


In [9]:
# Combine the data sets 
X = pd.concat([X_dichek,X_pedroza])
y = pd.concat([y_dichek, y_pedroza])

After this, data has been properly partitioned and data will need to be used to create a random forest classifier 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [14]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [15]:
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)

In [19]:
y_prob_new = np.empty((y_prob.shape[0], ))
for j in range(0, y_prob.shape[0]):
    y_prob_new[j] = y_prob[j][1]
# Accuracy Classification Score 
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_prob_new)
classification = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("ROC AUC :", auc_score)
print(classification)

Accuracy: 0.903494837172359
Precision: 0.8873148744365744
Recall: 0.9529737206085753
ROC AUC : 0.9705309197012861
              precision    recall  f1-score   support

           0       0.93      0.84      0.88      2144
           1       0.89      0.95      0.92      2892

    accuracy                           0.90      5036
   macro avg       0.91      0.89      0.90      5036
weighted avg       0.91      0.90      0.90      5036



Now, as part of the baseline comparison, we will look at the different possible test set values to see when certain accuaracies are attained here 

In [11]:
X = pd.concat([X_dichek,X_pedroza])
y = pd.concat([y_dichek, y_pedroza])

for i in range (0, 14):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25 + (0.05 * i))
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    y_prob = rf.predict_proba(X_test)

    y_prob_new = np.empty((y_prob.shape[0], ))
    for j in range(0, y_prob.shape[0]):
        y_prob_new[j] = y_prob[j][1]

    # Accuracy Classification Score 
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_prob_new)
    classification = classification_report(y_test, y_pred)

    test_size = 0.25 + (0.05 * i) 
    print("Test size:", test_size)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("ROC AUC :", auc_score)
    print(classification)

Test size: 0.25
Accuracy: 0.9008579599618685
Precision: 0.8816523061327927
Recall: 0.9568206820682068
ROC AUC : 0.9720823803599321
              precision    recall  f1-score   support

           0       0.93      0.82      0.88      2658
           1       0.88      0.96      0.92      3636

    accuracy                           0.90      6294
   macro avg       0.91      0.89      0.90      6294
weighted avg       0.90      0.90      0.90      6294

Test size: 0.3
Accuracy: 0.8958029921885343
Precision: 0.87636597385901
Recall: 0.9511627906976744
ROC AUC : 0.9678210095868572
              precision    recall  f1-score   support

           0       0.93      0.82      0.87      3253
           1       0.88      0.95      0.91      4300

    accuracy                           0.90      7553
   macro avg       0.90      0.89      0.89      7553
weighted avg       0.90      0.90      0.89      7553

Test size: 0.35
Accuracy: 0.8936677258284158
Precision: 0.8769092542677448
Recall: 0.95

In [21]:
X = pd.concat([X_dichek,X_pedroza])
y = pd.concat([y_dichek, y_pedroza])

for i in range (1, 10):
    test_set_size = 0.9 + (0.01 * i)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size)
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    y_prob = rf.predict_proba(X_test)

    y_prob_new = np.empty((y_prob.shape[0], ))
    for j in range(0, y_prob.shape[0]):
        y_prob_new[j] = y_prob[j][1]

    # Accuracy Classification Score 
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_prob_new)
    classification = classification_report(y_test, y_pred)
 
    print("Test size:", test_set_size)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("ROC AUC :", auc_score)
    print(classification)
    

Test size: 0.91
Accuracy: 0.8313473877176902
Precision: 0.7942187698968547
Recall: 0.9517778116893026
ROC AUC : 0.9364410327651994
              precision    recall  f1-score   support

           0       0.91      0.67      0.77      9805
           1       0.79      0.95      0.87     13106

    accuracy                           0.83     22911
   macro avg       0.85      0.81      0.82     22911
weighted avg       0.84      0.83      0.83     22911

Test size: 0.92
Accuracy: 0.8288144374406355
Precision: 0.7926790785736826
Recall: 0.9486404833836858
ROC AUC : 0.9291917553594777
              precision    recall  f1-score   support

           0       0.91      0.67      0.77      9922
           1       0.79      0.95      0.86     13240

    accuracy                           0.83     23162
   macro avg       0.85      0.81      0.82     23162
weighted avg       0.84      0.83      0.82     23162

Test size: 0.93
Accuracy: 0.7959340565473648
Precision: 0.7492017416545719
Recall: 0

# Train on Dichek, test of Pedroza

In [150]:
rf_dichek_pedroza = RandomForestClassifier()
rf_dichek_pedroza.fit(X_dichek, y_dichek)

In [162]:
y_pred = rf_dichek_pedroza.predict(X_pedroza)

In [163]:
accuracy = accuracy_score(y_pedroza, y_pred)
precision = precision_score(y_pedroza, y_pred)
recall = recall_score(y_pedroza, y_pred)
auc_score = roc_auc_score(y_pedroza, y_pred)
classification = classification_report(y_pedroza, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("ROC AUC :", auc_score)
print(classification)

Accuracy: 0.5131862493586454
Precision: 0.6316397228637414
Recall: 0.2103846153846154
ROC AUC : 0.5350052889904375
              precision    recall  f1-score   support

           0       0.49      0.86      0.62      4545
           1       0.63      0.21      0.32      5200

    accuracy                           0.51      9745
   macro avg       0.56      0.54      0.47      9745
weighted avg       0.56      0.51      0.46      9745



# Train on Pedroza, test on Dichek 

In [164]:
rf_pedroza_dichek = RandomForestClassifier()
rf_pedroza_dichek.fit(X_pedroza, y_pedroza)

In [165]:
y_pred = rf_pedroza_dichek.predict(X_dichek)

In [167]:
accuracy = accuracy_score(y_dichek, y_pred)
precision = precision_score(y_dichek, y_pred)
recall = recall_score(y_dichek, y_pred)
auc_score = roc_auc_score(y_dichek, y_pred)
classification = classification_report(y_dichek, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("ROC AUC :", auc_score)
print(classification)

Accuracy: 0.42848810835331475
Precision: 0.6527886881382561
Recall: 0.09024761077324066
ROC AUC : 0.5096103874210088
              precision    recall  f1-score   support

           0       0.41      0.93      0.57      6223
           1       0.65      0.09      0.16      9208

    accuracy                           0.43     15431
   macro avg       0.53      0.51      0.36     15431
weighted avg       0.55      0.43      0.32     15431

