# Supervised Learning Extension
Attempt to improve performance through creating additional features from under-represented classes.  

In [17]:
# load dependencies
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

RANDOM_SEED = 42

In [75]:
#Load PC10 data - best performing number of PC
train_file = "data/X_train_10_PCs.csv"
test_file = "data/X_test_10_PCs.csv"
ytrain_file = 'data/y_train.csv'
ytest_file = 'data/y_test.csv'

PCA_X_train = pd.read_csv(train_file)
PCA_X_train.rename(columns={"Unnamed: 0": "CellID"}, inplace = True)
PCA_X_train.set_index('CellID', inplace = True)

PCA_X_test = pd.read_csv(test_file)
PCA_X_test.rename(columns={"Unnamed: 0": "CellID"}, inplace = True)
PCA_X_test.set_index('CellID', inplace = True)

y_train = pd.read_csv(ytrain_file)
y_train.set_index('CellID', inplace = True)

y_test = pd.read_csv(ytest_file)
y_test.set_index('CellID', inplace = True)

In [76]:
# Rebalance Data Set
sm = SMOTE(random_state=RANDOM_SEED)
X_train, y_train_res = sm.fit_resample(PCA_X_train, y_train)

In [85]:
# create booleen targets for underperforming classes. ONLY training data
# underperformning cells are DC, NK, RBC, pDC
target_DC = y_train_res['type'] == 'DC'
target_DC = target_DC.astype(int)

target_NK = y_train_res['type'] == 'NK'
target_NK = target_NK.astype(int)

target_RBC = y_train_res['type'] == 'RBC'
target_RBC = target_RBC.astype(int)

target_pDC = y_train_res['type'] == 'pDC'
target_pDC = target_pDC.astype(int)

In [88]:
# try random forest 
random_forest_DC = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train, target_DC)
target_DC = random_forest_DC.predict(X_train)
print("DC",np.unique(target_DC, return_counts=True))

random_forest_NK = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train, target_NK)
target_NK = random_forest_NK.predict(X_train)
print("NK",np.unique(target_NK, return_counts=True))

random_forest_RBC = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train, target_RBC)
target_RBC = random_forest_RBC.predict(X_train)
print("RBC",np.unique(target_RBC, return_counts=True))

random_forest_pDC = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train, target_pDC)
target_pDC = random_forest_pDC.predict(X_train)
print("pDC",np.unique(target_pDC, return_counts=True))

DC (array([0, 1]), array([130182,  10014], dtype=int64))
NK (array([0, 1]), array([130182,  10014], dtype=int64))
RBC (array([0, 1]), array([130182,  10014], dtype=int64))
pDC (array([0, 1]), array([130182,  10014], dtype=int64))


In [89]:
# create dataframe with additional features for training set
augmented_train_df = X_train.copy(deep=True)
augmented_train_df['label_DC'] = target_DC
augmented_train_df['label_NK'] = target_NK
augmented_train_df['label_RBC'] = target_RBC
augmented_train_df['label_pDC'] = target_pDC

 

In [90]:
# create dataframe with additional features for test set
# labels for test set come from model trained on training set
augmented_test_df = PCA_X_test.copy(deep=True)
augmented_test_df['label_DC'] = random_forest_DC.predict(PCA_X_test)
augmented_test_df['label_NK'] = random_forest_NK.predict(PCA_X_test)
augmented_test_df['label_RBC'] = random_forest_RBC.predict(PCA_X_test)
augmented_test_df['label_pDC'] = random_forest_pDC.predict(PCA_X_test)

In [91]:
# Load function from Supervised Learning code
def generate_class_dataframe(model, model_name, input_name, X_test, y_test):
    class_report = classification_report(y_test['type'], model.predict(X_test), zero_division = 0, output_dict = True)
    df = pd.DataFrame.from_dict({key: class_report[key] for key in class_report if key not in ["support", "accuracy", "macro avg", "weighted avg"]}, orient='index')
    midx = pd.MultiIndex.from_arrays([[model_name]*len(df.columns), [input_name]*len(df.columns), df.columns])
    df.columns = midx
    return df

In [94]:
random_forest_dataframes = []
random_forest = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train, y_train_res['type'])
random_forest_dataframes.append(generate_class_dataframe(random_forest, "Random Forest", "Control", PCA_X_test, y_test))

random_forest_aug = RandomForestClassifier(random_state=RANDOM_SEED).fit(augmented_train_df, y_train_res['type'])
random_forest_dataframes.append(generate_class_dataframe(random_forest_aug, "Random Forest", "Augmented", augmented_test_df, y_test))


In [96]:
rf_df = pd.concat(random_forest_dataframes,axis =1)

In [97]:
rf_df

Unnamed: 0_level_0,Random Forest,Random Forest,Random Forest,Random Forest,Random Forest,Random Forest,Random Forest,Random Forest
Unnamed: 0_level_1,Control,Control,Control,Control,Augmented,Augmented,Augmented,Augmented
Unnamed: 0_level_2,precision,recall,f1-score,support,precision,recall,f1-score,support
Alveolar,0.968085,0.98913,0.978495,92,0.978495,0.98913,0.983784,92
B cell,0.95892,0.955556,0.957235,855,0.957697,0.953216,0.955451,855
CD4 T,0.864156,0.855592,0.859853,2119,0.868533,0.863615,0.866067,2119
CD8 T,0.859082,0.859768,0.859425,2503,0.857583,0.890132,0.873554,2503
DC,0.825,0.326733,0.468085,101,0.852941,0.287129,0.42963,101
Endothelial,0.993651,0.987382,0.990506,317,0.9875,0.996845,0.992151,317
Epithelial,1.0,0.976744,0.988235,43,1.0,0.953488,0.97619,43
Mast,0.991453,0.991453,0.991453,117,0.991453,0.991453,0.991453,117
Myeloid,0.948121,0.989503,0.96837,1810,0.947257,0.992265,0.969239,1810
NK,0.333333,0.488636,0.396313,176,0.39801,0.454545,0.424403,176


In [98]:
random_forest.feature_importances_

array([0.09396182, 0.12622239, 0.09651273, 0.09890227, 0.0445779 ,
       0.0750174 , 0.1133225 , 0.13805172, 0.09427556, 0.11915572])

In [99]:
random_forest_aug.feature_importances_

array([0.05949551, 0.07102648, 0.09252953, 0.0761815 , 0.0328631 ,
       0.0450596 , 0.07629998, 0.10037082, 0.07101725, 0.08820362,
       0.06805426, 0.07126624, 0.07765466, 0.06997745])

In [100]:
rf_df.mean()

Random Forest  Control    precision      0.814175
                          recall         0.775417
                          f1-score       0.784350
                          support      720.500000
               Augmented  precision      0.836206
                          recall         0.769132
                          f1-score       0.785891
                          support      720.500000
dtype: float64