# BYU BRG CAMDA CMAP Challenge

In [2]:
import numpy as np
import pandas as pd
import sys, gzip
import copy

## Import sklearn modules
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, KFold, ShuffleSplit, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, f_classif

## Import sklearn modules for classifiers
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import BernoulliNB

## Input file locations: one (train, test) pair per data version and cell line.
## The loaders in this notebook open these paths with gzip and parse them as
## tab-delimited text.

# 'orig_scan' data version
orig_scan_mcf7_train, orig_scan_mcf7_test = (
    'orig_scan/trainingReformatedCamda_MCF7.txt.gz',
    'orig_scan/testReformatedCamda_MCF7.txt.gz',
)
orig_scan_pc3_train, orig_scan_pc3_test = (
    'orig_scan/trainingReformatedCamda_PC3.txt.gz',
    'orig_scan/testReformatedCamda_PC3.txt.gz',
)

# 'scan' data version (labelled 'updated_scan' in the results table)
scan_mcf7_train, scan_mcf7_test = (
    'scan/scan_mcf7_train.txt.gz',
    'scan/scan_mcf7_test.txt.gz',
)
scan_pc3_train, scan_pc3_test = (
    'scan/scan_pc3_train.txt.gz',
    'scan/scan_pc3_test.txt.gz',
)

# 'farms' data version
farms_mcf7_train, farms_mcf7_test = (
    'farms/farms_mcf7_train.txt.gz',
    'farms/farms_mcf7_test.txt.gz',
)
farms_pc3_train, farms_pc3_test = (
    'farms/farms_pc3_train.txt.gz',
    'farms/farms_pc3_test.txt.gz',
)

## Helper Functions

In [3]:
### Calculate Accuracy
from math import sqrt

def getAccuracy(truePositives, trueNegatives, falsePostives, falseNegatives):
    """Return the fraction of all predictions that were correct.

    Computed as (TP + TN) / (TP + TN + FN + FP). Raises ZeroDivisionError
    when all four counts are plain-int zeros.
    """
    correct = truePositives + trueNegatives
    total = correct + falseNegatives + falsePostives
    return correct / float(total)

def getSensitivity(truePositives, falseNegatives):
    """Return the true-positive rate, TP / (TP + FN)."""
    actualPositives = float(truePositives + falseNegatives)
    return truePositives / actualPositives

def getSpecificity(trueNegatives, falsePostives):
    """Return the true-negative rate, TN / (TN + FP)."""
    actualNegatives = float(trueNegatives + falsePostives)
    return trueNegatives / actualNegatives

def getMCC(truePositives, trueNegatives, falsePostives, falseNegatives):
    """Return the Matthews correlation coefficient of a binary confusion matrix.

    MCC = (TP*TN - FP*FN) / sqrt((TP+FP) * (TP+FN) * (TN+FP) * (TN+FN))

    When any of the four marginal totals is zero (for example the classifier
    predicted a single class only) the denominator is zero and the
    coefficient is undefined; 0.0 is returned in that case, following the
    usual convention, instead of raising ZeroDivisionError.
    """
    numerator = truePositives * trueNegatives - falsePostives * falseNegatives
    denominator = sqrt((truePositives + falsePostives)
                       * (truePositives + falseNegatives)
                       * (trueNegatives + falsePostives)
                       * (trueNegatives + falseNegatives))
    if denominator == 0:
        return 0.0
    return numerator / denominator

def printConfusionCalculations(TP, TN, FP, FN) : 
    """Print accuracy, sensitivity, specificity and MCC, one per line.

    Each metric is computed immediately before it is printed, so metrics
    printed before a failing one still appear on stdout.
    """
    report = (
        ("accuracy: ", getAccuracy, (TP, TN, FP, FN)),
        ("sensitivity: ", getSensitivity, (TP, FN)),
        ("specificity: ", getSpecificity, (TN, FP)),
        ("MCC: ", getMCC, (TP, TN, FP, FN)),
    )
    for prefix, metric, counts in report:
        print(prefix + str(metric(*counts)))

def getConfusionInformation(TP, TN, FP, FN) :
    """Return [accuracy, sensitivity, specificity, MCC] for the given counts."""
    accuracy = getAccuracy(TP, TN, FP, FN)
    sensitivity = getSensitivity(TP, FN)
    specificity = getSpecificity(TN, FP)
    mcc = getMCC(TP, TN, FP, FN)
    return [accuracy, sensitivity, specificity, mcc]


In [12]:
def make_predictions(X_train, X_test, y_train, classifier) :
    """Fit the requested classifier on the training data and predict X_test.

    Parameters
    ----------
    X_train, y_train : training features and labels handed to Pipeline.fit.
    X_test : features to predict.
    classifier : string key understood by make_classifier (e.g. 'rf').

    Returns
    -------
    (predictions, y_prob) : y_prob is always None because the predict_proba
    call is currently disabled.
    """
    # Optional preprocessing steps. They are built unconditionally so that any
    # of them can be switched on by uncommenting its pipeline entry below.
    selected_percentile = SelectPercentile(f_classif, percentile=20)
    pca = PCA(n_components = 10)
    robust_scaler = RobustScaler(quantile_range=(25, 75))
    scaler = StandardScaler()
    estimator = make_classifier(classifier)

    steps = [
        # ('s_scaler', scaler),
        # ('robust_scaler', robust_scaler),
        # ('pca', pca),
        # ('selected_percentile', selected_percentile),
        ('classifier', estimator),
    ]
    model = Pipeline(steps=steps)
    model.fit(X_train, y_train)

    y_prob = None  # model.predict_proba(X_test)
    return model.predict(X_test), y_prob

In [5]:
def print_results(y_test_final, predictions_final, y_prob_final):
    """Print the confusion matrix, classification report and summary metrics.

    Parameters
    ----------
    y_test_final : true labels.
    predictions_final : predicted labels, aligned with y_test_final.
    y_prob_final : unused; kept so existing callers keep working.

    The unpacking below expects a 2x2 matrix, i.e. exactly two classes.
    """
    # Compute the matrix once and reuse it for both the counts and the printout.
    matrix = confusion_matrix(y_test_final, predictions_final)
    TN, FP, FN, TP = matrix.ravel()

    # sklearn lays the matrix out with true labels as rows and predicted labels
    # as columns, so the legend must read [[TN, FP], [FN, TP]].
    print("\nConfusion Matrix -",
          "   True Negative = zeros that were calculated correctly",
          "   False Negative = zeros that were calculated incorrectly",
          "   True Positive = ones that were calculated correctly",
          "   False Positive = ones that were calculated incorrectly",
          "\n[[True Negative,False Positive]",
          "[False Negative,True Positive]]\n",
          matrix,
          "\n",
          classification_report(y_test_final, predictions_final),
          sep='\n')
    printConfusionCalculations(TP, TN, FP, FN)

def get_Results(y_test, predictions):
    """Return [accuracy, sensitivity, specificity, MCC] for the predictions.

    The confusion matrix is flattened and unpacked as TN, FP, FN, TP, so
    exactly two classes are expected.
    """
    counts = confusion_matrix(y_test, predictions).ravel()
    TN, FP, FN, TP = counts
    return getConfusionInformation(TP, TN, FP, FN)
    

## Test/Train

In [14]:
def train(trainFile, classifier):
    """Evaluate a classifier on one file with 10-fold stratified cross-validation.

    Parameters
    ----------
    trainFile : path to a gzipped, tab-delimited file. The first row is
        skipped (presumably a header), column 1 is read as the answer and
        columns 2 onward as features; column 0 is ignored.
    classifier : string key forwarded to make_predictions.

    Returns
    -------
    (y_test_final, predictions_final, y_prob_final) : true labels and
    predictions concatenated over all folds in fold order, plus an empty
    (0, 2) array because probability collection is disabled.
    """
    with gzip.open(trainFile, 'r') as handle:
        table = np.genfromtxt(handle, delimiter='\t', dtype=str)

    ## Split the data up into answers and features, as float arrays
    rows = table[1:,]
    answers = np.array([row[1] for row in rows], dtype=float)
    features = np.array([row[2:] for row in rows], dtype=float)

    ## Per-fold results; each list starts with an empty float array so the
    ## final concatenation matches appending fold by fold to an empty array
    truth_chunks = [np.array([])]
    prediction_chunks = [np.array([])]
    y_prob_final = np.ndarray(shape=(0,2), dtype=int)

    ## We are using stratified k-fold cross validation.
    ## Feature selection needs to happen on each fold independently,
    ## so the whole pipeline is refit inside make_predictions per fold.
    folds = StratifiedKFold(n_splits=10)
    for train_idx, test_idx in folds.split(features, answers):
        predictions, y_prob = make_predictions(features[train_idx],
                                               features[test_idx],
                                               answers[train_idx],
                                               classifier)
        truth_chunks.append(answers[test_idx])
        prediction_chunks.append(predictions)
        # y_prob_final = np.concatenate([y_prob_final, y_prob])

    y_test_final = np.concatenate(truth_chunks)
    predictions_final = np.concatenate(prediction_chunks)
    return y_test_final, predictions_final, y_prob_final
    


In [7]:
def test(trainFile, testFile, classifier):
    """Train on one file and predict the labels of a second file.

    Parameters
    ----------
    trainFile, testFile : paths to gzipped, tab-delimited files. In each the
        first row is skipped (presumably a header), column 1 is read as the
        answer and columns 2 onward as features; column 0 is ignored.
    classifier : string key forwarded to make_predictions.

    Returns
    -------
    (y_test, predictions, y_prob) : the test file's answers as floats, the
    predictions, and whatever make_predictions returns as y_prob.
    """
    def _load(path):
        # Read the whole table as strings; conversion to float happens later.
        with gzip.open(path, 'r') as handle:
            return np.genfromtxt(handle, delimiter='\t', dtype=str)

    def _split(table):
        # Return (answers, features) as float arrays, skipping the first row.
        rows = table[1:,]
        answers = np.array([row[1] for row in rows], dtype=float)
        features = np.array([row[2:] for row in rows], dtype=float)
        return answers, features

    trainData = _load(trainFile)
    testData = _load(testFile)

    y_train, X_train = _split(trainData)
    y_test, X_test = _split(testData)

    predictions, y_prob = make_predictions(X_train, X_test, y_train, classifier)
    return y_test, predictions, y_prob

## Classifiers

In [19]:
def _svm():
    """Build the support-vector classifier used in the experiments.

    Current choice: linear-kernel SVC with probability estimates enabled and
    class 1 weighted twice as heavily as class 0.
    """
    # Alternatives kept for reference:
    #   svm.SVC(probability=True)   # default (rbf) kernel
    #   svm.NuSVC(kernel='linear', probability=True, class_weight="balanced")
    params = {
        'kernel': 'linear',
        'probability': True,
        'class_weight': {0: 1, 1: 2},
    }
    return svm.SVC(**params)
    
def _rf():
    """Build the random forest used in the experiments.

    25 trees, depth and leaf count capped, no bootstrapping, fixed seed, and
    class 1 weighted twice as heavily as class 0.
    """
    # Alternative kept for reference: class_weight="balanced"
    settings = dict(
        n_estimators=25,
        max_depth=9,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0,
        max_leaf_nodes=25,
        bootstrap=False,
        random_state=0,
        class_weight={0:1,1:2},
    )
    return RandomForestClassifier(**settings)


def _nb():
    """Build the naive Bayes classifier (Gaussian variant, default settings)."""
    # Alternative kept for reference: BernoulliNB()
    model = GaussianNB()
    return model

def _mlp():
    """Build the multi-layer perceptron used in the experiments.

    Current choice: ten hidden layers of 30 units, learning_rate_init=.0376.
    """
    # Alternative configurations kept for reference, written as
    # hidden_layer_sizes -> learning_rate_init:
    #   (150,150,150,150,150)                  -> .01
    #   (200,200,200)                          -> .01
    #   (100,100,100)                          -> .01
    #   (90,80,70,60)                          -> .01
    #   (100,70,60,30,60,70,100)               -> .002
    #   (100,70,60,30)                         -> .002
    #   (150,100,50,30)                        -> .001
    #   (150,120,90,60,30)                     -> .001
    #   (150,130,110,90,70,50)                 -> .001
    #   (150,130,110,90,75,60,40)              -> .001
    #   (250,200,150,130,110,90)               -> .0003
    #   (300,250,200,170,140,110,90,50)        -> .0003
    #   (500,400,300,250,200,160,120,90,50)    -> .0006
    #   (80,80,80)                             -> .01
    layers = (30,) * 10
    return MLPClassifier(hidden_layer_sizes=layers, learning_rate_init=.0376)

def _lr():
    """Build the logistic regression model (lbfgs solver, class 1 weighted 2x)."""
    weights = {0: 1, 1: 2}
    return linear_model.LogisticRegression(solver='lbfgs', class_weight=weights)

def _knn():
    """Build the k-nearest-neighbours classifier (k=8, distance-weighted votes)."""
    params = {'n_neighbors': 8, 'weights': 'distance'}
    return KNeighborsClassifier(**params)

def _gb():
    """Build the gradient boosting classifier (learning rate .31, depth-3 trees)."""
    params = {'learning_rate': .31, 'max_depth': 3}
    return GradientBoostingClassifier(**params)

def _ensemble():
    """Build a hard-voting ensemble of the individual classifiers.

    Members, in order: LR, SVM, KNN, MLP, RF, GB. Each is created fresh by
    its factory function.
    """
    # Naive Bayes ('NB', _nb) is deliberately left out of the vote.
    # Alternative member list kept for reference: scaled_ensemble()
    members = (
        ('LR', _lr),
        ('SVM', _svm),
        ('KNN', _knn),
        ('MLP', _mlp),
        ('RF', _rf),
        ('GB', _gb),
    )
    estimators = [(label, build()) for label, build in members]
    return VotingClassifier(estimators, voting='hard')

def make_classifier(classifier) :
    """Return a new, unfitted classifier for the given key.

    Accepted keys: 'mlp', 'rf', 'nb', 'knn', 'svm', 'lr', 'gb', 'ensemble'.

    Raises
    ------
    ValueError : if the key is not one of the above.
    """
    factories = {
        'mlp': _mlp,
        'rf': _rf,
        'nb': _nb,
        'knn': _knn,
        'svm': _svm,
        'lr': _lr,
        'gb': _gb,
        'ensemble': _ensemble,
    }
    try:
        build = factories[classifier]
    except (KeyError, TypeError):
        # TypeError covers unhashable keys, which should also be rejected.
        raise ValueError('Not a correct key for classifier') from None
    return build()

In [9]:
def scaled_ensemble():
    """Return a large list of (name, estimator) pairs for a VotingClassifier.

    The list mixes several MLP, SVM, logistic regression, random forest,
    k-NN and gradient boosting configurations. Random forest names follow
    the pattern rf_blcd_<n_estimators>_<max_depth>, where 'blcd' marks
    class_weight="balanced".

    Two entries are configured to match their names:
      * 'rf_blcd_25_9' uses class_weight="balanced"; without it the entry is
        an exact duplicate of 'rf_norm' and that model would vote twice.
      * 'rf_blcd_150_15' uses n_estimators=150.
    """
    def forest(n_estimators, max_depth, class_weight=None):
        # Settings shared by every forest in this list; only the tree count,
        # depth and class weighting vary between entries.
        return RandomForestClassifier(n_estimators=n_estimators,
                                      max_depth=max_depth,
                                      min_samples_split=2,
                                      min_samples_leaf=1,
                                      min_weight_fraction_leaf=0,
                                      max_leaf_nodes=25,
                                      bootstrap=False,
                                      random_state=0,
                                      class_weight=class_weight)

    return [
        ('MLP_500', MLPClassifier(hidden_layer_sizes=(500,400,300,250,200,160,120,90,50), learning_rate_init = .0006)),
        ('MLP_300', MLPClassifier(hidden_layer_sizes=(300,250,200,170,140,110,90,50), learning_rate_init = .0003)),
        ('MLP_250', MLPClassifier(hidden_layer_sizes=(250,200,150,130,110,90), learning_rate_init = .0003)),
        ('MLP_150', MLPClassifier(hidden_layer_sizes=(150,100,50,30), learning_rate_init = .001)),
        ('MLP_30', MLPClassifier(hidden_layer_sizes=(30,30,30,30,30,30,30,30,30,30), learning_rate_init = .0376)),
        ('svm_orig', svm.SVC(probability=True)),
        ('svm_linear', svm.SVC(kernel='linear', probability=True, class_weight={0:25,1:1})),
        ('svm_nusvc_linear', svm.NuSVC(kernel='linear', probability=True, class_weight="balanced")),
        ('svm_nusvc_rbf', svm.NuSVC(kernel='rbf', probability=True, class_weight="balanced")),
        ('logreg_25_1', linear_model.LogisticRegression(solver='lbfgs', class_weight={0:25,1:1})),
        ('logreg_blcd', linear_model.LogisticRegression(solver='lbfgs', class_weight="balanced")),
        ('rf_norm', forest(25, 9)),
        ('rf_blcd_25_9', forest(25, 9, class_weight="balanced")),
        ('rf_blcd_100_9', forest(100, 9, class_weight="balanced")),
        ('rf_blcd_100_15', forest(100, 15, class_weight="balanced")),
        ('rf_blcd_150_15', forest(150, 15, class_weight="balanced")),
        ('knn_8', KNeighborsClassifier(n_neighbors=8, weights='distance')),
        ('knn_12', KNeighborsClassifier(n_neighbors=12, weights='distance')),
        ('knn_10', KNeighborsClassifier(n_neighbors=10, weights='distance')),
        ('gb_31_3', GradientBoostingClassifier(learning_rate = .31, max_depth = 3)),
        ('gb_15_5', GradientBoostingClassifier(learning_rate = .15, max_depth = 5)),
        ('gb_07_6', GradientBoostingClassifier(learning_rate = .07, max_depth = 6))
    ]

## Testing

In [20]:
# Classifier keys to evaluate; uncomment entries to include them in the run.
classifiers = [
#     'rf',
#     'svm',
#     'nb',
#     'mlp',
#     'knn',
#     'lr',
#     'gb',
    'ensemble'
]

# Cross-validation inputs: [train path, phase label, cell line, data version]
train_files = [
    [orig_scan_pc3_train, 'Training', 'PC3', 'orig_scan'],
    [orig_scan_mcf7_train, 'Training', 'MCF7', 'orig_scan'],
#     [scan_pc3_train, 'Training', 'PC3', 'updated_scan'],
#     [scan_mcf7_train, 'Training', 'MCF7', 'updated_scan'],
#     [farms_pc3_train, 'Training', 'PC3', 'farms'],
#     [farms_mcf7_train, 'Training', 'MCF7', 'farms']
]

# Hold-out inputs: [train path, test path, phase label, cell line, data version]
test_files = [
    [orig_scan_pc3_train, orig_scan_pc3_test, 'Test', 'PC3', 'orig_scan'],
    [orig_scan_mcf7_train, orig_scan_mcf7_test, 'Test', 'MCF7', 'orig_scan'],
#     [scan_pc3_train, scan_pc3_test, 'Test', 'PC3', 'updated_scan'],
#     [scan_mcf7_train, scan_mcf7_test, 'Test', 'MCF7', 'updated_scan'],
#     [farms_pc3_train, farms_pc3_test, 'Test', 'PC3', 'farms'],
#     [farms_mcf7_train, farms_mcf7_test, 'Test', 'MCF7', 'farms'],
]

# One results row per (classifier, file set); rows are labelled 1, 2, 3, ...
col_names =  ['Method', 'Test/Train', 'Cell Line', 'Data_Version', 'Accuracy', 'Sensitivity', 'Specificity', 'MCC']
df  = pd.DataFrame(columns = col_names)

i = 0

# Phase 1: cross-validated evaluation on each training file
for classifier in classifiers:
    for file_set in train_files:
        train_path, phase, cell_line, data_version = file_set
        i += 1
        print("Iteration {}: Training {} with {}".format(i, train_path, classifier))
        y_test, predictions, _ = train(train_path, classifier)
        metrics = get_Results(y_test, predictions)
        df.loc[i] = [classifier, phase, cell_line, data_version] + metrics

# Phase 2: fit on the training file, evaluate on the matching test file
for classifier in classifiers:
    for file_set in test_files:
        train_path, test_path, phase, cell_line, data_version = file_set
        i += 1
        print("Iteration {}: Testing {} with {}".format(i, test_path, classifier))
        y_test, predictions, _ = test(train_path, test_path, classifier)
        metrics = get_Results(y_test, predictions)
        df.loc[i] = [classifier, phase, cell_line, data_version] + metrics

print('done')
df


Iteration 1: Training orig_scan/trainingReformatedCamda_PC3.txt.gz with ensemble
Iteration 2: Training orig_scan/trainingReformatedCamda_MCF7.txt.gz with ensemble
Iteration 3: Testing orig_scan/testReformatedCamda_PC3.txt.gz with ensemble
Iteration 4: Testing orig_scan/testReformatedCamda_MCF7.txt.gz with ensemble
done


Unnamed: 0,Method,Test/Train,Cell Line,Data_Version,Accuracy,Sensitivity,Specificity,MCC
1,ensemble,Training,PC3,orig_scan,0.647368,0.9,0.1,0.0
2,ensemble,Training,MCF7,orig_scan,0.668421,0.9,0.166667,0.095002
3,ensemble,Test,PC3,orig_scan,0.744186,0.940299,0.052632,-0.012534
4,ensemble,Test,MCF7,orig_scan,0.709302,0.865672,0.157895,0.028216


In [21]:
# original_df = df
# original_df.to_csv('original.csv')

# updated_svm_df = df
# updated_svm_df.to_csv('updated_svm.csv')
# original_df

# robust_scaler_df = df
# robust_scaler_df.to_csv('robust_scaler.csv')

# orig_anova_df = df
# orig_anova_df.to_csv('orig_anova.csv')

# orig_pca_df = df
# orig_pca_df.to_csv('orig_pca.csv')

# balanced_df = df
# balanced_df.to_csv('balanced.csv')

# weight_1_50_df = df
# weight_1_50_df.to_csv('weight_1_50.csv')

# weight_50_1_df = df
# weight_50_1_df.to_csv('weight_50_1.csv')

# weight_25_1_df = df
# weight_25_1_df.to_csv('weight_25_1.csv')

# weight_10_1_df = df
# weight_10_1_df.to_csv('weight_10_1.csv')

# weight_5_1_df = df
# weight_5_1_df.to_csv('weight_5_1.csv')

# weight_2_1_df = df
# weight_2_1_df.to_csv('weight_2_1.csv')

# weight_1_2_df = df
# weight_1_2_df.to_csv('weight_1_2.csv')

# scaled_ensemble_df = df
# scaled_ensemble_df.to_csv('scaled_ensemble.csv')

# scaled_ensemble_hard_df = df
# scaled_ensemble_hard_df.to_csv('scaled_ensemble_hard.csv')

# Label the current results table with this experiment's name and write it to
# disk. NOTE: plain assignment binds a second name to the same DataFrame
# object as `df`; no copy is made.
norm_ensemble_hard_df = df
norm_ensemble_hard_df.to_csv('norm_ensemble_hard.csv')

# original_df = pd.read_csv('original.csv')
# updated_svm_df = pd.read_csv('updated_svm.csv')
# robust_scaler_df = pd.read_csv('robust_scaler.csv')
# orig_anova_df = pd.read_csv('orig_anova.csv')
# orig_pca_df = pd.read_csv('orig_pca.csv')
# scaled_ensemble_df = pd.read_csv('scaled_ensemble.csv')

In [22]:
# origClassifiers_farms = original_df[original_df['Data_Version'] == 'farms']
# origClassifiers_orig_scan = original_df[original_df['Data_Version'] == 'orig_scan']
# origClassifiers_updated_scan = original_df[original_df['Data_Version'] == 'updated_scan']

# limited_orig_scan = origClassifiers_orig_scan.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_orig_scan.index = range(1,33)

# limited_farms = origClassifiers_farms.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_farms.index = range(1,33)

# limited_updated_scan = origClassifiers_updated_scan.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_updated_scan.index = range(1,33)

# limited_robust_scaler_df = robust_scaler_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_robust_scaler_df.index = range(1,33)

# limited_orig_anova_df = orig_anova_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_orig_anova_df.index = range(1,33)

# limited_orig_pca_df = orig_pca_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_orig_pca_df.index = range(1,33)

# limited_balanced_df = balanced_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_balanced_df.index = range(1,33)

# # limited_weight_1_50_df = weight_1_50_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# # limited_weight_1_50_df.index = range(1,33)

# limited_weight_50_1_df = weight_50_1_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_weight_50_1_df.index = range(1,33)

# limited_weight_25_1_df = weight_25_1_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_weight_25_1_df.index = range(1,33)

# limited_weight_10_1_df = weight_10_1_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_weight_10_1_df.index = range(1,33)

# limited_weight_5_1_df = weight_5_1_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_weight_5_1_df.index = range(1,33)

# limited_weight_2_1_df = weight_2_1_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_weight_2_1_df.index = range(1,33)

# limited_weight_1_2_df = weight_1_2_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_weight_1_2_df.index = range(1,33)

# limited_scaled_ensemble_df = scaled_ensemble_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]

# limited_scaled_ensemble_hard_df = scaled_ensemble_hard_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]

# Keep only the four metric columns (all rows) of this experiment's results.
limited_norm_ensemble_hard_df = norm_ensemble_hard_df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]

# limited_df = df.loc[:,['Accuracy', 'Sensitivity', 'Specificity', 'MCC']]
# limited_df.index = range(1,33)

In [304]:
# Mean per-metric difference of each alternative data version against the
# orig_scan baseline (element-wise subtraction, then column means).
# NOTE(review): in the visible source limited_farms, limited_updated_scan and
# limited_orig_scan are only assigned in commented-out lines, so this cell
# relies on them already existing in the session.
col_names =  ['Accuracy', 'Sensitivity', 'Specificity', 'MCC']
data_preprocessing  = pd.DataFrame(columns = col_names)

for label, frame in (('farms', limited_farms),
                     ('updated_scan', limited_updated_scan)):
    data_preprocessing.loc[label] = frame.sub(limited_orig_scan).mean()

data_preprocessing
#limited_farms.sub(limited_updated_scan).mean()

Unnamed: 0,Accuracy,Sensitivity,Specificity,MCC
farms,0.004682,0.011291,-0.027741,-0.017493
updated_scan,0.017044,0.036101,-0.042105,-0.008123


In [305]:
# Mean per-metric difference of each scaling / feature-selection variant
# against the orig_scan baseline (element-wise subtraction, then column means).
# NOTE(review): in the visible source the limited_* inputs are only assigned
# in commented-out lines, so this cell relies on them already existing in the
# session.
col_names =  ['Accuracy', 'Sensitivity', 'Specificity', 'MCC']
feature_selection  = pd.DataFrame(columns = col_names)

for label, frame in (('robust_scaler', limited_robust_scaler_df),
                     ('anova_f_stat', limited_orig_anova_df),
                     ('pca', limited_orig_pca_df)):
    feature_selection.loc[label] = frame.sub(limited_orig_scan).mean()

feature_selection

Unnamed: 0,Accuracy,Sensitivity,Specificity,MCC
robust_scaler,-0.015338,-0.0429,0.047122,0.0009
anova_f_stat,-0.016547,-0.048777,0.064611,0.02145
pca,0.001771,-0.002777,0.00403,-0.008689


In [360]:
# Mean per-metric difference of each class-weight setting against the
# orig_scan baseline (element-wise subtraction, then column means).
# NOTE(review): in the visible source the limited_* inputs are only assigned
# in commented-out lines, so this cell relies on them already existing in the
# session.
col_names =  ['Accuracy', 'Sensitivity', 'Specificity', 'MCC']
class_weights  = pd.DataFrame(columns = col_names)

for label, frame in (('weight_50_1', limited_weight_50_1_df),
                     ('weight_25_1', limited_weight_25_1_df),
                     ('weight_10_1', limited_weight_10_1_df),
                     ('weight_5_1', limited_weight_5_1_df),
                     ('weight_2_1', limited_weight_2_1_df),
                     ('balanced', limited_balanced_df),
                     ('weight_1_2', limited_weight_1_2_df)):
    class_weights.loc[label] = frame.sub(limited_orig_scan).mean()
# class_weights.loc['weight_1_2'] = limited_weight_1_2_df.sub(limited_orig_scan).mean()
# class_weights.loc['pca'] = limited_orig_pca_df.sub(limited_orig_scan).mean()

class_weights

Unnamed: 0,Accuracy,Sensitivity,Specificity,MCC
weight_50_1,-0.077769,-0.169984,0.171738,-0.038169
weight_25_1,-0.095972,-0.202224,0.199973,-0.033314
weight_10_1,-0.059165,-0.129653,0.125822,-0.035778
weight_5_1,-0.041149,-0.093251,0.094518,-0.033954
weight_2_1,-0.0333,-0.079976,0.088788,-0.027315
balanced,-0.012515,-0.036833,0.043476,-0.002045
weight_1_2,-0.025402,-0.06081,0.060033,-0.033553


In [23]:
# Baseline rows to compare the ensemble variants against: labels 15-16 and
# 31-32 of the orig_scan metrics (presumably the ensemble rows of that table
# -- TODO confirm), re-labelled 1..4.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
limited_ensemble_orig_scan = pd.concat([limited_orig_scan.loc[15:16],
                                        limited_orig_scan.loc[31:32]])
limited_ensemble_orig_scan.index = range(1,5)

# Mean per-metric difference of each ensemble variant against that baseline.
# NOTE: this rebinds `class_weights`, although the table compares ensembles.
col_names =  ['Accuracy', 'Sensitivity', 'Specificity', 'MCC']
class_weights  = pd.DataFrame(columns = col_names)

class_weights.loc['scaled_soft'] = limited_scaled_ensemble_df.sub(limited_ensemble_orig_scan).mean()
class_weights.loc['scaled_hard'] = limited_scaled_ensemble_hard_df.sub(limited_ensemble_orig_scan).mean()
class_weights.loc['norm_hard'] = limited_norm_ensemble_hard_df.sub(limited_ensemble_orig_scan).mean()

class_weights





Unnamed: 0,Accuracy,Sensitivity,Specificity,MCC
scaled_soft,0.000612,-0.082319,0.139474,0.040147
scaled_hard,-0.025887,-0.067853,0.094298,0.019389
norm_hard,-0.027968,-0.062543,0.063816,-0.01669


In [None]:
# tmp = tmp_orig_pca_df.sub(tmp_orig_scan).mean()
# tmp_df = tmp_orig_pca_df ## pca 10
# tmp