# Generating_Results_from_Phase_1_Final_Models

# Inits

## Imports

In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import data_loading
import model_funcs

import joblib

## Definitions

In [2]:
# Keys read from every per-split results dict and averaged by
# print_summary_metrics, listed in the order in which they are printed.
writeup_metrics_list = [
    'train_accuracy',
    'test_accuracy',
    'test_specificity',
    'test_sensitivity',
    'train_auc',
    'test_auc',
]

## Funcs

### Generating results for a single cv/holdout set

In [3]:
def generate_results_model_results(X, y, param_dict, i):
    """Run one experiment on a single stratified CV/hold-out split.

    Parameters
    ----------
    X : feature data, passed to ``train_test_split``.
    y : target labels; also used to stratify the split.
    param_dict : dict forwarded unchanged to
        ``model_funcs.run_experiment_without_mlflow``.
    i : value used as ``random_state`` of the split, so each ``i`` gives a
        different but reproducible partition.

    Returns
    -------
    Whatever ``model_funcs.run_experiment_without_mlflow`` returns for this
    split.
    """
    # 80% goes to the CV portion and 20% is held out, keeping the class
    # proportions of y in both parts.
    partition = train_test_split(X, y, test_size=0.2, stratify=y, random_state=i)
    X_cv, X_test, y_cv, y_test = partition

    return model_funcs.run_experiment_without_mlflow(X_cv, X_test, y_cv, y_test, param_dict)

### Print out summary metrics

In [4]:
def print_summary_metrics(results_dict_list, metrics_list=None):
    """Print the mean of each summary metric across all runs.

    Parameters
    ----------
    results_dict_list : list of dict
        One results dict per run; each must contain every requested metric
        name as a key.
    metrics_list : list of str, optional
        Names of the metrics to summarise, printed in this order. When
        omitted, the notebook-level ``writeup_metrics_list`` is used, which
        keeps existing single-argument calls working unchanged.

    Returns
    -------
    None. One line per metric is printed as ``<name>: <mean to 3 decimals>``.
    """
    # Resolve the default lazily so the global is only needed when the caller
    # does not supply an explicit list.
    if metrics_list is None:
        metrics_list = writeup_metrics_list

    for metric_name in metrics_list:
        # Gather this metric's value from every run, then average it
        metric_values = [run_results[metric_name] for run_results in results_dict_list]
        print(f'{metric_name}: {np.mean(metric_values):0.3f}')

### Calculate average confusion matrix

In [5]:
def calculate_avg_confusion_matrix(results_dict_list):
    """Average the per-run confusion matrices, print the result and return it.

    Parameters
    ----------
    results_dict_list : list of dict
        One results dict per run. Each must hold a ``'confusion_matrix'``
        entry that can be added to a 2x2 DataFrame with rows
        ``'0_actual'``/``'1_actual'`` and columns ``'0_pred'``/``'1_pred'``.

    Returns
    -------
    pandas.DataFrame
        The cell-wise mean of the confusion matrices, with the same row and
        column labels. Previously nothing was returned, so the average could
        only be read from the printed output.
    """
    # All-zero 2x2 accumulator; the in-place additions below are aligned on
    # these row/column labels.
    full_confusion_matrix_df = pd.DataFrame(0,
                                            index=['0_actual', '1_actual'],
                                            columns=['0_pred', '1_pred'])

    for curr_results_dict in results_dict_list:
        full_confusion_matrix_df += curr_results_dict['confusion_matrix']

    # Cell-wise mean over the runs
    average_confusion_matrix_df = full_confusion_matrix_df / len(results_dict_list)

    # Report the average total and the four cells (class 1 is the positive class)
    print(f"Avg. Predictions: {average_confusion_matrix_df.sum().sum()}")
    print(f"Avg. TP: {average_confusion_matrix_df.loc['1_actual','1_pred']}")
    print(f"Avg. FP: {average_confusion_matrix_df.loc['0_actual','1_pred']}")
    print(f"Avg. TN: {average_confusion_matrix_df.loc['0_actual','0_pred']}")
    print(f"Avg. FN: {average_confusion_matrix_df.loc['1_actual','0_pred']}")

    return average_confusion_matrix_df

### Generate feature list

In [6]:
def get_feature_list(results_dict_list):
    """Collect the features reported by every run into one Series.

    Parameters
    ----------
    results_dict_list : list of dict
        One results dict per run, each with a ``'feature_list'`` entry.

    Returns
    -------
    pandas.Series
        Every non-empty feature from every run, in run order. A feature
        reported by several runs appears once per run (duplicates are kept).
    """
    # Flatten the per-run lists into a single list first
    pooled_features = [feature
                       for run_results in results_dict_list
                       for feature in run_results['feature_list']]

    # Then drop empty entries
    non_empty_features = [feature for feature in pooled_features if len(feature) > 0]

    return pd.Series(non_empty_features)

### Get the sorted, unique feature list

In [7]:
def get_unique_sorted_feature_list(results_dict_list):
    """Return each distinct feature once, ordered by how often it was reported.

    Parameters
    ----------
    results_dict_list : list of dict
        One results dict per run, as accepted by ``get_feature_list``.

    Returns
    -------
    list
        The distinct features across all runs. Features reported by more
        runs come first, because ``value_counts`` sorts by descending count
        by default.
    """
    occurrence_counts = get_feature_list(results_dict_list).value_counts()

    # The index of the counts holds one entry per distinct feature
    return occurrence_counts.index.tolist()

# Sub-Challenge 1

## SC1 Definitions

In [8]:
# Settings of the final SC1 model: a single configuration (not a search grid)
# that is passed as-is to generate_results_model_results for every split.
sc1_param_dict = {
    'C': 10 ** -1.75,
    'max_iter': 100,
    'solver': 'saga',
    'class_weight': None,
    'lower_quantile_removed_CoV': 0.25,
    'use_smote': True,
}

## Generating model results

In [None]:
%%time
# Load the gene expression (GE) raw data from file
X_sc1, y_sc1, phenotype_df = data_loading.load_sc1_data()

# Generate results for 15 different cv/test splits
sc1_results_dict_list = joblib.Parallel(n_jobs=-1, backend='multiprocessing')\
        (joblib.delayed(generate_results_model_results)(X_sc1, y_sc1, sc1_param_dict, i)\
             for i in range(15))

## Output results

In [10]:
# Print the average number of predictions and the average TP/FP/TN/FN counts
# over all SC1 splits
calculate_avg_confusion_matrix(sc1_results_dict_list)

# Blank separator line, then the mean of every metric in writeup_metrics_list
# (accuracy, specificity, sensitivity, AUC) over all SC1 splits
print()
print_summary_metrics(sc1_results_dict_list)

# Blank separator line, then every distinct feature reported by the SC1 runs
# as one comma-separated line, most frequently reported first
print()
print(', '.join(get_unique_sorted_feature_list(sc1_results_dict_list)))

Avg. Predictions: 76.0
Avg. TP: 46.6
Avg. FP: 5.4
Avg. TN: 4.6
Avg. FN: 19.4

train_accuracy: 0.670
test_accuracy: 0.674
test_specificity: 0.460
test_sensitivity: 0.706
train_auc: 0.739
test_auc: 0.600

MTHFD2, IGFBP2, FERMT1, CHI3L1, LINC01088, IL13RA2, EMP3, RBP1, BMP2, PLAT, TMEM100, SYTL4, TIMP1, GOLGA8A, FLJ16779, NRN1, FAM110C, SNX10, SFRP2, ADM, CXCL14, CRNDE, PRKX, ARL4C, SELL, ENC1, SHD, HOXA5, CASTOR1, SLC14A1, EMX2, SERPINE1, CNTN3, NMB, DLL3, LPL, SNHG19, PDPN, SLC24A3, TOX3, PHLDA2, NKAIN4, PKIB, TMEM158, SRPX, LHFPL3, SNHG1, TPPP3, RCAN2, AKR1C3, BEST3, KLRC3, ARHGEF26.AS1, AMIGO2, BCHE, TENM2, LTF, MT1X, USP54, P2RY12, HOXC6, PCP4, SLC39A12, NEFH, MAOB, GBP1, C2orf27A, ETNPPL, GPX3, CAP2, NEFL, HAPLN1, SYT1, RIDA, ID1, CHI3L2, PTX3, ASS1, CCL2, ANXA1, COL1A1, H19, GJB2, BCAN, FXYD1, ESM1, HOXC10, COL4A2, SPON1, GPR34, LRRN1, COL1A2, PDE8B, LOXL1, SCG3, STOX1, COL4A1, CSRP2, LRP1B, TRDC, COL3A1, CNDP1, NXPH1, CP, ZBTB16, MERTK, RNF180, PCDHB7, LRP4, SERPING1, ADAMTS1, OLI

# Sub-Challenge 2

## SC2 Definitions

In [11]:
# Settings of the final SC2 model: a single configuration (not a search grid)
# that is passed as-is to generate_results_model_results for every split.
sc2_param_dict = {
    'C': 10 ** -1.25,
    'max_iter': 100,
    'solver': 'saga',
    'class_weight': None,
    'lower_quantile_removed_CoV': 0.25,
    'use_smote': True,
}

## Generating model results

In [None]:
%%time
# Load the gene expression (GE) raw data from file
X_sc2, y_sc2, phenotype_df = data_loading.load_sc2_data()

# Generate results for 15 different cv/test splits
sc2_results_dict_list = joblib.Parallel(n_jobs=-1, backend='multiprocessing')\
        (joblib.delayed(generate_results_model_results)(X_sc2, y_sc2, sc2_param_dict, i)\
             for i in range(15))

## Output results

In [13]:
# Print the average number of predictions and the average TP/FP/TN/FN counts
# over all SC2 splits
calculate_avg_confusion_matrix(sc2_results_dict_list)

# Blank separator line, then the mean of every metric in writeup_metrics_list
# (accuracy, specificity, sensitivity, AUC) over all SC2 splits
print()
print_summary_metrics(sc2_results_dict_list)

# Blank separator line, then every distinct feature reported by the SC2 runs
# as one comma-separated line, most frequently reported first
print()
print(', '.join(get_unique_sorted_feature_list(sc2_results_dict_list)))

Avg. Predictions: 35.0
Avg. TP: 14.266666666666667
Avg. FP: 3.2
Avg. TN: 5.8
Avg. FN: 11.733333333333333

train_accuracy: 0.657
test_accuracy: 0.573
test_specificity: 0.644
test_sensitivity: 0.549
train_auc: 0.705
test_auc: 0.630

9p21.3, 7q31.1, 1p31.3, 7p15.3, 10p15.3, 8q24.13, 7p15.1, 8q23.1, 7q31.2, 9p24.1, 21q21.1, 1p31.1


# Sub-Challenge 3

## SC3 Definitions

In [14]:
# Settings of the final SC3 model: a single configuration (not a search grid)
# that is passed as-is to generate_results_model_results for every split.
# Unlike the SC1/SC2 dicts it also carries a 'feature_list_name' entry.
sc3_param_dict = {
    'C': 10 ** -1.25,
    'max_iter': 100,
    'solver': 'saga',
    'class_weight': None,
    'lower_quantile_removed_CoV': 0.25,
    'use_smote': True,
    'feature_list_name': 'trunc_genes_trunc_cnvs',
}

## Generating model results

In [None]:
%%time
# Load the gene expression (GE) raw data from file
X_sc3, y_sc3, phenotype_df = data_loading.load_sc3_data()

# Get the feature lists from SC1 and SC2
sc3_feature_list = get_unique_sorted_feature_list(sc1_results_dict_list) + get_unique_sorted_feature_list(sc2_results_dict_list)

# Subset the SC3 X features
X_sc3 = X_sc3[sc3_feature_list]

# Generate results for 15 different cv/test splits
sc3_results_dict_list = joblib.Parallel(n_jobs=-1, backend='multiprocessing')\
        (joblib.delayed(generate_results_model_results)(X_sc3, y_sc3, sc3_param_dict, i)\
             for i in range(15))

## Output results

In [16]:
# Print the average number of predictions and the average TP/FP/TN/FN counts
# over all SC3 splits
calculate_avg_confusion_matrix(sc3_results_dict_list)

# Blank separator line, then the mean of every metric in writeup_metrics_list
# (accuracy, specificity, sensitivity, AUC) over all SC3 splits
print()
print_summary_metrics(sc3_results_dict_list)

# Blank separator line, then every distinct feature reported by the SC3 runs
# as one comma-separated line, most frequently reported first
print()
print(', '.join(get_unique_sorted_feature_list(sc3_results_dict_list)))

Avg. Predictions: 34.0
Avg. TP: 21.333333333333332
Avg. FP: 4.066666666666666
Avg. TN: 3.933333333333333
Avg. FN: 4.666666666666667

train_accuracy: 0.806
test_accuracy: 0.743
test_specificity: 0.492
test_sensitivity: 0.821
train_auc: 0.892
test_auc: 0.734

STMN2, SYTL4, LINC01088, CHI3L1, RNF180, 9p21.3, EMX2, SFRP2, MN1, TIMP1, CXCL14, SLC39A12, TMEM158, BCHE, TMEM100, LY6H, ID1, ABI3BP, ADM, SLC14A1, HOXA5, PLAT, IGFBP2, CRLF1, HOXC10, MTHFD2, NRN1, PLK2, LTF, CHST9, LINC00844, IGKC, CENPV, LPL, SYT1, TENM2, HOXC6, 7q31.1, ASS1, DNM1, C9orf24, KIAA1211, SPON1, SLC24A3, KIF4A, RBP1, NEFH, SCN2A, POSTN, LRP1B, ESM1, SLC12A5, GDA, CNTN3, ST18, PLA2G2A, ETNPPL, TAC1, XIST, TOX3, NEFL, NTN4, SYNPR, GJB2, OLFM1, PRKX, GPR34, GRB14, 8q23.1, OLIG2, COL4A1, FCGBP, BCAN, RRM2, PBK, ETV1, STOX1, CNDP1, NEFM, IRX1, AMPH, CCK, HAPLN1, LRRN1, COL4A2, PCDHB16, FERMT1, 7p15.3, CX3CR1, SERPINE1, MYLK, TOP2A, TRIL, PCDHB7, P2RY12
