# Training_a_logistic_regression_model_for_sub-challenge_3

**Background:**

**Purpose:**

**Methods:**
>1. Introduction
>2. Inits

**Conclusions:**

# Inits

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context(context='notebook', font_scale=1.5)

import logging
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.linear_model import LogisticRegression

import mlflow

# Load my own custom module
import data_loading
import model_funcs
import constants

import imblearn
import joblib

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import sklearn.metrics as skl_metrics

from plotnine import *

  from collections import Mapping


## Definitions

In [2]:
experiment_base_path = f'{constants.base_dir}/mlruns/'

test_param_dict = {'C': 1e-1,
                 'max_iter': 100,
                 'solver': 'saga',
                 'class_weight': None,
                 'lower_quantile_removed_CoV': 0,
                  'use_smote': True}

## Funcs

# Loading SC3 data

In [None]:
sc3_Phase1_CN_GE_FeatureMatrix = pd.read_csv(f'{constants.base_dir}/data/raw/sc3_Phase1_CN_GE_FeatureMatrix.tsv',
                                          sep='\t') \
    .set_index('PATIENTID')
sc3_Phase1_CN_GE_Outcome = pd.read_csv(f'{constants.base_dir}/data/raw/sc3_Phase1_CN_GE_Outcome.tsv', sep='\t') \
    .set_index('PATIENTID')
sc3_Phase1_CN_GE_Phenotype = pd.read_csv(f'{constants.base_dir}/data/raw/sc3_Phase1_CN_GE_Phenotype.tsv', sep='\t') \
    .set_index('PATIENTID')

# TEST - First model for SC3

In [None]:
%%time

# Set the experiment name
mlflow.set_tracking_uri(f'file://{experiment_base_path}')
mlflow.set_experiment('SC3-V10-TEST-L1-Logistic_Regression_Exp')

# Load the gene expression (GE) raw data from file
X, y, phenotype_df = data_loading.load_sc3_data()

# Split into CV/test set using target class to stratify
X_cv, X_test, y_cv, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

model_funcs.run_mlflow_exp_V10(X_cv, X_test, y_cv, y_test, test_param_dict, experiment_base_path)

# Exp 1 - First Grid Search for SC3

In [None]:
np.log10(np.logspace(-2, 0, 9).tolist()).astype(str)

In [None]:
%%time

# Set the experiment name
mlflow.set_tracking_uri(f'file://{experiment_base_path}')
mlflow.set_experiment('SC3-V10-Exp_1-L1-Logistic_Regression_Exp')
    
# Define the parameter grid
param_dict = {'C': np.logspace(-2, 0, 9).tolist() + np.logspace(1, 5, 3).tolist(),
             'max_iter': [100],
             'solver': ['saga'],
             'class_weight': [None],
             'lower_quantile_removed_CoV': [0, 0.25, 0.5, 0.9],
             'use_smote': [True]}

# Load the gene expression (GE) raw data from file
X, y, phenotype_df = data_loading.load_sc3_data()

# For 10 different cv/test splits
for i in range(10):
    # Split into CV/test set using target class to stratify
    X_cv, X_test, y_cv, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

    #model_funcs.run_mlflow_exp_V10(X_cv, X_test, y_cv, y_test, test_param_dict, experiment_base_path)

    # Run mlflow runs in parallel
    results = joblib.Parallel(n_jobs=-1, backend='multiprocessing')\
        (joblib.delayed(model_funcs.run_mlflow_exp_V10)(X_cv, X_test, y_cv, y_test, curr_params_dict, experiment_base_path)\
             for curr_param_index, curr_params_dict in enumerate(ParameterGrid(param_dict)))

## Making a dataframe with experiment results

In [None]:
# Get the experiment id from the experiment name
curr_experiment_id=mlflow.get_experiment_by_name('SC3-V10-Exp_1-L1-Logistic_Regression_Exp').experiment_id

experiment_results_sc3_v10_exp1_df = mlflow.search_runs(experiment_ids=curr_experiment_id)\
    .rename(columns={'metrics.cv_test_accuracy_of_0_cases_list': 'metrics.cv_test_0_class_accuracy'})

# Melt the dataframe to get accuracy as a single column
experiment_results_sc3_v10_exp1_df_melt = experiment_results_sc3_v10_exp1_df\
    .melt(id_vars=['run_id','metrics.count_non_zero_features','params.lower_quantile_removed_CoV','params.log10_C','params.log10_max_iter','params.use_smote'],
          value_vars=['metrics.cv_training_accuracy','metrics.cv_test_accuracy','metrics.cv_test_0_class_accuracy'],
          var_name='train_or_cv',
          value_name='accuracy')

experiment_results_sc3_v10_exp1_df_melt['metrics.log10_count_non_zero_features'] = np.log10(experiment_results_sc3_v10_exp1_df_melt['metrics.count_non_zero_features'] + 1)

## Plotting 

In [None]:
plot_df = experiment_results_sc3_v10_exp1_df_melt\
    .loc[experiment_results_sc3_v10_exp1_df_melt['metrics.count_non_zero_features'] > 1]


sns.lmplot(x='metrics.log10_count_non_zero_features',
           y='accuracy',
           data=plot_df,
           hue='train_or_cv', 
           fit_reg=True, size=5, aspect=1.25)

ax = plt.gca()
#ax.set_xscale('log')
#ax.axvline(3, c='r', alpha=0.3)

In [None]:
plot_df = experiment_results_sc3_v10_exp1_df_melt\
#    .loc[experiment_results_sc2_v10_exp2_df_melt['metrics.count_non_zero_features'] > 1]

plt.figure(figsize=(12,6))
ax=plt.gca()

sns.boxplot(x='params.log10_C',
           y='accuracy',
           data=plot_df,
           hue='train_or_cv', 
           order=['-2.0', '-1.75', '-1.5', '-1.25', '-1.0', '-0.75', '-0.5', '-0.25', 
                  '0.0', '1.0', '3.0', '5.0'],
           ax=ax)

ax.legend(bbox_to_anchor=(1.05,1))
#ax.axhline(0.5, c='g')
ax.axvline(4)

This shows log10_C = -1.0 to -0.75 as good options that balance class 0 performance with overall test performance

In [None]:
plot_df = experiment_results_sc3_v10_exp1_df_melt\
    .loc[experiment_results_sc3_v10_exp1_df_melt['params.log10_C'].isin(['-0.75','-1.0'])]

plt.figure(figsize=(12,5))
ax=plt.gca()

sns.boxplot(x='params.lower_quantile_removed_CoV',
           y='accuracy',
           data=plot_df,
           hue='train_or_cv',
           ax=ax)

ax.legend(bbox_to_anchor=(1.05,1))
ax.axhline(0.74, c='orange')
#ax.axvline(1)

There is no effect with the lower quantiles being removed. I suppose this one makes less sense because the CNVs and gene expression values are on different scales.

Let's look at the accuracy vs. log10_C again with a single quantile value removed

In [None]:
plot_df = experiment_results_sc3_v10_exp1_df_melt\
    .loc[experiment_results_sc3_v10_exp1_df_melt['params.lower_quantile_removed_CoV'] == '0']

plt.figure(figsize=(12,6))
ax=plt.gca()

sns.boxplot(x='params.log10_C',
           y='accuracy',
           data=plot_df,
           hue='train_or_cv', 
           order=['-2.0', '-1.75', '-1.5', '-1.25', '-1.0', '-0.75', '-0.5', '-0.25', 
                  '0.0', '1.0', '3.0', '5.0'],
           ax=ax)

ax.legend(bbox_to_anchor=(1.05,1))
#ax.axhline(0.5, c='g')
ax.axvline(4)

log10_C=-1.0 looks a bit better because the class 0 performance starts to drop off at log10_C=-0.75

# Exp 2 - Using truncated lists of features from SC1

## Defining the MLflow experiment code

In [3]:
def __generate_gene_list_with_lowest_cov_quantile(X_cv, q):
    # Calculate the CoV
    gene_expr_cov_list = 100 * (X_cv.std(ddof=1, axis=0) / X_cv.mean(axis=0))

    # Calculate the 0.25 quantile
    min_cov_threshold = np.quantile(gene_expr_cov_list, q=[q])[0]

    # Get a list of genes to removed
    low_cov_gene_list = gene_expr_cov_list.loc[(gene_expr_cov_list < min_cov_threshold)].index.tolist()

    return low_cov_gene_list

def __remove_lowest_quantile_of_genes_based_on_cov(X_train, X_test, q):
    # Remove the lowest quartile of genes based on CoV
    low_cov_gene_list = __generate_gene_list_with_lowest_cov_quantile(X_train, q=q)

    # Remove the lowest quantile of genes based on CoV
    return X_train.drop(low_cov_gene_list, axis=1), X_test.drop(low_cov_gene_list, axis=1)

def __smote_oversample(X, y):
    smote_obj = imblearn.over_sampling.SMOTE(sampling_strategy='auto',
                                             random_state=110,
                                             k_neighbors=5,
                                             n_jobs=None)

    X_smote, y_smote = smote_obj.fit_sample(X, y)

    return X_smote, y_smote

def __calculate_model_accuracy(clf, X, y):
    # Calculate accuracy on non-oversampled data
    y_pred = pd.Series(clf.predict(X), index=y.index)

    return (y_pred == y).mean()

def __split_kfold_data(X, y, train_iloc_list, test_iloc_list):
    # Create current fold train data
    X_train = X.iloc[train_iloc_list,]
    y_train = y.iloc[train_iloc_list,]

    # Create current fold train data
    X_test = X.iloc[test_iloc_list,]
    y_test = y.iloc[test_iloc_list,]

    return X_train, X_test, y_train, y_test

def __generate_cv_metrics(clf, X_cv, y_cv, param_dict):
    cv_train_accuracy_list = []
    cv_test_accuracy_list = []
    cv_test_accuracy_of_0_cases_list = []
    y_cv_test_list = []
    y_cv_test_pred_prob_list = []

    kfold_obj = KFold(n_splits=5, shuffle=True, random_state=110)

    for curr_train_iloc_list, curr_test_iloc_list in kfold_obj.split(X=X_cv, y=y_cv):

        # Get the train/test data for this fold
        curr_fold_X_train, \
        curr_fold_X_test, \
        curr_fold_y_train, \
        curr_fold_y_test = __split_kfold_data(X_cv, y_cv, curr_train_iloc_list, curr_test_iloc_list)

        # Run SMOTE on the X,y data
        if param_dict['use_smote']:
            curr_fold_X_train, curr_fold_y_train = __smote_oversample(curr_fold_X_train, curr_fold_y_train)

        # Train the model for this k-fold
        clf.fit(curr_fold_X_train, curr_fold_y_train)

        # Calculate accuracy on non-oversampled data
        cv_train_accuracy_list.append(__calculate_model_accuracy(clf, curr_fold_X_train, curr_fold_y_train))
        cv_test_accuracy_list.append(__calculate_model_accuracy(clf, curr_fold_X_test, curr_fold_y_test))

        # Calculate the test accuracy of target=0 cases
        curr_fold_y_test_pred = pd.Series(clf.predict(curr_fold_X_test), index=curr_fold_y_test.index)
        cv_test_accuracy_of_0_cases_list.append(np.mean(curr_fold_y_test_pred[(curr_fold_y_test == 0)] == 0))

        # Calculate the prediction probability
        y_cv_test_list.extend(curr_fold_y_test.tolist())
        y_cv_test_pred_prob_list.extend(clf.predict_proba(curr_fold_X_test)[:, 1])

    # Calculate AUC of the CV test folds
    cv_test_auc = skl_metrics.roc_auc_score(y_true=y_cv_test_list,
                                            y_score=y_cv_test_pred_prob_list)

    return cv_train_accuracy_list, cv_test_accuracy_list, cv_test_accuracy_of_0_cases_list, cv_test_auc

def __calculate_sens_and_spec(y_true, y_pred):
    # Generate a classification report dict
    classification_report_dict = skl_metrics.classification_report(y_true,
                                                                   y_pred,
                                                                   output_dict=True)
    # Get the sensitivity and specificity from the report dict
    sensitivity = classification_report_dict['1']['recall']
    specificity = classification_report_dict['0']['recall']

    return sensitivity, specificity


def run_mlflow_exp_V11(X_cv, X_test, y_cv, y_test, curr_params_dict, experiment_base_path):

    # Set the experiment name
    mlflow.set_tracking_uri(f'file://{experiment_base_path}')

    # Start a mlflow run
    with mlflow.start_run() as mlflow_run:
        mlflow.set_tag('description', "")

        # Define the model
        log_regr_clf = LogisticRegression(penalty='l1',
                                          C=curr_params_dict['C'],
                                          class_weight=curr_params_dict['class_weight'],
                                          random_state=110,
                                          solver=curr_params_dict['solver'],
                                          max_iter=curr_params_dict['max_iter'],
                                          verbose=0,
                                          n_jobs=None,
                                          l1_ratio=None)
        # Remove the lowest q quantile of genes based on their CoV
        X_cv, X_test = __remove_lowest_quantile_of_genes_based_on_cov(X_train=X_cv,
                                                                      X_test=X_test,
                                                                      q=curr_params_dict['lower_quantile_removed_CoV'])
        
        # Truncate the features
        X_cv = X_cv[curr_params_dict['features']['feature_list']]
        X_test = X_test[curr_params_dict['features']['feature_list']]

        # Run custom CV so I can train on over-sampled data but test on original data
        cv_train_accuracy_list,\
        cv_test_accuracy_list,\
        cv_test_accuracy_of_0_cases_list,\
        cv_test_auc = __generate_cv_metrics(log_regr_clf, X_cv, y_cv, curr_params_dict)

        # Run SMOTE on the full X_cv,y_cv data
        if curr_params_dict['use_smote']:
            X_cv, y_cv = __smote_oversample(X_cv, y_cv)

        # Train the model on the full CV data
        log_regr_clf.fit(X_cv, y_cv)
        
        y_test_pred = log_regr_clf.predict(X_test)

        # Log parameters
        mlflow.log_param("log10_C", np.log10(curr_params_dict['C']))
        mlflow.log_param("log10_max_iter", np.log10(curr_params_dict['max_iter']))
        mlflow.log_param("solver", curr_params_dict['solver'])
        mlflow.log_param("class_weight", curr_params_dict['class_weight'])
        mlflow.log_param("lower_quantile_removed_CoV", curr_params_dict['lower_quantile_removed_CoV'])
        mlflow.log_param("use_smote", curr_params_dict['use_smote'])
        mlflow.log_param("feature_list_name", curr_params_dict['features']['name'])

        # Log the CV metrics
        mlflow.log_metric("cv_training_accuracy", np.mean(cv_train_accuracy_list))
        mlflow.log_metric("cv_test_accuracy", np.mean(cv_test_accuracy_list))
        mlflow.log_metric("cv_test_0_class_accuracy", np.mean(cv_test_accuracy_of_0_cases_list))

        # Log test metrics
        mlflow.log_metric("train_accuracy", __calculate_model_accuracy(log_regr_clf, X_cv, y_cv))
        mlflow.log_metric("test_accuracy", __calculate_model_accuracy(log_regr_clf, X_test, y_test))
        mlflow.log_metric("train_auc", skl_metrics.roc_auc_score(y_true=y_cv,
                                                                 y_score=log_regr_clf.predict_proba(X_cv)[:, 1]))
        mlflow.log_metric("cv_test_auc", cv_test_auc)
        mlflow.log_metric("test_auc", skl_metrics.roc_auc_score(y_true=y_test,
                                                                y_score=log_regr_clf.predict_proba(X_test)[:, 1]))
        
        # Sensitivity/specificity
        test_sensitivity, \
        test_specificity = __calculate_sens_and_spec(y_true=y_test, y_pred=y_test_pred)
        
        mlflow.log_metric("test_sensitivity", test_sensitivity)
        mlflow.log_metric("test_specificity", test_specificity)

        # Log count of non-zero features
        mlflow.log_metric("count_non_zero_features", np.sum(log_regr_clf.coef_[0] != 0))

        # Define the path for outputting the feature list
        feature_list_output_path = f'/tmp/mlflow_artifacts/{mlflow_run.info.run_id}/feature_list.txt'

        # Create the temporary path to store artifacts
        os.makedirs(os.path.dirname(feature_list_output_path))

        # Write the feature list to file
        with open(feature_list_output_path, 'w') as f:
            for curr_feature in X_cv.columns[(log_regr_clf.coef_[0] != 0)]:
                f.write(curr_feature + "\n")

        # Create dataframe with holdout estimates and actual y-values
        mlflow.log_artifact(feature_list_output_path)
        
        # Confusion matrix
        # Define the path for outputting the feature list
        confusion_output_path = f'/tmp/mlflow_artifacts/{mlflow_run.info.run_id}/confusion_matrix.csv'

        confusion_matrix = skl_metrics.confusion_matrix(y_true=y_test, y_pred=y_test_pred)

        # Output a dataframe with the confusion matrix results
        pd.DataFrame(confusion_matrix,
                     columns=['0_pred','1_pred'],
                     index=['0_actual','1_actual'])\
            .to_csv(confusion_output_path)

        # Log the feature list artifact
        mlflow.log_artifact(confusion_output_path)

## Loading data

In [4]:
# Load the gene expression (GE) raw data from file
X, y, phenotype_df = data_loading.load_sc3_data()

feature_list_full_CN_GE = X.columns.tolist()

# Get the full gene list
X_sc1, _, _ = data_loading.load_sc1_data()
feature_list_full_GE = X_sc1.columns.tolist()
del X_sc1

# Get the full cnv list
X_sc2, _, _ = data_loading.load_sc2_data()
feature_list_full_CN = X_sc2.columns.tolist()
del X_sc2

# Load the truncated gene feature list
feature_list_trunc_GE = pd.read_csv('./../../data/intermediate/GE_feature_list.txt', header=None)[0].values.tolist()
feature_list_trunc_CN = pd.read_csv('./../../data/intermediate/CN_feature_list.txt', header=None)[0].values.tolist()

## Running MLFlow

In [5]:
%%time

# Set the experiment name
mlflow.set_tracking_uri(f'file://{experiment_base_path}')
mlflow.set_experiment('SC3-V10-Exp_2-L1-Logistic_Regression_Exp')

# Define the parameter grid
param_dict = {'C': np.logspace(-2, 0, 9).tolist() + np.logspace(1, 5, 3).tolist(),
             'max_iter': [100],
             'solver': ['saga'],
             'class_weight': [None],
             'lower_quantile_removed_CoV': [0],
             'use_smote': [True],
             'features': [
                 {
                     'name': 'full',
                     'feature_list': feature_list_full_CN_GE
                 },{
                     'name': 'trunc_genes_full_cnvs',
                     'feature_list': (feature_list_trunc_GE + feature_list_full_CN)
                 },{
                     'name': 'full_genes_trunc_cnvs',
                     'feature_list': (feature_list_full_GE + feature_list_trunc_CN)
                 },{
                     'name': 'trunc_genes_trunc_cnvs',
                     'feature_list': (feature_list_trunc_GE + feature_list_trunc_CN)
                 },
                ]
             }

# For 10 different cv/test splits
for i in range(15):
    # Split into CV/test set using target class to stratify
    X_cv, X_test, y_cv, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=i)

    #model_funcs.run_mlflow_exp_V10(X_cv, X_test, y_cv, y_test, test_param_dict, experiment_base_path)

    # Run mlflow runs in parallel
    results = joblib.Parallel(n_jobs=-1, backend='multiprocessing')\
        (joblib.delayed(run_mlflow_exp_V11)(X_cv, X_test, y_cv, y_test, curr_params_dict, experiment_base_path)\
             for curr_param_index, curr_params_dict in enumerate(ParameterGrid(param_dict)))

INFO: 'SC3-V10-Exp_2-L1-Logistic_Regression_Exp' does not exist. Creating a new experiment














  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




































  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))








  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))








  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))






  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))










  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
















  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))














  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))








  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))






  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))






  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))






  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))






  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))














  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))








  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))






  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))








  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))










  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))






  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))








  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))




CPU times: user 8.58 s, sys: 3.13 s, total: 11.7 s
Wall time: 20min 17s


## Generating a dataframe with experimental results

In [None]:
# Get the experiment id from the experiment name
curr_experiment_id=mlflow.get_experiment_by_name('SC3-V10-Exp_2-L1-Logistic_Regression_Exp').experiment_id

experiment_results_sc3_v10_exp2_df = mlflow.search_runs(experiment_ids=curr_experiment_id)\
    .rename(columns={'metrics.cv_test_accuracy_of_0_cases_list': 'metrics.cv_test_0_class_accuracy'})

# Melt the dataframe to get accuracy as a single column
experiment_results_sc3_v10_exp2_df_melt = experiment_results_sc3_v10_exp2_df\
    .melt(id_vars=['run_id','metrics.count_non_zero_features','params.lower_quantile_removed_CoV',
                   'params.log10_C','params.log10_max_iter','params.use_smote','params.feature_list_name'],
          value_vars=['metrics.cv_training_accuracy','metrics.cv_test_accuracy','metrics.cv_test_0_class_accuracy'],
          var_name='train_or_cv',
          value_name='accuracy')

experiment_results_sc3_v10_exp2_df_melt['metrics.log10_count_non_zero_features'] = np.log10(experiment_results_sc3_v10_exp2_df_melt['metrics.count_non_zero_features'] + 1)

## Plotting results

### Feature list vs. Accuracy

In [None]:
plot_df = experiment_results_sc3_v10_exp2_df_melt

plt.figure(figsize=(12,6))
ax=plt.gca()

sns.boxplot(x='params.feature_list_name',
           y='accuracy',
           data=plot_df,
           hue='train_or_cv', 
           ax=ax)

ax.legend(bbox_to_anchor=(1.05,1))
ax.axhline(0.55, c='g')
#ax.axvline(4)
plt.xticks(rotation=45)

This shows that truncated GE and CN has nearly the same performance as the others.

### C vs. accuracy

In [None]:
plot_df = experiment_results_sc3_v10_exp2_df_melt\
    .loc[experiment_results_sc3_v10_exp2_df_melt['params.feature_list_name'] == 'trunc_genes_trunc_cnvs']

plt.figure(figsize=(12,6))
ax=plt.gca()

sns.boxplot(x='params.log10_C',
           y='accuracy',
           data=plot_df,
           hue='train_or_cv', 
           order=['-2.0', '-1.75', '-1.5', '-1.25', '-1.0', '-0.75', '-0.5', '-0.25', 
                  '0.0', '1.0', '3.0', '5.0'],
           ax=ax)

ax.legend(bbox_to_anchor=(1.05,1))
ax.axhline(0.55, c='g')
ax.axhline(0.75, c='orange')
ax.axvline(3)

There is a slight improvement in CV test accuracy from log10_C=-1.75,-1.5 to -1.25

### C vs. final feature list size

In [None]:
plot_df = experiment_results_sc3_v10_exp2_df_melt\
    .loc[experiment_results_sc3_v10_exp2_df_melt['params.feature_list_name'] == 'trunc_genes_trunc_cnvs']

plt.figure(figsize=(12,6))
ax=plt.gca()

sns.boxplot(x='params.log10_C',
           y='metrics.count_non_zero_features',
           data=plot_df,
           order=['-2.0', '-1.75', '-1.5', '-1.25', '-1.0', '-0.75', '-0.5', '-0.25', 
                  '0.0', '1.0', '3.0', '5.0'],
           ax=ax)

ax.legend(bbox_to_anchor=(1.05,1))
ax.axhline(0.5, c='g')
#ax.axvline(4)

* At log10_C=-1.5, there were ~15 genes and no CNVs
* At log10_C=-1.25, there were ~36 genes and 3 CNVs
* At log10_C=-1.0, there were ~60 genes
* At log10_C=-0.5, there were ~158 genes


* Therefore, I will use log10_C=-1.25 for the SC2 model with truncated GE and CN
   * This is because it truncates the gene list significantly while retaining some CVs

## Getting the final list of features for all SC3 runs

In [None]:
import glob

# Get the full run_id list for this experiment
run_id_list = experiment_results_sc3_v10_exp2_df\
    .loc[(experiment_results_sc3_v10_exp2_df['params.log10_C'] == '-1.25')
        & (experiment_results_sc3_v10_exp2_df['params.feature_list_name'] == 'trunc_genes_trunc_cnvs')]\
    ['run_id']

full_feature_list = []

for curr_run_id in run_id_list:
    # Get the current run's artifact uri
    curr_run = mlflow.get_run(run_id=curr_run_id)
    curr_artifact_uri = curr_run.to_dictionary()['info']['artifact_uri']

    # Get the feature list filepath
    feature_list_filepath = glob.glob(f'{curr_artifact_uri[7:]}/*')[0]
    
    # Open the feature list file
    with open(feature_list_filepath, 'r') as f:
        # Get the first feature
        curr_feature = f.readline()

        # While the current feature name is not empty
        while curr_feature != '':
            # Add the feature to the full feature list
            full_feature_list.append(f.readline().strip())
            
            # Get the next feature
            curr_feature = f.readline()

# Remove any empty feature from the final list
full_feature_ser = pd.Series([curr_feature for curr_feature in full_feature_list if len(curr_feature) > 0])

for i in full_feature_ser.unique().tolist():
    print(i)

## Output the final SC3 list

In [None]:
with open('./../../data/intermediate/sc3_feature_list.txt', 'w') as f:
    for curr_feature in np.sort(full_feature_ser.unique()):
        if len(curr_feature) > 0:
            f.write(f'{curr_feature}\n')

In [None]:
for i in pd.read_csv('../../data/intermediate/GE_feature_list.txt', header=None)[0].unique()\
    .tolist():
    print(i)