# MLPClassifier

In [1]:
import numpy as np
import pandas as pd
import sklearn.neural_network
import sklearn.model_selection
import scipy
import pprint

## Load data

In [2]:
X = pd.read_csv("inputs_cleaned.tsv", sep="\t", index_col=0)
y = pd.read_csv("targets_cleaned.tsv", sep="\t", index_col=0)

## Inputs: Cast booleans to ints and one-hot encode categorical features

In [3]:
# Only categorical column is cancer type. We'll one-hot encode it.
cancer_type_one_hot = pd.get_dummies(X["cancer_type"])
X = X.drop(columns="cancer_type")
X = cancer_type_one_hot.join(X)

# Now we just need to cast bools to ints
X.loc[:, X.columns[X.columns.str.startswith("above_reg_line_")]] = X.\
loc[:, X.columns[X.columns.str.startswith("above_reg_line_")]].astype(int)

X

Unnamed: 0,ccrcc,endometrial,hnscc,lscc,luad,above_reg_line_ACSS1_Normal,above_reg_line_ACSS1_Tumor,above_reg_line_C1orf116_Normal,above_reg_line_C1orf116_Tumor,above_reg_line_CARD9_Normal,...,tumor_normal_residual_diff_PPM1L,tumor_normal_residual_diff_RIPK3,tumor_normal_residual_diff_RNASET2,tumor_normal_residual_diff_SLC1A5,tumor_normal_residual_diff_TALDO1,tumor_normal_residual_diff_THADA,tumor_normal_residual_diff_TXLNG,tumor_normal_residual_diff_USP7,tumor_normal_residuals_corr,prot_RNA_tumor_normal_ratios_corr
C3L-00004,1,0,0,0,0,1,0,1,0,1,...,0.286438,0.325162,0.538933,0.330835,0.374112,0.427240,0.467547,0.277375,0.303139,0.157994
C3L-00010,1,0,0,0,0,0,0,1,0,0,...,0.286438,0.325162,0.318034,0.330835,0.374448,0.427240,0.467547,0.218244,0.401246,0.179302
C3L-00011,1,0,0,0,0,0,1,0,1,0,...,0.286438,0.325162,0.778998,0.330835,0.440563,0.427240,0.467547,0.293224,0.318544,0.216114
C3L-00026,1,0,0,0,0,0,1,1,1,0,...,0.286438,0.325162,0.679723,0.330835,0.531879,0.427240,0.467547,0.184696,0.000000,0.204983
C3L-00079,1,0,0,0,0,0,1,0,1,1,...,0.286438,0.325162,0.000000,0.330835,0.397145,0.427240,0.467547,0.262287,0.824186,0.000000
C3L-00088,1,0,0,0,0,0,0,1,0,1,...,0.286438,0.325162,0.577122,0.330835,0.341088,0.427240,0.467547,0.194954,0.053779,0.149606
C3L-00096,1,0,0,0,0,1,0,0,1,1,...,0.286438,0.325162,0.392330,0.330835,0.439760,0.427240,0.467547,0.210382,0.616069,0.162841
C3L-00097,1,0,0,0,0,0,1,0,0,1,...,0.286438,0.325162,0.576071,0.330835,0.404275,0.427240,0.467547,0.472222,0.102287,0.233811
C3L-00103,1,0,0,0,0,0,0,1,0,0,...,0.286438,0.325162,0.527244,0.330835,0.263422,0.427240,0.467547,0.443753,0.303190,0.244661
C3L-00360,1,0,0,0,0,1,0,1,0,1,...,0.286438,0.325162,0.283073,0.330835,0.425698,0.427240,0.467547,0.251054,0.166419,0.183001


## Targets: One-hot encode and split into a map of tables so we can do one target at a time

In [4]:
ys = {}

for col in y.columns:
    ys[col] = pd.get_dummies(y[col])

## Baseline accuracy

In [5]:
def baseline(X, ys):
    
    results = {}
    for target, y in ys.items():
        mlp = sklearn.neural_network.MLPClassifier()
        results[target] = sklearn.model_selection.cross_validate(mlp, X, y, cv=10)
        
    return results

In [6]:
baseline_acc = baseline(X, ys)





In [7]:
for target, res in baseline_acc.items():
    mean_acc = res["test_score"].mean()
    print(f"{target: >60}: {round(mean_acc, 2):0<4}")

                           Recurrence status (1, yes; 0, no): 0.77
                          Survial status (1, dead; 0, alive): 0.78
                                            histologic_grade: 0.33
                                             histologic_type: 0.45
   measure_of_success_of_outcome_at_last_available_follow-up: 0.24
                            pathologic_staging_primary_tumor: 0.34


## Optimize

In [8]:
def random_search(
    X, 
    ys,
    hidden_layer_width_range,
    num_hidden_layers_range,
    activation_funcs,
    alpha_range,
    learning_rate_range,
    momentum_range,
    n_iter
):
    
    # Generate hidden layer sizes
    rng = np.random.default_rng(0)
    hidden_layer_sizes = (rng.integers(*hidden_layer_width_range),) * rng.integers(*num_hidden_layers_range)
    
    scores = {}
    params = {}
    for target, y in ys.items():
        mlp = sklearn.neural_network.MLPClassifier()
        search = sklearn.model_selection.RandomizedSearchCV(
            estimator=mlp,
            param_distributions={
                "hidden_layer_sizes": hidden_layer_sizes,
                "activation": activation_funcs,
                "alpha": scipy.stats.uniform(loc=alpha_range[0], scale=alpha_range[1] - alpha_range[0]),
                "learning_rate_init": scipy.stats.uniform(loc=learning_rate_range[0], scale=learning_rate_range[1] - learning_rate_range[0]),
                "momentum": scipy.stats.uniform(loc=momentum_range[0], scale=momentum_range[1] - momentum_range[0]),
            },
            n_iter=n_iter,
            random_state=0,
        ).fit(X, y)
        
        scores[target] = search.best_score_
        params[target] = search.best_params_
        
    return scores, params

In [9]:
optimized_acc, optimized_params = random_search(
    X, 
    ys,
    hidden_layer_width_range=(2, 32),
    num_hidden_layers_range=(1, 10),
    activation_funcs=["identity", "logistic", "tanh", "relu"],
    alpha_range=(1e-6, 1),
    learning_rate_range=(1e-3, 0.5),
    momentum_range=(0.1, 1),
    n_iter=10,
)



In [10]:
for target, res in optimized_acc.items():
    print(f"{target: >60}: {round(res, 2):0<4}")

                           Recurrence status (1, yes; 0, no): 0.81
                          Survial status (1, dead; 0, alive): 0.83
                                            histologic_grade: 0.37
                                             histologic_type: 0.30
   measure_of_success_of_outcome_at_last_available_follow-up: 0.42
                            pathologic_staging_primary_tumor: 0.39


Try again with more iterations and larger hidden node ranges.

In [11]:
optimized_acc, optimized_params = random_search(
    X, 
    ys,
    hidden_layer_width_range=(2, 64),
    num_hidden_layers_range=(1, 100),
    activation_funcs=["identity", "logistic", "tanh", "relu"],
    alpha_range=(1e-6, 1),
    learning_rate_range=(1e-3, 0.5),
    momentum_range=(0.1, 1),
    n_iter=100,
)













In [12]:
for target, res in optimized_acc.items():
    print(f"{target: >60}: {round(res, 2):0<4}")

                           Recurrence status (1, yes; 0, no): 0.81
                          Survial status (1, dead; 0, alive): 0.83
                                            histologic_grade: 0.47
                                             histologic_type: 0.40
   measure_of_success_of_outcome_at_last_available_follow-up: 0.49
                            pathologic_staging_primary_tumor: 0.46


In [13]:
for target, res in optimized_params.items():
    print(f"{target}:")
    pprint.pprint(res)
    print()

Recurrence status (1, yes; 0, no):
{'activation': 'logistic',
 'alpha': 0.6235640732222756,
 'hidden_layer_sizes': 54,
 'learning_rate_init': 0.21935601842008356,
 'momentum': 0.9025957007038717}

Survial status (1, dead; 0, alive):
{'activation': 'tanh',
 'alpha': 0.6078310608847991,
 'hidden_layer_sizes': 54,
 'learning_rate_init': 0.28552818843852196,
 'momentum': 0.4947413621160883}

histologic_grade:
{'activation': 'logistic',
 'alpha': 0.8546137256971341,
 'hidden_layer_sizes': 54,
 'learning_rate_init': 0.3400169939757851,
 'momentum': 0.5083271601004408}

histologic_type:
{'activation': 'logistic',
 'alpha': 0.1294132080099139,
 'hidden_layer_sizes': 54,
 'learning_rate_init': 0.2723591690885859,
 'momentum': 0.5112202794811892}

measure_of_success_of_outcome_at_last_available_follow-up:
{'activation': 'logistic',
 'alpha': 0.6462647910502476,
 'hidden_layer_sizes': 54,
 'learning_rate_init': 0.13953672444107124,
 'momentum': 0.6281059118123519}

pathologic_staging_primary_tumo

We're going to combine some of the groups in the histologic_grade and histologic_type columns to make those better to solve.