# MLPClassifier

In [1]:
import altair as alt
import numpy as np
import pandas as pd
import pprint
import sklearn.metrics
import sklearn.model_selection
import sklearn.neural_network
import copy

## Load data

In [2]:
# Cancer cohorts to load. Each name is interpolated into the
# clean_data/<name>_inputs.tsv and clean_data/<name>_targets.tsv paths
# read by the loading cell below.
cancer_types = [
    "ccrcc",
    "endometrial",
    "hnscc",
    "lscc",
    "luad",
]

In [4]:
def _read_tsv(path):
    """Read a tab-separated table, using its first column as the row index."""
    return pd.read_csv(path, sep="\t", index_col=0)


# One feature table and one target table per cancer type, keyed by cancer name.
inputs, targets = {}, {}
for cancer in cancer_types:
    inputs[cancer] = _read_tsv(f'clean_data/{cancer}_inputs.tsv')
    targets[cancer] = _read_tsv(f'clean_data/{cancer}_targets.tsv')

## Targets: One-hot encode and split into a map of tables so we can do one target at a time

In [6]:
# target_cols[cancer][target_name] -> one-hot (indicator) table for that single
# target, so each target can be modelled on its own.
target_cols = {}

for cancer in cancer_types:
    ys = {}
    y = targets[cancer]
    
    for col in y.columns:
        # One indicator column per distinct value found in this target column.
        ys[col] = pd.get_dummies(y[col])
        
    # NOTE(review): after the loop `ys` stays bound to the LAST cancer's dict.
    # Later cells call baseline(X, ys) / random_search(X, ys, ...) and no other
    # visible cell defines `ys` -- confirm whether they rely on this leftover.
    target_cols[cancer] = ys

## Baseline accuracy

In [5]:
def baseline(X, ys):
    """Cross-validate an untuned single-hidden-layer MLP on every target.

    Parameters
    ----------
    X : array-like
        Feature matrix, passed unchanged to ``cross_validate``.
    ys : dict
        Maps a target name to the label table for that target. The
        "samples"-averaged precision/recall scorers need a multilabel
        indicator table -- presumably the ``pd.get_dummies`` output built
        earlier in the notebook; confirm against the caller.

    Returns
    -------
    scores : pandas.DataFrame
        One row per target (sorted by target name) and one column per metric
        (``test_accuracy``, ``test_precision``, ``test_recall``); every cell
        is the mean over the 10 cross-validation folds.
    chart : altair chart
        Bar chart of the same scores, faceted by metric.
    """
    metric_keys = ["test_accuracy", "test_precision", "test_recall"]

    # The scorers do not depend on the target, so build them once.
    scoring = {
        "accuracy": "accuracy",
        "precision": sklearn.metrics.make_scorer(
            sklearn.metrics.precision_score,
            average="samples",
            zero_division=0
        ),
        "recall": sklearn.metrics.make_scorer(
            sklearn.metrics.recall_score,
            average="samples",
            zero_division=0
        ),
    }

    results = {}
    for target, y in ys.items():
        mlp = sklearn.neural_network.MLPClassifier(
            hidden_layer_sizes=(16,),
            random_state=0,
        )
        results[target] = sklearn.model_selection.cross_validate(
            mlp,
            X,
            y,
            cv=10,
            scoring=scoring,
            n_jobs=-1,
        )

    # Average the per-fold scores directly instead of going through
    # DataFrame.applymap, which is deprecated in recent pandas releases.
    # Rows are metrics, columns are targets (same layout as before).
    scores = pd.DataFrame({
        target: {key: np.mean(cv_result[key]) for key in metric_keys}
        for target, cv_result in results.items()
    })

    # Long format (metric, target, score) for plotting.
    scores.index.name = "metric"
    chart_df = scores.reset_index().melt(
        id_vars="metric",
        var_name="target",
        value_name="score",
    )

    chart = alt.Chart(chart_df).mark_bar().encode(
        x="target",
        y=alt.Y(
            "score",
            scale=alt.Scale(
                domain=[0, 1]
            )
        ),
        color="target",
        column="metric"
    )

    # Returned table: targets as rows, metrics as columns.
    scores.columns.name = "target"
    scores.index.name = None
    scores = scores.T.sort_index()

    return scores, chart

In [6]:
# NOTE(review): `X` is not defined in any visible cell, and `ys` is only bound as
# a leftover loop variable of the one-hot encoding cell (it would hold the last
# cancer's targets). Define both explicitly (e.g. from `inputs` / `target_cols`)
# before this call so the notebook survives Restart & Run All.
baseline_scores, baseline_chart = baseline(X, ys)

In [7]:
# Mean 10-fold cross-validation scores of the untuned baseline, one row per target.
baseline_scores

Unnamed: 0_level_0,test_accuracy,test_precision,test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,0.330871,0.337121,0.343371
histologic_type,0.194886,0.194886,0.194886
recurrence_status,0.77017,0.773295,0.77642
success_last_follow-up,0.257008,0.258523,0.260038
survival_status,0.806723,0.811316,0.815909
tumor_stage,0.357765,0.376136,0.394508


In [8]:
# Bar chart of the same baseline scores, one facet per metric.
baseline_chart

## Optimize

In [9]:
# Running history that every random_search() call appends to; position 0 is the
# untuned baseline. The first list holds `baseline_scores` itself (not a copy),
# so in-place edits through the list are visible on `baseline_scores` too.
optimized_scores_list, optimized_params_list, optimized_charts_list = (
    [baseline_scores],
    ["defaults"],
    [baseline_chart],
)

In [34]:
def random_search(
    X, 
    ys,
    n_iter,
    mlp_max_iter,
    optimized_scores_list,
    optimized_params_list,
    optimized_charts_list,
    hidden_layer_width_range=None,
    num_hidden_layers_range=None,
    activation_funcs=None,
    alpha_range=None,
    learning_rate_range=None,
    momentum_range=None,
    set_params={},
    target_specific_params={},
):
    """Run a randomized MLP hyper-parameter search per target and record the outcome.

    For every target in ``ys`` a ``RandomizedSearchCV`` (10-fold, refit on
    accuracy) is fitted over the hyper-parameters whose range argument is not
    ``None``. Nothing is returned; one entry is appended to each of the three
    ``optimized_*_list`` arguments.

    Parameters
    ----------
    X : array-like
        Feature matrix, passed unchanged to the search.
    ys : dict
        Maps a target name to its label table (presumably one-hot encoded, as
        the "samples"-averaged scorers require -- confirm against the caller).
    n_iter : int
        Number of candidates sampled per target.
    mlp_max_iter : int
        ``max_iter`` of every ``MLPClassifier`` built here.
    optimized_scores_list, optimized_params_list, optimized_charts_list : list
        Mutated in place: receive the score table (targets x metrics), the
        ``{target: best_params_}`` dict and the bar chart of this search.
    hidden_layer_width_range, num_hidden_layers_range : (low, high), optional
        Searched only when BOTH are given; width and depth are drawn with
        ``rng.integers(low, high)``, i.e. ``high`` is exclusive. All layers of
        one candidate share the same width. Otherwise ``hidden_layer_sizes``
        falls back to ``(16,)`` unless ``set_params`` provides it.
    activation_funcs : list, optional
        Candidate activation names.
    alpha_range, learning_rate_range, momentum_range : (low, high), optional
        Bounds of uniform distributions for ``alpha``, ``learning_rate_init``
        and ``momentum``. NOTE(review): scikit-learn documents ``momentum`` as
        only used with ``solver='sgd'``; this function never sets ``solver``
        itself -- confirm the momentum search has any effect.
    set_params : dict
        Fixed ``MLPClassifier`` keyword arguments shared by all targets.
    target_specific_params : dict
        ``{target: {kwarg: value}}`` overrides applied on top of ``set_params``.

    Notes
    -----
    The recorded score of a target is the mean of ``mean_test_*`` over ALL
    sampled candidates, not the score of the best candidate, whereas the
    recorded parameters are ``best_params_``. NOTE(review): confirm this is
    the intended summary before comparing it with single-model scores.
    """
    # Work on a private copy: the original wrote the fallback layer size into
    # `set_params` itself, polluting the shared `{}` default and the caller's
    # dict. The literal defaults are kept only for interface compatibility.
    set_params = copy.deepcopy(set_params)

    # Generate our parameter distributions
    dist_size = 1000
    rng = np.random.default_rng(0)
    param_distributions = {}

    if activation_funcs is not None:
        param_distributions["activation"] = activation_funcs

    if alpha_range is not None:
        param_distributions["alpha"] = rng.uniform(*alpha_range, dist_size)

    if hidden_layer_width_range is not None and num_hidden_layers_range is not None:
        # Dedicated generator (also seeded with 0), exactly as before, so the
        # sampled architectures do not depend on which other ranges are given.
        layer_rng = np.random.default_rng(0)
        # Width is drawn before depth for every candidate; keep that order.
        param_distributions["hidden_layer_sizes"] = [
            (layer_rng.integers(*hidden_layer_width_range),) * layer_rng.integers(*num_hidden_layers_range)
            for _ in range(dist_size)
        ]
    else:
        set_params.setdefault("hidden_layer_sizes", (16,))

    if learning_rate_range is not None:
        param_distributions["learning_rate_init"] = rng.uniform(*learning_rate_range, dist_size)

    if momentum_range is not None:
        param_distributions["momentum"] = rng.uniform(*momentum_range, dist_size)

    # The scorers do not depend on the target, so build them once.
    scoring = {
        "accuracy": "accuracy",
        "precision": sklearn.metrics.make_scorer(
            sklearn.metrics.precision_score,
            average="samples",
            zero_division=0
        ),
        "recall": sklearn.metrics.make_scorer(
            sklearn.metrics.recall_score,
            average="samples",
            zero_division=0
        ),
    }

    scores = {}
    params = {}
    for target, y in ys.items():

        # Shared settings first, then the per-target overrides win on conflicts.
        mlp_params = copy.deepcopy(set_params)
        mlp_params.update(target_specific_params.get(target, {}))

        mlp = sklearn.neural_network.MLPClassifier(
            max_iter=mlp_max_iter,
            random_state=0,
            **mlp_params,
        )
        search = sklearn.model_selection.RandomizedSearchCV(
            estimator=mlp,
            param_distributions=param_distributions,
            n_iter=n_iter,
            random_state=0,
            cv=10,
            n_jobs=-1, # Use all processors
            scoring=scoring,
            refit="accuracy",
        ).fit(X, y)

        scores[target] = search.cv_results_
        params[target] = search.best_params_

    # Extract the overall scores: per target, average each metric over all
    # sampled candidates. Built in one go instead of growing a frame by joins.
    metric_columns = ['mean_test_accuracy', 'mean_test_precision', 'mean_test_recall']
    scores_df = pd.DataFrame({
        target: pd.DataFrame(cv_results)[metric_columns].mean(axis=0)
        for target, cv_results in scores.items()
    })

    # Make a chart
    scores_df.index.name = "metric"
    chart_df = scores_df.reset_index().melt(
        id_vars="metric",
        var_name="target",
        value_name="score",
    )

    chart = alt.Chart(chart_df).mark_bar().encode(
        x="target",
        y=alt.Y(
            "score",
            scale=alt.Scale(
                domain=[0, 1]
            )
        ),
        color="target",
        column="metric"
    )

    # Recorded table: targets as rows, metrics as columns.
    scores_df.columns.name = "target"
    scores_df.index.name = None
    scores_df = scores_df.T.sort_index()

    optimized_scores_list.append(scores_df)
    optimized_params_list.append(params)
    optimized_charts_list.append(chart)


In [11]:
# Joint search: every tunable hyper-parameter is sampled at once,
# 10 candidates per target. Results are appended to the optimized_* lists.
random_search(
    X, 
    ys,
    n_iter=10,
    mlp_max_iter=200,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
    hidden_layer_width_range=(2, 32),
    num_hidden_layers_range=(1, 10),
    activation_funcs=["identity", "logistic", "tanh", "relu"],
    alpha_range=(1e-6, 1),
    learning_rate_range=(1e-3, 0.5),
    momentum_range=(0.1, 1),
)



## Only optimize one parameter at a time

Activation function

In [12]:
# Activation function only; hidden_layer_sizes falls back to (16,) inside
# random_search. Only four distinct candidates exist, so n_iter=10 cannot
# produce more than four different settings.
random_search(
    X, 
    ys,
    n_iter=10,
    mlp_max_iter=200,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
    activation_funcs=["identity", "logistic", "tanh", "relu"],
)













Number of hidden layers and widths

In [14]:
# Network shape only, with the activation fixed to "logistic".
# Widths are drawn from 2..31 and depths from 1..9 (upper bounds are exclusive).
random_search(
    X, 
    ys,
    n_iter=100,
    mlp_max_iter=200,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
    hidden_layer_width_range=(2, 32),
    num_hidden_layers_range=(1, 10),
    set_params={
        "activation": "logistic"
    },
)

































































































































































































































Learning rate

In [36]:
# Initial learning rate only. The per-target hidden_layer_sizes are hard-coded --
# presumably the best shapes from the hidden-layer search above; verify against
# optimized_params_list. Targets with an empty dict fall back to (16,).
random_search(
    X, 
    ys,
    n_iter=100,
    mlp_max_iter=200,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
    learning_rate_range=(1e-3, 0.5),
    set_params={
        "activation": "logistic"
    },
    target_specific_params={
        'histologic_grade': {
            'hidden_layer_sizes': (12, 12, 12)
        },
        'histologic_type': {
        },
        'recurrence_status': {
            'hidden_layer_sizes': (22,)
        },
        'success_last_follow-up': {
            'hidden_layer_sizes': (24, 24, 24, 24, 24, 24, 24)
        },
        'survival_status': {
            'hidden_layer_sizes': (22,)
        },
        'tumor_stage': {
        }
    },
)





















Momentum

In [45]:
# Momentum only, reusing the per-target network shapes from the previous cell.
# NOTE(review): scikit-learn documents `momentum` as only used when
# solver='sgd'; no visible cell sets `solver`, so with the default solver this
# search likely has no effect on the fitted models -- confirm.
random_search(
    X, 
    ys,
    n_iter=100,
    mlp_max_iter=200,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
    momentum_range=(0.1, 1),
    set_params={
        "activation": "logistic"
    },
    target_specific_params={
        'histologic_grade': {
            'hidden_layer_sizes': (12, 12, 12)
        },
        'histologic_type': {
        },
        'recurrence_status': {
            'hidden_layer_sizes': (22,)
        },
        'success_last_follow-up': {
            'hidden_layer_sizes': (24, 24, 24, 24, 24, 24, 24)
        },
        'survival_status': {
            'hidden_layer_sizes': (22,)
        },
        'tumor_stage': {
        }
    },
)















































































































































































































































































































Alpha

In [78]:
# Alpha (L2 penalty) only. The fixed momentum value 0.1193679563027144 is
# presumably the best value reported by the momentum search -- verify against
# optimized_params_list[5]. histologic_type overrides the shared "logistic"
# activation with "relu".
random_search(
    X, 
    ys,
    n_iter=100,
    mlp_max_iter=200,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
    alpha_range=(1e-6, 1),
    set_params={
        "activation": "logistic"
    },
    target_specific_params={
        'histologic_grade': {
            'hidden_layer_sizes': (12, 12, 12),
            'momentum': 0.1193679563027144,
        },
        'histologic_type': {
            "activation": "relu",
        },
        'recurrence_status': {
            'hidden_layer_sizes': (22,)
        },
        'success_last_follow-up': {
            'hidden_layer_sizes': (24, 24, 24, 24, 24, 24, 24),
            'momentum': 0.1193679563027144
        },
        'survival_status': {
            'hidden_layer_sizes': (22,),
            'momentum': 0.1193679563027144
        },
        'tumor_stage': {
            'momentum': 0.1193679563027144,
        }
    },
)













































































































































































































































































































## Look at the results of our searches

In [15]:
# Entry 0 is the untuned baseline the history lists were seeded with.
optimized_scores_list[0]

Unnamed: 0_level_0,test_accuracy,test_precision,test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,0.330871,0.337121,0.343371
histologic_type,0.194886,0.194886,0.194886
recurrence_status,0.77017,0.773295,0.77642
success_last_follow-up,0.257008,0.258523,0.260038
survival_status,0.806723,0.811316,0.815909
tumor_stage,0.357765,0.376136,0.394508


In [16]:
# Entry 1: the joint search over all hyper-parameters (assuming the search cells
# were run once each, in the order they appear). Search scores are averaged over
# every sampled candidate, not taken from the best candidate.
optimized_scores_list[1]

Unnamed: 0_level_0,mean_test_accuracy,mean_test_precision,mean_test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,0.366856,0.382236,0.40125
histologic_type,0.141127,0.144062,0.159867
recurrence_status,0.765331,0.771108,0.776884
success_last_follow-up,0.308542,0.320766,0.347282
survival_status,0.799015,0.800545,0.802074
tumor_stage,0.205881,0.217036,0.228191


In [17]:
# Entry 2: the activation-function search (assuming cells ran in displayed order).
optimized_scores_list[2]

Unnamed: 0_level_0,mean_test_accuracy,mean_test_precision,mean_test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,0.356439,0.36031,0.364181
histologic_type,0.127154,0.127154,0.127154
recurrence_status,0.778527,0.781226,0.783925
success_last_follow-up,0.30599,0.306368,0.306747
survival_status,0.795099,0.796626,0.798153
tumor_stage,0.319957,0.328776,0.337595


In [32]:
# Entry 3: the hidden-layer shape search (assuming cells ran in displayed order).
optimized_scores_list[3]

Unnamed: 0_level_0,mean_test_accuracy,mean_test_precision,mean_test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,0.500896,0.500896,0.500896
histologic_type,0.0,0.0,0.0
recurrence_status,0.812848,0.812848,0.812848
success_last_follow-up,0.431542,0.431542,0.431542
survival_status,0.825223,0.825223,0.825223
tumor_stage,0.035501,0.035501,0.035501


In [18]:
# Rename the baseline columns to the search tables' names so the element-wise
# comparisons below align. NOTE: entry 0 is the very same object as
# `baseline_scores`, so this renames `baseline_scores` in place as well.
optimized_scores_list[0].columns = ["mean_test_accuracy", "mean_test_precision", "mean_test_recall"]

In [19]:
# Where does search 1 beat the baseline? False cells are blanked for readability.
(optimized_scores_list[1] > optimized_scores_list[0]).replace(False, "")

Unnamed: 0_level_0,mean_test_accuracy,mean_test_precision,mean_test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,True,True,True
histologic_type,,,
recurrence_status,,,True
success_last_follow-up,True,True,True
survival_status,,,
tumor_stage,,,


In [20]:
# Where does search 2 beat the baseline? False cells are blanked for readability.
(optimized_scores_list[2] > optimized_scores_list[0]).replace(False, "")

Unnamed: 0_level_0,mean_test_accuracy,mean_test_precision,mean_test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,True,True,True
histologic_type,,,
recurrence_status,True,True,True
success_last_follow-up,True,True,True
survival_status,,,
tumor_stage,,,


In [29]:
# Where does search 3 beat the baseline? False cells are blanked for readability.
(optimized_scores_list[3] > optimized_scores_list[0]).replace(False, "")

Unnamed: 0_level_0,mean_test_accuracy,mean_test_precision,mean_test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,True,True,True
histologic_type,,,
recurrence_status,True,True,True
success_last_follow-up,True,True,True
survival_status,True,True,True
tumor_stage,,,








In [41]:
# Where does search 4 beat the baseline? False cells are blanked for readability.
(optimized_scores_list[4] > optimized_scores_list[0]).replace(False, "")

Unnamed: 0_level_0,mean_test_accuracy,mean_test_precision,mean_test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,True,True,True
histologic_type,,,
recurrence_status,True,True,True
success_last_follow-up,True,True,True
survival_status,True,True,True
tumor_stage,,,


In [84]:
# Where does search 6 beat search 5 (the one recorded just before it)?
(optimized_scores_list[6] > optimized_scores_list[5]).replace(False, "")

Unnamed: 0_level_0,mean_test_accuracy,mean_test_precision,mean_test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,,,
histologic_type,True,True,True
recurrence_status,,,
success_last_follow-up,,,
survival_status,,,
tumor_stage,,,


In [83]:
# Signed score change from search 5 to search 6 (positive means search 6 is higher).
optimized_scores_list[6] - optimized_scores_list[5]

Unnamed: 0_level_0,mean_test_accuracy,mean_test_precision,mean_test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,0.0,0.0,0.0
histologic_type,0.200424,0.200424,0.200424
recurrence_status,0.0,0.0,0.0
success_last_follow-up,-0.007273,-0.007273,-0.007273
survival_status,0.0,0.0,0.0
tumor_stage,-0.025194,-0.025194,-0.025194


In [53]:
# All recorded charts (baseline first, then each search) laid out side by side.
alt.hconcat(*optimized_charts_list)



In [82]:
# Best parameters (by accuracy, the refit metric) found per target in search 6.
pprint.pprint(optimized_params_list[6])

{'histologic_grade': {'alpha': 0.02152092992750899},
 'histologic_type': {'alpha': 0.3623085881766186},
 'recurrence_status': {'alpha': 0.02152092992750899},
 'success_last_follow-up': {'alpha': 0.02152092992750899},
 'survival_status': {'alpha': 0.02152092992750899},
 'tumor_stage': {'alpha': 0.02152092992750899}}


In [24]:
# LaTeX export of the baseline table, rounded to four decimals. print() is
# deliberate here: the raw LaTeX source is what gets copied out.
print(baseline_scores.round(4).to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  mean\_test\_accuracy &  mean\_test\_precision &  mean\_test\_recall \\
target                 &                     &                      &                   \\
\midrule
histologic\_grade       &              0.3309 &               0.3371 &            0.3434 \\
histologic\_type        &              0.1949 &               0.1949 &            0.1949 \\
recurrence\_status      &              0.7702 &               0.7733 &            0.7764 \\
success\_last\_follow-up &              0.2570 &               0.2585 &            0.2600 \\
survival\_status        &              0.8067 &               0.8113 &            0.8159 \\
tumor\_stage            &              0.3578 &               0.3761 &            0.3945 \\
\bottomrule
\end{tabular}



In [25]:
# LaTeX export of the search-1 table, rounded to four decimals.
print(optimized_scores_list[1].round(4).to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  mean\_test\_accuracy &  mean\_test\_precision &  mean\_test\_recall \\
target                 &                     &                      &                   \\
\midrule
histologic\_grade       &              0.3669 &               0.3822 &            0.4012 \\
histologic\_type        &              0.1411 &               0.1441 &            0.1599 \\
recurrence\_status      &              0.7653 &               0.7711 &            0.7769 \\
success\_last\_follow-up &              0.3085 &               0.3208 &            0.3473 \\
survival\_status        &              0.7990 &               0.8005 &            0.8021 \\
tumor\_stage            &              0.2059 &               0.2170 &            0.2282 \\
\bottomrule
\end{tabular}



In [114]:
def final(X, ys, set_params={}, target_specific_params={},):
    """Cross-validate one MLP per target with hand-picked hyper-parameters.

    Parameters
    ----------
    X : array-like
        Feature matrix, passed unchanged to ``cross_validate``.
    ys : dict
        Maps a target name to its label table (presumably one-hot encoded, as
        the "samples"-averaged scorers require -- confirm against the caller).
    set_params : dict
        ``MLPClassifier`` keyword arguments shared by all targets. Never
        mutated: a deep copy is taken per target.
    target_specific_params : dict
        ``{target: {kwarg: value}}`` overrides applied on top of ``set_params``.

    Returns
    -------
    scores : pandas.DataFrame
        One row per target (sorted by target name) and one column per metric
        (``test_accuracy``, ``test_precision``, ``test_recall``); every cell
        is the mean over the 10 cross-validation folds.
    chart : altair chart
        Bar chart of the same scores, faceted by metric.
    """
    metric_keys = ["test_accuracy", "test_precision", "test_recall"]

    # The scorers do not depend on the target, so build them once.
    scoring = {
        "accuracy": "accuracy",
        "precision": sklearn.metrics.make_scorer(
            sklearn.metrics.precision_score,
            average="samples",
            zero_division=0
        ),
        "recall": sklearn.metrics.make_scorer(
            sklearn.metrics.recall_score,
            average="samples",
            zero_division=0
        ),
    }

    results = {}
    for target, y in ys.items():

        # Shared settings first, then the per-target overrides win on conflicts.
        mlp_params = copy.deepcopy(set_params)
        mlp_params.update(target_specific_params.get(target, {}))

        mlp = sklearn.neural_network.MLPClassifier(
            random_state=0,
            **mlp_params,
        )
        results[target] = sklearn.model_selection.cross_validate(
            mlp,
            X,
            y,
            cv=10,
            scoring=scoring,
            n_jobs=-1,
        )

    # Average the per-fold scores directly instead of going through
    # DataFrame.applymap, which is deprecated in recent pandas releases.
    # Rows are metrics, columns are targets (same layout as before).
    scores = pd.DataFrame({
        target: {key: np.mean(cv_result[key]) for key in metric_keys}
        for target, cv_result in results.items()
    })

    # Long format (metric, target, score) for plotting.
    scores.index.name = "metric"
    chart_df = scores.reset_index().melt(
        id_vars="metric",
        var_name="target",
        value_name="score",
    )

    chart = alt.Chart(chart_df).mark_bar().encode(
        x="target",
        y=alt.Y(
            "score",
            scale=alt.Scale(
                domain=[0, 1]
            )
        ),
        color="target",
        column="metric"
    )

    # Returned table: targets as rows, metrics as columns.
    scores.columns.name = "target"
    scores.index.name = None
    scores = scores.T.sort_index()

    return scores, chart

# Final evaluation with hand-picked settings per target. The shared defaults are
# a logistic activation and max_iter=1000; entries under a target override them.
# NOTE(review): `momentum` is documented by scikit-learn as only used with
# solver='sgd', which is not set here -- confirm these entries matter.
final_scores, final_chart = final(
    X, 
    ys,
    set_params={
        "activation": "logistic",
        "max_iter": 1000,
    },
    target_specific_params={
        'histologic_grade': {
            'hidden_layer_sizes': (12, 12, 12),
            'momentum': 0.1193679563027144,
        },
        'histologic_type': {
            "activation": "relu",
            'alpha': 0.3623085881766186,
        },
        # recurrence_status runs with relu and max_iter=200, i.e. library
        # defaults for everything else; the searched values are left disabled.
        'recurrence_status': {
#             'hidden_layer_sizes': (22,),
#             'alpha': 0.3623085881766186,
            "activation": "relu",
            "max_iter": 200,
        },
        'success_last_follow-up': {
            'hidden_layer_sizes': (24, 24, 24, 24, 24, 24, 24),
            'momentum': 0.1193679563027144,
        },
        # Same as recurrence_status: relu, max_iter=200, searched values disabled.
        'survival_status': {
#             'hidden_layer_sizes': (22,),
#             'momentum': 0.1193679563027144,
#             'alpha': 0.3623085881766186,
            "activation": "relu",
            "max_iter": 200,
        },
        'tumor_stage': {
            'momentum': 0.1193679563027144,
        }
    },
)

In [115]:
# Mean 10-fold cross-validation scores of the final per-target models.
final_scores

Unnamed: 0_level_0,test_accuracy,test_precision,test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,0.521496,0.521496,0.521496
histologic_type,0.593371,0.600994,0.608617
recurrence_status,0.754735,0.759328,0.76392
success_last_follow-up,0.517803,0.517803,0.517803
survival_status,0.772727,0.78035,0.787973
tumor_stage,0.366288,0.409233,0.452178


In [116]:
# Bar chart of the final scores, one facet per metric.
final_chart

In [117]:
# LaTeX export of the final table (unrounded, unlike the earlier exports).
print(final_scores.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  test\_accuracy &  test\_precision &  test\_recall \\
target                 &                &                 &              \\
\midrule
histologic\_grade       &       0.521496 &        0.521496 &     0.521496 \\
histologic\_type        &       0.593371 &        0.600994 &     0.608617 \\
recurrence\_status      &       0.754735 &        0.759328 &     0.763920 \\
success\_last\_follow-up &       0.517803 &        0.517803 &     0.517803 \\
survival\_status        &       0.772727 &        0.780350 &     0.787973 \\
tumor\_stage            &       0.366288 &        0.409233 &     0.452178 \\
\bottomrule
\end{tabular}



In [118]:
# Final scores again, shown directly above the baseline for comparison.
final_scores

Unnamed: 0_level_0,test_accuracy,test_precision,test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,0.521496,0.521496,0.521496
histologic_type,0.593371,0.600994,0.608617
recurrence_status,0.754735,0.759328,0.76392
success_last_follow-up,0.517803,0.517803,0.517803
survival_status,0.772727,0.78035,0.787973
tumor_stage,0.366288,0.409233,0.452178


In [119]:
# Baseline for comparison. Its columns read mean_test_* here because an earlier
# cell renamed them in place through optimized_scores_list[0].
baseline_scores

Unnamed: 0_level_0,mean_test_accuracy,mean_test_precision,mean_test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,0.330871,0.337121,0.343371
histologic_type,0.194886,0.194886,0.194886
recurrence_status,0.77017,0.773295,0.77642
success_last_follow-up,0.257008,0.258523,0.260038
survival_status,0.806723,0.811316,0.815909
tumor_stage,0.357765,0.376136,0.394508


In [120]:
# Align the column names with the (renamed) baseline, then mark every cell where
# the final model is at least as good as the baseline. NOTE: the rename mutates
# `final_scores` in place, so re-displaying it afterwards shows mean_test_* names.
final_scores.columns = ["mean_test_accuracy", "mean_test_precision", "mean_test_recall"]
(final_scores >= optimized_scores_list[0]).replace(False, "")

Unnamed: 0_level_0,mean_test_accuracy,mean_test_precision,mean_test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histologic_grade,True,True,True
histologic_type,True,True,True
recurrence_status,,,
success_last_follow-up,True,True,True
survival_status,,,
tumor_stage,True,True,True








