# MLP classifier

Let's set a baseline accuracy with the default MLP, and then do some optimization.

In [3]:
import pandas as pd
import sklearn.neural_network

## Load data

In [19]:
# Read the feature table; the first column of the TSV becomes the row index.
X = pd.read_csv("inputs.tsv", sep="\t", index_col=0)

# Fraction of missing values per column (mean of the boolean NaN mask).
X.isna().mean(axis=0)

cancer_type                            0.000000
tumor_normal_residuals_corr            0.000000
prot_RNA_tumor_normal_ratios_corr      0.000000
above_reg_line_ACSS1_Normal            0.000000
above_reg_line_ACSS1_Tumor             0.000000
above_reg_line_C1orf116_Normal         0.417178
above_reg_line_C1orf116_Tumor          0.417178
above_reg_line_CARD9_Normal            0.288344
above_reg_line_CARD9_Tumor             0.288344
above_reg_line_CDKN2A_Normal           0.230061
above_reg_line_CDKN2A_Tumor            0.230061
above_reg_line_CFI_Normal              0.957055
above_reg_line_CFI_Tumor               0.957055
above_reg_line_DYNC1LI1_Normal         0.539877
above_reg_line_DYNC1LI1_Tumor          0.539877
above_reg_line_ECH1_Normal             0.128834
above_reg_line_ECH1_Tumor              0.128834
above_reg_line_ECT2_Normal             0.000000
above_reg_line_ECT2_Tumor              0.000000
above_reg_line_FAM57A_Normal           0.481595
above_reg_line_FAM57A_Tumor            0

In [20]:
# Read the target table; the first column of the TSV becomes the row index.
y = pd.read_csv("targets.tsv", sep="\t", index_col=0)

# Print the level counts of every target column, keeping NaN as its own level
# so missing labels are visible.
for col, values in y.items():
    counts = values.value_counts(dropna=False)
    print(col)
    print(counts.sort_index())
    print()

Recurrence status (1, yes; 0, no)
0.0    265
1.0     61
Name: Recurrence status (1, yes; 0, no), dtype: int64

Survial status (1, dead; 0, alive)
0.0    260
1.0     57
NaN      9
Name: Survial status (1, dead; 0, alive), dtype: int64

histologic_grade
G1 Well differentiated                                                                            25
G1: Nuclei round, uniform, approximately 10µm; nucleoli inconspicuous                              3
G2 Moderately differentiated                                                                     135
G2: Nuclei slightly irregular, approximately 15µm; nucleoli evident                               35
G3 Poorly differentiated                                                                          89
G3: Nuclei very irregular, approximately 20µm; nucleoli large and prominent                       28
G4: Nuclei bizarre and multilobulated, 20µm or greater; nucleoli prominent, chromatin clumped      9
GX Grade cannot be assessed              

## Impute unknowns and normalize

### Split table

Since different column types require different operations, we're going to split the dataframe based on column type.

In [37]:
# Categorical features.
cat_cols = ["cancer_type"]

# Boolean features: every column named "above_reg_line_*" (kept as a pandas Index).
bool_cols = X.columns[X.columns.str.startswith("above_reg_line_")]

# Numerical features: every "tumor_normal_residual_diff_*" column followed by
# the two correlation columns, collected into a plain list.
num_cols = [
    *X.columns[X.columns.str.startswith("tumor_normal_residual_diff_")],
    "tumor_normal_residuals_corr",
    "prot_RNA_tumor_normal_ratios_corr",
]

In [38]:
# One sub-frame per column type, in the order categorical / boolean / numerical.
X_cat, X_bool, X_num = (X[cols] for cols in (cat_cols, bool_cols, num_cols))

### The only categorical column is cancer type; it has no missing values, so it needs no imputation or normalization

In [43]:
# Count rows per cancer type; dropna=False keeps missing values as their own
# row instead of silently dropping them from the tally.
X_cat.value_counts(dropna=False)

cancer_type
luad           101
lscc            94
ccrcc           75
hnscc           42
endometrial     14
dtype: int64

### Address boolean columns

We'll fill the NaNs in each column with that column's mode. Boolean columns need no normalization.

In [44]:
# Most frequent value of each boolean column. mode() returns one row per modal
# value, so a column whose values tie gets an extra row while every other
# column is NaN-padded in that row.
# NOTE(review): when filling NaNs from this result, select a single row
# (e.g. the first) rather than passing the whole frame -- confirm downstream.
X_bool.mode()

Unnamed: 0,above_reg_line_ACSS1_Normal,above_reg_line_ACSS1_Tumor,above_reg_line_C1orf116_Normal,above_reg_line_C1orf116_Tumor,above_reg_line_CARD9_Normal,above_reg_line_CARD9_Tumor,above_reg_line_CDKN2A_Normal,above_reg_line_CDKN2A_Tumor,above_reg_line_CFI_Normal,above_reg_line_CFI_Tumor,...,above_reg_line_SLC1A5_Normal,above_reg_line_SLC1A5_Tumor,above_reg_line_TALDO1_Normal,above_reg_line_TALDO1_Tumor,above_reg_line_THADA_Normal,above_reg_line_THADA_Tumor,above_reg_line_TXLNG_Normal,above_reg_line_TXLNG_Tumor,above_reg_line_USP7_Normal,above_reg_line_USP7_Tumor
0,False,False,False,False,True,True,True,True,False,True,...,True,True,False,False,False,False,True,True,False,True
1,,,,,,,,,True,,...,,,,,,,,,,


## Baseline accuracy

In [4]:
# Bare expression: only displays the class object to confirm the import
# resolves; no model is constructed or fitted here.
sklearn.neural_network.MLPClassifier

sklearn.neural_network._multilayer_perceptron.MLPClassifier