# Clean data

Impute missing values and min-max normalize the input features, then tidy the target labels.

In [24]:
import pandas as pd
import numpy as np
from copy import deepcopy

## Load data

In [2]:
# Cohort identifiers; each one is interpolated into the input/target TSV file names.
cancer_types = ["ccrcc", "endometrial", "hnscc", "lscc", "luad"]

In [3]:
# Read each cohort's raw input table (tab-separated, first column as the index)
# into a dict keyed by cancer type.
inputs = {}
for cancer in cancer_types:
    X = pd.read_csv(f'uncleaned_data/{cancer}_inputs.tsv', sep="\t", index_col=0)
    inputs[cancer] = X

In [35]:
# Read each cohort's raw target table (tab-separated, first column as the index).
targets = {
    cancer: pd.read_csv(f'uncleaned_data/{cancer}_targets.tsv', sep="\t", index_col=0)
    for cancer in cancer_types
}

# Print the value distribution, NaN included, of every lscc target column.
lscc_targets = targets['lscc']
for col in lscc_targets.columns:
    print(col)
    counts = lscc_targets[col].value_counts(dropna=False)
    print(counts.sort_index())
    print()

Recurrence status (1, yes; 0, no)
0.0    91
1.0    19
Name: Recurrence status (1, yes; 0, no), dtype: int64

Survial status (1, dead; 0, alive)
0.0    89
1.0    16
NaN     5
Name: Survial status (1, dead; 0, alive), dtype: int64

histologic_type
Adenosquamous Carcinoma; at least 66% squamous component             1
Adenosquamous carcinoma                                              1
Basaloid squamous cell carcinoma                                     2
Keratinizing squamous cell carcinoma                                18
Non-keratinizing squamous cell carcinoma                            14
Solid adenocarcinoma                                                 1
Spindle cell carcinoma with undifferentiated non small carcinoma     1
Squamous cell carcinoma                                             71
adenosquamous carcinoma                                              1
Name: histologic_type, dtype: int64

pathologic_staging_primary_tumor
t1     19
t2     63
t3     17
t4      2
NaN  

## Input: Impute unknowns and normalize

### Impute numerical column NaNs

Fill each missing value with the mean of its column.

In [5]:
# Replace every NaN with the mean of the column it sits in.
for cancer in cancer_types:
    X_raw = inputs[cancer]
    column_means = X_raw.mean(axis=0)
    inputs[cancer] = X_raw.fillna(column_means)

inputs['ccrcc']

Unnamed: 0_level_0,tumor_normal_residual_dist_ADCY3,tumor_normal_residual_dist_AGK,tumor_normal_residual_dist_AGXT,tumor_normal_residual_dist_AHSA1,tumor_normal_residual_dist_ALDH18A1,tumor_normal_residual_dist_ANKZF1,tumor_normal_residual_dist_AP3M1,tumor_normal_residual_dist_AP4S1,tumor_normal_residual_dist_APLP2,tumor_normal_residual_dist_APPL1,...,signed_orth_res_USP47,signed_orth_res_USP6NL,signed_orth_res_VPS25,signed_orth_res_WNK1,signed_orth_res_XPNPEP1,signed_orth_res_YARS2,signed_orth_res_ZDHHC2,signed_orth_res_ZEB1,signed_orth_res_ZNF358,signed_orth_res_ZNF397
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00004,4.746706,2.368666,13.190856,8.481922,7.053608,2.121463,1.227607,0.648589,90.507645,1.901647,...,0.010893,0.152850,0.181947,0.149241,0.422377,0.061650,0.370635,0.042980,-0.069231,-0.049632
C3L-00010,4.746706,0.160878,11.688750,11.935946,4.414980,2.268968,2.087734,1.128539,36.071283,5.221817,...,-0.075189,0.055203,0.050951,-0.086946,0.142795,-0.338331,0.370635,0.042980,0.141526,-0.049632
C3L-00011,4.746706,0.914093,3.568109,6.287237,34.092835,7.746025,5.285246,1.872484,5.217191,2.252784,...,-0.261528,-0.312138,-0.777187,0.605387,-0.179915,-0.343415,0.018595,0.042980,0.113805,-0.049632
C3L-00026,4.746706,1.699979,3.824380,3.670214,18.820047,5.686363,1.816493,0.364995,73.821719,4.882907,...,0.088498,0.155157,-0.068772,-0.096996,0.189214,0.779191,0.370635,0.288435,0.141526,-0.049632
C3L-00079,4.746706,0.600205,2.337069,1.420444,9.337463,3.583359,0.263818,1.039790,53.952419,7.534761,...,-0.097382,-0.313189,-0.276598,0.065601,0.052522,0.252166,0.370635,0.195492,0.141526,-0.049632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646,4.746706,0.780648,4.981830,7.273306,20.638650,5.568947,1.877280,1.558076,9.978579,5.104328,...,0.057308,-0.252448,-0.027133,0.098906,-0.123295,-0.373050,0.864411,0.042980,0.141526,-0.092570
C3N-01648,4.746706,1.179941,5.613505,0.705244,5.357198,0.381726,2.456724,0.426310,34.261414,4.428208,...,0.078408,-0.588844,0.005994,0.420178,-0.162881,0.089283,0.370635,0.042980,0.615288,-0.049632
C3N-01649,3.768251,0.663157,5.136188,3.093425,1.741500,2.624052,0.315908,0.386628,42.591365,0.622113,...,-0.187441,0.058879,-0.050599,-0.024938,-0.208821,0.035469,0.370635,-0.710171,0.141526,-0.049632
C3N-01651,2.546564,1.107103,10.179527,3.290355,23.025980,14.078832,1.001933,0.323635,71.024819,5.637702,...,0.021833,-0.144407,-0.415184,0.089418,0.032847,0.417992,0.370635,0.201946,0.141526,-0.078130


### Normalize numerical columns

In [6]:
# Min-max scale every column of each cohort's input table to [0, 1].
for cancer in cancer_types:
    X_num = inputs[cancer]
    mins = X_num.min(axis=0)
    maxs = X_num.max(axis=0)
    # A constant column has max == min; dividing by that zero range would turn
    # the whole column into NaN (0 / 0). Use a unit range for such columns so
    # they are mapped to 0 instead.
    ranges = (maxs - mins).replace(0, 1)
    inputs[cancer] = (X_num - mins) / ranges

### Save

In [7]:
# Write each cleaned input table as a tab-separated file.
for cancer in cancer_types:
    inputs_path = f'clean_data/{cancer}_inputs.tsv'
    inputs[cancer].to_csv(inputs_path, sep="\t")

## Targets

Convert the values of every string (object-dtype) column to lowercase.

In [7]:
def to_lowercase(y):
    """Lowercase the values of every object-dtype column of ``y``.

    Columns of any other dtype are left as they are. When at least one
    column is converted a new frame is returned; otherwise ``y`` itself
    is handed back.
    """
    lowered = {
        name: y[name].str.lower()
        for name in y.columns
        if y[name].dtype == "O"
    }
    if not lowered:
        return y
    return y.assign(**lowered)

Next, combine the similar groups in `histologic_grade` by keeping only the first two characters of each value.

In [8]:
def combine_grade(y):
    """Collapse ``histologic_grade`` labels to their first two characters.

    Returns a new frame; every other column is carried over unchanged.
    """
    grade_prefix = y["histologic_grade"].str[:2]
    return y.assign(histologic_grade=grade_prefix)

Also merge equivalent labels in `histologic_type`.

In [9]:
def combine_type(y):
    """Merge variant spellings of ``histologic_type`` into one label each.

    Only exact, whole-value matches are rewritten; every other value is
    kept as it is. Returns a new frame.
    """
    # variant label -> label it is folded into. Keys are matched verbatim,
    # including the trailing space and the "adenocaricnoma" spelling.
    # NOTE(review): presumably these mirror the raw data exactly - confirm.
    merged_labels = {
        "adenocarcinoma, acinar predominant ": "acinar adenocarcinoma",
        "mixed acinar and micropapillary adenocarcinoma": "adenocarcinoma, micropapillary and acinar",
        "lepidic adenocarcinoma, invasive mucinous adenocarcinoma": "lepidic adenocarcinoma",
        "adenosquamous carcinoma (approx. 75% adeno, 25% squamous)": "adenosquamous carcinoma",
        "squamous cell carcinoma, conventional": "squamous cell carcinoma",
        "adenocaricnoma, mixed sub-type (solid and acinar)": "adenocarcinoma, mixed subtype",
    }

    histologic_type = y["histologic_type"]
    # Apply the substitutions one after another, in the order listed.
    for variant, canonical in merged_labels.items():
        histologic_type = histologic_type.replace(to_replace=variant, value=canonical)

    return y.assign(histologic_type=histologic_type)

Shorten some column names.

In [10]:
def shorten_cols(y):
    """Return ``y`` with its long target column names shortened.

    Columns whose names are not listed in the mapping keep their names.
    The frame passed in is not modified; a renamed frame is returned.
    """
    # Keys are matched verbatim against the column names, including the
    # "Survial" spelling.
    short_names = {
        "measure_of_success_of_outcome_at_last_available_follow-up": "success_last_follow-up",
        "pathologic_staging_primary_tumor": "tumor_stage",
        "Recurrence status (1, yes; 0, no)": "recurrence_status",
        "Survial status (1, dead; 0, alive)": "survival_status",
    }

    # rename() builds a new frame, so the caller's object keeps its columns.
    return y.rename(columns=short_names)

### Address NaNs

Fill each missing value with the mode (most frequent value) of its column.

In [11]:
def fill_nan(y):
    """Fill the NaNs of each column with that column's mode.

    Only the first row of ``DataFrame.mode`` is used as the per-column
    fill value. Returns a new frame.
    """
    modes = y.mode(dropna=True)
    fill_values = modes.iloc[0, :]
    return y.fillna(fill_values)

### Get cleaned targets

In [12]:
# Run every cohort's target table through the cleaning helpers in order,
# then fill the remaining NaNs and store the result back in `targets`.
for cancer in cancer_types:
    y = targets[cancer]
    for step in (to_lowercase, combine_grade, combine_type, shorten_cols):
        y = step(y)
    targets[cancer] = fill_nan(y)

Check the finished product.

In [38]:
# Print the value distribution, NaN included, of every luad target column.
luad_targets = targets['luad']
for col in luad_targets.columns:
    print(col)
    counts = luad_targets[col].value_counts(dropna=False)
    print(counts.sort_index())
    print()

Recurrence status (1, yes; 0, no)
0.0    89
1.0    22
Name: Recurrence status (1, yes; 0, no), dtype: int64

Survial status (1, dead; 0, alive)
0.0    84
1.0    23
NaN     4
Name: Survial status (1, dead; 0, alive), dtype: int64

histologic_type
Acinar adenocarcinoma                                        25
Acinar adenocarcinoma and papillary adenocarcinoma            1
Adenocarcinoma                                               57
Adenocarcinoma, acinar predominant                            1
Adenocarcinoma, micropapillary and acinar                     1
Adenocarcinoma, mixed subtype                                 1
Adenocaricnoma, mixed sub-type (solid and acinar)             1
Adenosquamous carcinoma                                       1
Adenosquamous carcinoma (approx. 75% adeno, 25% squamous)     1
Colloid adenocarcinoma                                        1
Lepidic adenocarcinoma                                        2
Lepidic adenocarcinoma, Invasive mucinous adenocar

### Remove patients from targets that are not in inputs

In [22]:
# Keep only the target rows whose index labels appear in the matching input
# table, in the input table's order.
for cancer in cancer_types:
    ind = inputs[cancer].index.tolist()
    matched = targets[cancer].loc[ind]
    targets[cancer] = matched

Save the targets.

In [23]:
# Write each cleaned target table as a tab-separated file.
for cancer in cancer_types:
    targets_path = f'clean_data/{cancer}_targets.tsv'
    targets[cancer].to_csv(targets_path, sep="\t")