# Clean data

Normalize the input features and impute missing values in both the inputs and the targets.

In [1]:
import pandas as pd
import numpy as np

## Load data

In [2]:
# Cancer types to process; each name is used to build the path of its input file.
cancer_types = ["ccrcc", "endometrial", "hnscc", "lscc", "luad"]

In [7]:
# Load one input table per cancer type, keyed by cancer type name.
all_data = {}

# Fraction of missing values per column, per cancer type. The original cell
# computed this inside the loop and discarded the result, so it was never
# shown; keep it so it can be inspected.
missing_frac = {}

for cancer in cancer_types:
    X = pd.read_csv(f'{cancer}_inputs.tsv', sep="\t", index_col=0)
    missing_frac[cancer] = X.isna().mean(axis=0)
    all_data[cancer] = X

# NOTE(review): after this loop X is bound to the table of the LAST cancer type
# only, while later cells index X["cancer_type"] and treat X as the full input
# table. Confirm where the combined table is supposed to come from.

# Display the missing-value fractions (rows: columns of the inputs, columns: cancer types).
pd.DataFrame(missing_frac)

FileNotFoundError: [Errno 2] No such file or directory: 'ccrcc_inputs.tsv'

In [18]:
# Load the hnscc input table on its own (tab-separated, first column as index).
# As the last expression of the cell, the resulting DataFrame is displayed.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so the
# file was not present in the working directory when it last ran.
pd.read_csv('hnscc_inputs.tsv', sep="\t", index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: 'Users/Bryn/PayneLab/pancancer/deltaCorrML/hnscc_inputs.tsv'

In [3]:
# Read the target table (tab-separated, first column as index).
y = pd.read_csv("targets.tsv", sep="\t", index_col=0)

# For every target column, print how often each value occurs (NaN included),
# ordered by value, followed by a blank line.
for target_name in y.columns:
    tally = y[target_name].value_counts(dropna=False)
    print(target_name)
    print(tally.sort_index())
    print()

Recurrence status (1, yes; 0, no)
0.0    265
1.0     61
Name: Recurrence status (1, yes; 0, no), dtype: int64

Survial status (1, dead; 0, alive)
0.0    260
1.0     57
NaN      9
Name: Survial status (1, dead; 0, alive), dtype: int64

histologic_grade
G1 Well differentiated                                                                            25
G1: Nuclei round, uniform, approximately 10µm; nucleoli inconspicuous                              3
G2 Moderately differentiated                                                                     135
G2: Nuclei slightly irregular, approximately 15µm; nucleoli evident                               35
G3 Poorly differentiated                                                                          89
G3: Nuclei very irregular, approximately 20µm; nucleoli large and prominent                       28
G4: Nuclei bizarre and multilobulated, 20µm or greater; nucleoli prominent, chromatin clumped      9
GX Grade cannot be assessed              

## Input: Impute unknowns and normalize

### Split table

Since different column types require different operations, we're going to split the dataframe based on column type.

In [4]:
# Group the input columns by the kind of cleaning they need.
cat_cols = ["cancer_type"]  # Categorical

# Boolean: every column whose name starts with "above_reg_line_"
bool_cols = X.columns[
    X.columns.str.startswith("above_reg_line_")
]

# Numerical: the per-feature residual differences plus the two correlation columns
num_cols = [
    *X.columns[X.columns.str.startswith("tumor_normal_residual_diff_")],
    "tumor_normal_residuals_corr",
    "prot_RNA_tumor_normal_ratios_corr",
]

In [5]:
# Take an independent copy of each column group so that later edits to the
# pieces do not touch X.
X_cat, X_bool, X_num = (
    X[group].copy() for group in (cat_cols, bool_cols, num_cols)
)

### Categorical column is just cancer type, doesn't need any work

In [6]:
# Count the rows for each cancer type; dropna=False means missing values, if
# any, are counted as their own entry rather than silently dropped.
X_cat.value_counts(dropna=False)

cancer_type
luad           101
lscc            94
ccrcc           75
hnscc           42
endometrial     14
dtype: int64

### Impute boolean column NaNs

Fill NaNs in boolean columns with a random array of trues and falses, with the same proportion of trues and falses as the original non-NaN cells.

In [7]:
# Fill the NaNs of every boolean column with randomly placed True/False values
# in the same proportion as the column's observed (non-NaN) values.

# Create the generator once, outside the loop. Re-creating it with the same
# seed for every column would restart the identical random stream each time,
# so columns with equal fill counts would get exactly the same True/False
# pattern in their missing rows.
rng = np.random.default_rng(0)

for col in bool_cols:

    # Locate and count the NaNs to fill
    nan_mask = X_bool[col].isna()
    nan_ct = int(nan_mask.sum())

    # Nothing to impute in this column
    if nan_ct == 0:
        continue

    # With no observed values there is no proportion to copy; fail with a clear
    # message instead of the opaque error round(nan) would raise below.
    if nan_mask.all():
        raise ValueError(
            f"Column {col!r} contains only NaN; cannot infer a True/False proportion"
        )

    # Find the proportion of trues in the original non-NaN values
    true_prop = X_bool[col].mean()

    # Create an array of size nan_ct with the same proportion of trues and falses
    true_ct = int(round(true_prop * nan_ct))
    false_ct = nan_ct - true_ct
    fill_arr = np.append(np.full(true_ct, True), np.full(false_ct, False))

    # Shuffle the array so the trues are not all assigned to the first NaN rows
    rng.shuffle(fill_arr)

    # Assign to the NaN positions of the column
    X_bool.loc[nan_mask, col] = fill_arr

### Impute numerical column NaNs

We'll just fill with the column mean.

In [8]:
# X_num.mean(axis=0) yields one mean per column; fillna then replaces each NaN
# with the mean of the column it sits in.
X_num = X_num.fillna(X_num.mean(axis=0))

### Normalize numerical columns

In [9]:
# Min-max scale every numerical column: (value - column min) / (column max - column min).
mins = X_num.min(axis=0)
maxs = X_num.max(axis=0)

# A constant column has max == min, and dividing by that zero range would turn
# the entire column into NaN (0 / 0). Divide by 1 instead so such a column
# becomes all zeros; columns with a non-zero range are unaffected.
ranges = (maxs - mins).replace(0, 1)

X_num = (X_num - mins) / ranges

### Join tables and save

In [10]:
# Reassemble the three cleaned pieces side by side on their shared index:
# categorical first, then boolean, then numerical.
X_joined = (
    X_cat
    .join(X_bool)
    .join(X_num)
)

# Write the cleaned inputs as a tab-separated file.
X_joined.to_csv("inputs_cleaned.tsv", sep="\t")

## Targets

Convert the values in every string (object-dtype) column to lowercase.

In [11]:
# Lowercase the values of every object-dtype column in one assign call;
# columns of any other dtype are left as they are.
y = y.assign(**{
    name: y[name].str.lower()
    for name in y.columns
    if y[name].dtype == "O"
})

First we're going to combine the similar groups in histologic_grade.

In [12]:
# Keep only the first two characters of each grade, so that the differently
# worded descriptions of the same grade collapse into one group.
y = y.assign(
    histologic_grade=y["histologic_grade"].str.slice(stop=2),
)

Also group similar groups in histologic_type.

In [13]:
# Map each variant label (left) onto the group it belongs to (right). Keys are
# matched against whole cell values; labels not listed are left unchanged.
# The keys are kept exactly as written, including the trailing space and the
# "adenocaricnoma" spelling.
y = y.assign(
    histologic_type=y["histologic_type"].replace({
        "adenocarcinoma, acinar predominant ":
            "acinar adenocarcinoma",
        "mixed acinar and micropapillary adenocarcinoma":
            "adenocarcinoma, micropapillary and acinar",
        "lepidic adenocarcinoma, invasive mucinous adenocarcinoma":
            "lepidic adenocarcinoma",
        "adenosquamous carcinoma (approx. 75% adeno, 25% squamous)":
            "adenosquamous carcinoma",
        "squamous cell carcinoma, conventional":
            "squamous cell carcinoma",
        "adenocaricnoma, mixed sub-type (solid and acinar)":
            "adenocarcinoma, mixed subtype",
    })
)

Shorten some column names.

In [14]:
# Give the verbose target columns short names; columns not listed keep theirs.
y = y.rename(columns={
    "measure_of_success_of_outcome_at_last_available_follow-up": "success_last_follow-up",
    "pathologic_staging_primary_tumor": "tumor_stage",
    "Recurrence status (1, yes; 0, no)": "recurrence_status",
    "Survial status (1, dead; 0, alive)": "survival_status",
})

### Address NaNs

We'll just fill with the column mode.

In [15]:
# y.mode() returns one row per tied mode; its first row holds one mode per
# column, which is used as that column's fill value.
y = y.fillna(value=y.mode(dropna=True).iloc[0])

Check the finished product.

In [16]:
# Print, for each cleaned target column, its name, the count of every value
# (NaN included) ordered by value, and a trailing blank line.
for name in y.columns:
    counts = y[name].value_counts(dropna=False).sort_index()
    print(name, counts, "", sep="\n")

recurrence_status
0.0    265
1.0     61
Name: recurrence_status, dtype: int64

survival_status
0.0    269
1.0     57
Name: survival_status, dtype: int64

histologic_grade
g1     28
g2    170
g3    117
g4      9
gx      2
Name: histologic_grade, dtype: int64

histologic_type
acinar adenocarcinoma                                  25
acinar adenocarcinoma and papillary adenocarcinoma      1
adenocarcinoma                                         53
adenocarcinoma with neuroendocrine differentiation      1
adenocarcinoma, micropapillary and acinar               2
adenocarcinoma, mixed subtype                           2
adenosquamous carcinoma                                 5
basaloid squamous cell carcinoma                        1
clear cell renal cell carcinoma                        73
colloid adenocarcinoma                                  1
endometrioid carcinoma                                 12
keratinizing squamous cell carcinoma                   17
lepidic adenocarcinoma       

Save the targets.

In [17]:
# Write the cleaned targets, index included, as a tab-separated file.
y.to_csv("targets_cleaned.tsv", sep="\t")