# Preprocessing Data

## Import Libraries

In [1]:
import csv
import numpy as np
import pandas as pd

tsv_file = open("brca_metabric_clinical_data.tsv")
read_tsv = csv.reader(tsv_file, delimiter="\t")

In [2]:
content=np.array([l for l in read_tsv])
content1=content.copy()
features=content[0,:]
data=content[1:,:]

## Checking if certain features are relevant or not

Certain features such as "sex" or "cancer type" only had one value for all the instances in the data. We remove these features from our dataset. 

In [3]:
## Check the number of missing values before the index 1986
#n_init,d_init=content.shape
#counts=np.zeros(d_init)
#for j in range(d_init):
#    for i in range(1,1986):
#        if content[i,j]=='NA':
#            counts[j]+=1
    #print(j,features[j],counts[j])

In [4]:
## Check if the feature "sex" is relevant
#count_notfemale = 0
#for i in range(1,1986):
#    if content[i,33] != 'Female':
#        count_notfemale += 1
#print(count_notfemale)


In [5]:
## Check if the feature "cancer type" is relevant
count_notbreastcancer = 0
for i in range(1,1986):
    if content[i,5] != 'Breast Cancer':
        count_notbreastcancer += 1
        #print(content[i,5])

## Since breast sarcoma is a very rare kind of breast cancer - 
## we will consider it the same and remove this feature as well

## Number of samples per patient = Primary for all instances
## Sample type is primary for all instances


## Removing unwanted features and indices 

Since we plan to implement supervised learning algorithms, we need the value for overall survival status for all instances. We remove the instances with missing overall survival status. 
We also replace the string 'NA' with the numpy value NaN for easier identification of missing values. 

In [6]:
## Check indices with 'NA' values for overall survival status
indices_remove = np.zeros(4)
j = 0
for i in range(1986):
    if content[i,26] == 'NA':
        indices_remove[j] = i
        j+=1
#print(indices_remove)

In [7]:
## Creating new data 
## first 1986 patient entries
data = content[1:1986,:]
##df=df.drop(index=[i for i in range(n-1) if df['Study ID'].loc[i]!='brca_metabric'])
df = pd.DataFrame(data,columns=features)

## Replace NA with NaN
df.replace('NA',np.nan, inplace=True)


In [8]:
## Removing the unwanted features and indices_remove
df=df.drop(index = indices_remove, columns=['Study ID','Patient ID','Sample ID', 'Cancer Type', 'Number of Samples Per Patient',
                    'Sample Type', 'Sex', 'Tumor Stage', 'Relapse Free Status (Months)'])
features=df.columns
df = df.reset_index()
df = df.drop('index', 1)
n,d=df.shape

## Removing the indices with survival record less than 10 years



In [9]:
## remove survival for less than 10 years
n, d = df.shape
for i in range(n):
    if df['Overall Survival (Months)'].isna()[i]:
        df = df.drop(i)
    elif float(df['Overall Survival (Months)'][i]) < 10*12 and df['Overall Survival Status'][i] == '0:LIVING':
        df = df.drop(i)
    elif float(df['Overall Survival (Months)'][i]) >= 10*12:
        df['Overall Survival Status'][i] = 1

## reset row indexes
df = df.reset_index()
df = df.drop('index', 1)

## replace living for 1 and deceased for -1
df.replace('0:LIVING',1, inplace = True)
df.replace('1:DECEASED',-1, inplace = True)

## remove redundant columns about survival
df = df.drop(['Overall Survival (Months)', 'Patient\'s Vital Status'],1)

## rename column for survival after 6 months
df = df.rename(columns={"Overall Survival Status": "Survival After 10 Years"})

## Replacing missing values for numerical features

The missing values for numerical features are replaced by the average of the feature over all instances 

In [10]:
# find numerical features
num_feat = ['Age at Diagnosis', 'Neoplasm Histologic Grade', 'Lymph nodes examined positive', 'Mutation Count', 
            'Nottingham prognostic index', 'Tumor Size']

k = len(num_feat)
means = np.zeros(k)
#print(df['Age at Diagnosis'][df['Age at Diagnosis'].notna()])
for i in range(k):
    # array of numerical values
    num_values = np.asfarray(df[num_feat[i]][df[num_feat[i]].notna()],float)
    mean = np.mean(num_values)
    # mean of values
    means[i] = mean
    # replace NaN with means
    df[num_feat[i]].replace(np.nan, means[i], inplace=True)


In [11]:
#df_ohe = pd.get_dummies(df[features[1]],prefix = features[1])
## pd.concat
#df_ohe

## Converting textual data to numerical data

We use one-hot encoding for converting textual features with less than 3 values to numerical features. The instances with values among positive/negative, left/right, yes/no, low/medium/high are easy to convert using one-hot encoding. We use Panda's built-in function get.dummies to implement this. 

We use label encoding for all other non-numerical features. 

In [12]:
## Create a list of indices with numerical features 
## Create a list of indices with non-numerical features

## Non numerical features with <= 3 values for one-hot encoding
non_num_feat = ['ER Status', 'Type of Breast Surgery', 'Cellularity', 'Chemotherapy', 'ER status measured by IHC', 
                    'HER2 Status',  'Hormone Therapy', 'Inferred Menopausal State', 'Primary Tumor Laterality',
                    'PR Status', 'Radio Therapy', 'Relapse Free Status']

df = pd.get_dummies(df, prefix = non_num_feat, columns = non_num_feat)

#for i in FeaturesIndices_NonNumericalValues:
#    df[features[i]] = pd.get_dummies(df[features[i]], prefix = features[i], drop_first = True)

In [13]:
## replace feature Integrative Cluster values
df.replace("4ER-", 3.7, inplace = True)
df.replace("4ER+", 4.3,  inplace = True)

In [14]:
## update features
features = df.columns.values

In [15]:
## replace categorical variables with many categories

cat_var = ['Cancer Type Detailed', 'Pam50 + Claudin-low subtype', 'HER2 status measured by SNP6',
               'Tumor Other Histologic Subtype', 'Oncotree Code', '3-Gene classifier subtype']

#set(df['Cancer Type Detailed'].values)

for label in cat_var:
    # set of categories
    cats = set(df[label].values)
    # number of categories
    #k = len(cats)
    # replace the categories in this label
    i = 1
    for cat in cats:
        df[label].replace(cat, i, inplace = True)
        # update count of categories
        i += 1

df.replace(np.nan, 0, inplace = True)
    

In [16]:
df.to_csv("preprocessed_dataset.tsv")
df.to_csv("preprocessed_dataset.csv")

In [17]:
n, d = df.shape

count = 0
for i in range(n):
    if df["Survival After 10 Years"].isin([-1])[i]:
        count+=1
count

758

In [18]:
df['Survival After 10 Years']

0       1
1       1
2       1
3      -1
4      -1
       ..
1706    1
1707   -1
1708    1
1709   -1
1710    1
Name: Survival After 10 Years, Length: 1711, dtype: int64