## Breast Cancer Study - Preprocessing and Training Data Development

The data from the NKI breast cancer dataset will be prepared below for fitting models.

In [1]:
#load necessary packages
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Display the kernel's current working directory so relative file paths below can be checked
os.getcwd()

'/Users/shannonballard/Springboard/Springboard_Capstone_2'

In [3]:
# Resolve the project directory without hard-coding a user-specific absolute path.
# Set the NKI_BC_PROJECT_DIR environment variable to point at the folder that holds
# the CSV files; when it is unset, the current working directory is used as-is.
path = os.environ.get("NKI_BC_PROJECT_DIR", os.getcwd())
os.chdir(path)

In [4]:
# Load the cleaned NKI breast cancer data into a pandas DataFrame, using the first
# CSV column as the row index, then preview the first five rows
nki_bc_cleaned = pd.read_csv('nki_bc_cleaned.csv', index_col = 0)
nki_bc_cleaned.head()

Unnamed: 0,age,eventdeath,survival,timerecurrence,chemo,hormonal,amputation,histtype,diam,posnodes,...,Contig36312_RC,Contig38980_RC,NM_000853,NM_000854,NM_000860,Contig29014_RC,Contig46616_RC,NM_000888,NM_000898,AF067420
0,43,0,14.817248,14.817248,0,0,1,1,25,0,...,0.591103,-0.355018,0.373644,-0.76069,-0.164025,-0.038726,0.237856,-0.087631,-0.369153,0.153795
1,48,0,14.261465,14.261465,0,0,0,1,20,0,...,-0.199829,-0.001635,-0.062922,-0.682204,-0.220934,-0.100088,-0.466537,-0.231547,-0.643019,-0.014098
2,38,0,6.644764,6.644764,0,0,0,1,15,0,...,0.328736,-0.047571,0.084228,-0.69595,-0.40284,-0.099965,0.110155,-0.114298,0.258495,-0.198911
3,50,0,7.748118,7.748118,0,1,0,1,15,1,...,0.648861,-0.039088,0.182182,-0.52464,0.03732,-0.167688,-0.01679,-0.285344,-0.251188,0.86271
4,38,0,6.436687,6.31896,0,0,1,1,15,0,...,-0.287538,-0.286893,0.057082,-0.565021,-0.105632,-0.108148,-0.405853,-0.053601,-0.677072,0.13416


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
nki_bc_cleaned.describe()

Unnamed: 0,age,eventdeath,survival,timerecurrence,chemo,hormonal,amputation,histtype,diam,posnodes,...,Contig36312_RC,Contig38980_RC,NM_000853,NM_000854,NM_000860,Contig29014_RC,Contig46616_RC,NM_000888,NM_000898,AF067420
count,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,...,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0
mean,44.047794,0.283088,8.080609,7.250433,0.393382,0.132353,0.441176,1.110294,22.529412,1.341912,...,-0.137075,-0.049257,-0.109779,-0.222577,-0.116336,-0.015141,-0.057222,-0.038441,-0.240416,-0.252884
std,5.464538,0.451329,3.904874,4.177462,0.489401,0.339499,0.497443,0.545668,8.703345,2.108848,...,0.414608,0.248084,0.434717,0.421388,0.274243,0.323474,0.297915,0.24602,0.482092,0.472775
min,26.0,0.0,0.711841,0.271047,0.0,0.0,0.0,1.0,2.0,0.0,...,-1.071389,-0.641482,-1.363426,-1.252298,-0.841332,-2.0,-0.871728,-0.593263,-1.67947,-2.0
25%,40.75,0.0,5.499738,4.389459,0.0,0.0,0.0,1.0,15.0,0.0,...,-0.462427,-0.219615,-0.247377,-0.567609,-0.271186,-0.099996,-0.253693,-0.185253,-0.579311,-0.554709
50%,45.0,0.0,7.359343,6.950034,0.0,0.0,0.0,1.0,20.0,0.0,...,-0.098875,-0.055215,0.008589,-0.223481,-0.137765,-0.002994,-0.052311,-0.07193,-0.213541,-0.254435
75%,49.0,1.0,10.512662,9.986311,1.0,0.0,1.0,1.0,29.25,2.0,...,0.147836,0.119754,0.162106,0.072289,-0.022088,0.074571,0.166774,0.100221,0.116691,0.051704
max,53.0,1.0,18.340862,18.340862,1.0,1.0,1.0,7.0,50.0,13.0,...,0.739614,0.635155,1.009289,1.324539,1.173276,2.0,0.59188,0.99396,1.298576,0.988492


### Categorical Features

If the dataset contains categorical features, dummy features will be created for future model development.

Note that there are several categorical columns that are represented with either 0 or 1 values (eventdeath, chemo, hormonal, and amputation). Dummy variables will not be generated for these columns, as they are already given values of 0 or 1.

In [6]:
# List the integer-typed columns as candidates for categorical features.
# The categorical codes in this dataset are stored as integers rather than strings,
# so the dtype filter is 'int64' instead of 'object'.
nki_bc_cleaned.select_dtypes(include='int64')

Unnamed: 0,age,eventdeath,chemo,hormonal,amputation,histtype,diam,posnodes,grade,angioinv,lymphinfil
0,43,0,0,0,1,1,25,0,2,3,1
1,48,0,0,0,0,1,20,0,3,3,1
2,38,0,0,0,0,1,15,0,2,1,1
3,50,0,0,1,0,1,15,1,2,3,1
4,38,0,0,0,1,1,15,0,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...
267,48,1,1,0,1,1,30,0,3,1,3
268,39,1,0,0,1,1,30,0,2,1,1
269,50,1,0,0,1,1,27,0,3,1,1
270,52,1,0,1,1,1,28,0,3,1,1


## Description of Columns

| Variable |Details| Type |
| --- | --- | --- |
|age | Age at which patient was diagnosed with breast cancer | Continuous |
|eventdeath | 0 = alive, 1 = death | Categorical |
|survival | Time (in years) until death or last follow-up | Continuous |
|timerecurrence | Time (in years) until cancer recurrence or last follow-up | Continuous |
|chemo | chemotherapy used (yes=1/no=0) | Categorical |
|hormonal | Hormonal therapy used (yes=1/no=0) | Categorical |
|amputation | Mastectomy (yes = 1/no = 0) | Categorical |
|histtype | Histological grade based on 3 morphological features | Categorical |
|diam | Diameter of primary tumor | Continuous |
|posnodes | number of lymph nodes that contained cancerous cells | Continuous |
|grade | Pathological grade based on cell differentiation & growth rate (1=low, 2=intermediate, 3=high) | Categorical |
|angioinv | Vascular invasion 1= absent, 2= minor, 3 = major | Categorical |
|lymphinfil | level of lymphocytic infiltration | Categorical |
|1,554 gene expression levels | each gene is provided as an individual variable; given as an intensity ratio to that of reference pool | Continuous |

In [7]:
# Inspect the distinct codes in each multi-level categorical column to decide
# whether that column is a candidate for dummy encoding
columns = ['histtype', 'grade', 'angioinv', 'lymphinfil']

for column, values in nki_bc_cleaned[columns].items():
    unique_values = values.unique()
    print('The unique values for ', column, 'are: ', unique_values)

The unique values for  histtype are:  [1 2 5 7 4]
The unique values for  grade are:  [2 3 1]
The unique values for  angioinv are:  [3 1 2]
The unique values for  lymphinfil are:  [1 2 3]


In [8]:
# Show how many samples fall into each code of the multi-level categorical columns
columns = ['histtype', 'grade', 'angioinv', 'lymphinfil']

for column, values in nki_bc_cleaned[columns].items():
    value_count = values.value_counts()
    print('The value counts for ', column, 'are: \n', value_count)

The value counts for  histtype are: 
 1    254
2     14
4      2
7      1
5      1
Name: histtype, dtype: int64
The value counts for  grade are: 
 3    106
2     95
1     71
Name: grade, dtype: int64
The value counts for  angioinv are: 
 1    169
3     73
2     30
Name: angioinv, dtype: int64
The value counts for  lymphinfil are: 
 1    223
2     27
3     22
Name: lymphinfil, dtype: int64


### Dummy Variables

Dummy variables will be made for all of the above columns except histtype. Because nearly all of its values are 1, and because no information is available on what the numeric codes in this column represent, dummy variables will not be made for histtype.

In [9]:
# Make dummy variables for the categorical columns grade, angioinv, and lymphinfil.
# histtype is not passed to get_dummies, so it stays a single integer column.
nki_bc_dummies = pd.get_dummies(nki_bc_cleaned, prefix=['grade', 'angioinv', 'lymphinfil'], columns=['grade', 'angioinv', 'lymphinfil'])
nki_bc_dummies.head()

Unnamed: 0,age,eventdeath,survival,timerecurrence,chemo,hormonal,amputation,histtype,diam,posnodes,...,AF067420,grade_1,grade_2,grade_3,angioinv_1,angioinv_2,angioinv_3,lymphinfil_1,lymphinfil_2,lymphinfil_3
0,43,0,14.817248,14.817248,0,0,1,1,25,0,...,0.153795,0,1,0,0,0,1,1,0,0
1,48,0,14.261465,14.261465,0,0,0,1,20,0,...,-0.014098,0,0,1,0,0,1,1,0,0
2,38,0,6.644764,6.644764,0,0,0,1,15,0,...,-0.198911,0,1,0,1,0,0,1,0,0
3,50,0,7.748118,7.748118,0,1,0,1,15,1,...,0.86271,0,1,0,0,0,1,1,0,0
4,38,0,6.436687,6.31896,0,0,1,1,15,0,...,0.13416,0,1,0,0,1,0,1,0,0


### Standardize the magnitude of the numeric features using a scaler

The 'eventdeath' feature will be the response variable, so it will be removed and set as y.

Because the values in the columns have different magnitudes and because the data is both categorical and continuous in nature, the data will be standardized. 

In [10]:
# List the column labels after dummy encoding to confirm the new grade_*, angioinv_* and lymphinfil_* columns
nki_bc_dummies.columns

Index(['age', 'eventdeath', 'survival', 'timerecurrence', 'chemo', 'hormonal',
       'amputation', 'histtype', 'diam', 'posnodes',
       ...
       'AF067420', 'grade_1', 'grade_2', 'grade_3', 'angioinv_1', 'angioinv_2',
       'angioinv_3', 'lymphinfil_1', 'lymphinfil_2', 'lymphinfil_3'],
      dtype='object', length=1573)

#### Create separate dataframe for variables that won't be scaled
Because some variables (eventdeath, chemo, hormonal, and amputation) are already coded as 1 or 0 (yes or no), they will be treated as dummy variables and left unscaled.

The gene expression levels were previously standardized on a scale of -2 to +2, so they will not be included in the variables that need to be scaled.

Thus, the dummy variables and the genes will be combined into another df

In [11]:
# Split the encoded frame into (a) the 0/1 indicator columns and (b) the gene-expression columns.
# The column names are listed once here and reused for both the selection and the drop.
binary_and_dummy_names = [
    'eventdeath', 'chemo', 'hormonal', 'amputation',
    'grade_1', 'grade_2', 'grade_3',
    'angioinv_1', 'angioinv_2', 'angioinv_3',
    'lymphinfil_1', 'lymphinfil_2', 'lymphinfil_3',
]
clinical_names = ['age', 'survival', 'timerecurrence', 'histtype', 'diam', 'posnodes']

# Indicator columns: the pre-existing binary flags plus the generated dummies
dummy_columns = nki_bc_dummies[binary_and_dummy_names]

# Gene columns: whatever is left once the indicator and clinical columns are dropped
gene_columns = nki_bc_dummies.drop(columns=binary_and_dummy_names + clinical_names)

In [12]:
# Concatenate dummy_columns and gene_columns side by side (axis=1) into one frame
# holding every column that will NOT be passed through the scaler
dummy_genes_df = pd.concat([dummy_columns, gene_columns], axis=1)

dummy_genes_df.head()

Unnamed: 0,eventdeath,chemo,hormonal,amputation,grade_1,grade_2,grade_3,angioinv_1,angioinv_2,angioinv_3,...,Contig36312_RC,Contig38980_RC,NM_000853,NM_000854,NM_000860,Contig29014_RC,Contig46616_RC,NM_000888,NM_000898,AF067420
0,0,0,0,1,0,1,0,0,0,1,...,0.591103,-0.355018,0.373644,-0.76069,-0.164025,-0.038726,0.237856,-0.087631,-0.369153,0.153795
1,0,0,0,0,0,0,1,0,0,1,...,-0.199829,-0.001635,-0.062922,-0.682204,-0.220934,-0.100088,-0.466537,-0.231547,-0.643019,-0.014098
2,0,0,0,0,0,1,0,1,0,0,...,0.328736,-0.047571,0.084228,-0.69595,-0.40284,-0.099965,0.110155,-0.114298,0.258495,-0.198911
3,0,0,1,0,0,1,0,0,0,1,...,0.648861,-0.039088,0.182182,-0.52464,0.03732,-0.167688,-0.01679,-0.285344,-0.251188,0.86271
4,0,0,0,1,0,1,0,0,1,0,...,-0.287538,-0.286893,0.057082,-0.565021,-0.105632,-0.108148,-0.405853,-0.053601,-0.677072,0.13416


In [13]:
# Define which columns should be scaled: the clinical columns that are neither
# 0/1 indicators nor gene-expression levels
columns_to_scale  = ['age', 'survival', 'timerecurrence', 'histtype', 'diam', 'posnodes']

# NOTE(review): histtype is described as categorical in the column table but is
# standardized here as if it were numeric — confirm this is intended
columns_scaled = nki_bc_dummies[columns_to_scale]

columns_scaled.head()

Unnamed: 0,age,survival,timerecurrence,histtype,diam,posnodes
0,43,14.817248,14.817248,1,25,0
1,48,14.261465,14.261465,1,20,0
2,38,6.644764,6.644764,1,15,0
3,50,7.748118,7.748118,1,15,1
4,38,6.436687,6.31896,1,15,0


In [14]:
from sklearn.preprocessing import StandardScaler

# Instantiate the scaler with default settings; it is fitted to the data in the next cell
scaler = StandardScaler()

In [15]:
# dummy_genes_df is deliberately left out of scaling; only columns_scaled is standardized

# Fit the scaler to the selected columns and compute their scaled values in one step
scaled_values = scaler.fit_transform(columns_scaled)

# Wrap the scaled array in a DataFrame that keeps the original row index and column names
data = pd.DataFrame(
    scaled_values,
    index=columns_scaled.index,
    columns=columns_scaled.columns,
)
data.head()


Unnamed: 0,age,survival,timerecurrence,histtype,diam,posnodes
0,-0.192098,1.728368,1.814682,-0.202499,0.28439,-0.637497
1,0.724579,1.585775,1.681393,-0.202499,-0.291161,-0.637497
2,-1.108775,-0.368384,-0.145252,-0.202499,-0.866712,-0.637497
3,1.09125,-0.085305,0.119355,-0.202499,-0.866712,-0.162431
4,-1.108775,-0.421768,-0.223387,-0.202499,-0.866712,-0.637497


In [16]:
# Combine the scaled columns ('data') with the unscaled dummy variables and genes
# column-wise (axis=1) into one modelling frame; 'eventdeath' is still included at this point
data2 = pd.concat([data, dummy_genes_df], axis=1)

data2.head()

Unnamed: 0,age,survival,timerecurrence,histtype,diam,posnodes,eventdeath,chemo,hormonal,amputation,...,Contig36312_RC,Contig38980_RC,NM_000853,NM_000854,NM_000860,Contig29014_RC,Contig46616_RC,NM_000888,NM_000898,AF067420
0,-0.192098,1.728368,1.814682,-0.202499,0.28439,-0.637497,0,0,0,1,...,0.591103,-0.355018,0.373644,-0.76069,-0.164025,-0.038726,0.237856,-0.087631,-0.369153,0.153795
1,0.724579,1.585775,1.681393,-0.202499,-0.291161,-0.637497,0,0,0,0,...,-0.199829,-0.001635,-0.062922,-0.682204,-0.220934,-0.100088,-0.466537,-0.231547,-0.643019,-0.014098
2,-1.108775,-0.368384,-0.145252,-0.202499,-0.866712,-0.637497,0,0,0,0,...,0.328736,-0.047571,0.084228,-0.69595,-0.40284,-0.099965,0.110155,-0.114298,0.258495,-0.198911
3,1.09125,-0.085305,0.119355,-0.202499,-0.866712,-0.162431,0,0,1,0,...,0.648861,-0.039088,0.182182,-0.52464,0.03732,-0.167688,-0.01679,-0.285344,-0.251188,0.86271
4,-1.108775,-0.421768,-0.223387,-0.202499,-0.866712,-0.637497,0,0,0,1,...,-0.287538,-0.286893,0.057082,-0.565021,-0.105632,-0.108148,-0.405853,-0.053601,-0.677072,0.13416


In [17]:
# Explanatory variables X: every column of the combined frame except the response 'eventdeath'.
# (StandardScaler is not used in this cell, so it is not re-imported here.)
# NOTE(review): X still contains 'survival' and 'timerecurrence', which are follow-up
# outcomes — confirm they are meant to be predictors of 'eventdeath'.
X = data2.drop(columns=['eventdeath'])

# Response variable y: the binary 'eventdeath' column of the combined frame
y = data2['eventdeath']

In [18]:
# Change y from a Series to a 1-D NumPy array.
# Series.ravel() is deprecated in recent pandas releases, so to_numpy() is used instead.
# Reading the column from data2 (rather than from y itself) keeps this cell safe to re-run.
y = data2['eventdeath'].to_numpy()

In [19]:
# train_test_split comes from sklearn.model_selection
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing (75/25 split).
# The fixed random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

#### Note:
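The 'eventdeath' column was not scaled because it is a binary (0 or 1) response variable, so X and y were defined after scaling. The train and test sets were also split after scaling because the dataset is not very large (only 272 observations/samples). Note that fitting the scaler on the full dataset lets test-set statistics influence the scaled training features; fitting the scaler on the training split only would avoid this leakage.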

In [20]:
# Create csv file for data2 to use in future analyses.
# index=False leaves the row index out of the file, so only the feature and response columns are written.
data2.to_csv('nki_bc_for_model.csv', index=False)