# Feature Engineering

## Table of Contents

1. Split the data into training and test data set
2. Scale numerical features (normalization, standardization)
3. Encode categorical features to numeric ones
4. Save transformed data for model training

In [1]:
## Load the data 
import pandas as pd
data = pd.read_csv('../data/Breast_Cancer.csv')

## 1. Split the data into training and test data set

We split the data into a training and test data set because ...

In [76]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(data.drop(['Status'], axis=1),data['Status'], 
    test_size=0.20,
    random_state=42, 
    stratify=data['Status'])

## 2. Feature Scaling for Numeric 
1. ML algorithms don't work well when numeric features have very different scales
2. Apply either: 
    * Normalization (MinMaxScaler) bounds the values to a specific range (e.g. 0-1)
    * Standardization (StandardScaler) less affected by outliers does not bound to range

In [77]:
# get numeric feature columns and store them in num
num = []
for label, content in data.items():
    if pd.api.types.is_numeric_dtype(content): # check whether column is numerical
        print(label)
        num.append(label)
print(num)

Age
Tumor Size
Regional Node Examined
Reginol Node Positive
Survival Months
['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive', 'Survival Months']


In [78]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_num = X_train[num]
X_test_num = X_test[num]

O = scaler.fit_transform(X_train_num)
P = pd.DataFrame(O, columns=X_train_num.columns, index=X_train_num.index)

Q = scaler.fit_transform(X_test_num)
R = pd.DataFrame(Q, columns=X_test_num.columns, index=X_test_num.index)

## 3. Encode Categorical Features into Numeric One

1. ML algorithms can't interpret non-numeric features 
2. Apply:
    * Label Encoder - categorical features are ordinal i.e. if you are converting severity or ranking
    * One-Hot Encoding - categorical are non-ordinal 

In [79]:
# get categorical feature columns and store them in cat
cat = []
for label, content in X_train.items():
  if not pd.api.types.is_numeric_dtype(content):
    print(label)
    num.append(label)
print(len(num))

Race
Marital Status
T Stage 
N Stage
6th Stage
differentiate
Grade
A Stage
Estrogen Status
Progesterone Status
15


In [80]:
from sklearn.preprocessing import LabelEncoder

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self 

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [81]:
# T Stage, N Stage, 6th Stage, differentiate, Grade, A Stage are ordinal 
ordinal = ['T Stage ', 'N Stage', '6th Stage', 'differentiate', 'Grade', 'A Stage']
X_train_cat_ord = X_train[ordinal]
X_test_cat_ord = X_test[ordinal]

X_train_cat_ord_transformed = MultiColumnLabelEncoder(columns = ordinal).fit_transform(X_train_cat_ord)
X_test_cat_ord_transformed = MultiColumnLabelEncoder(columns = ordinal).fit_transform(X_test_cat_ord)

In [82]:
# Race, Marital Status, Estrogen Status, and Progesterone Status are non-ordinal
non_ordinal = ['Race', 'Marital Status', 'Estrogen Status', 'Progesterone Status']

X_train_cat_non = X_train[non_ordinal]
X_test_cat_non = X_test[non_ordinal]

categories = ['Race','Marital Status']
binaries = ['Estrogen Status','Progesterone Status']

X_train_oneHot, X_test_oneHot = pd.get_dummies(X_train_cat_non, columns = categories), pd.get_dummies(X_test_cat_non, columns = categories)

X_train_oneHot["Estrogen Status"] = X_train_cat_non["Estrogen Status"].apply(lambda x: 1 if x in ["Positive"] else 0)
X_test_oneHot["Estrogen Status"] = X_test_cat_non["Estrogen Status"].apply(lambda x: 1 if x in ["Positive"] else 0)

X_train_oneHot["Progesterone Status"] = X_train_cat_non["Progesterone Status"].apply(lambda x: 1 if x in ["Positive"] else 0)
X_test_oneHot['Progesterone Status'] = X_test_cat_non['Progesterone Status'].apply(lambda x: 1 if x in ["Positive"] else 0)

X_test_oneHot.head()

Unnamed: 0,Estrogen Status,Progesterone Status,Race_Black,Race_Other,Race_White,Marital Status_Divorced,Marital Status_Married,Marital Status_Separated,Marital Status_Single,Marital Status_Widowed
304,1,1,0,0,1,0,1,0,0,0
445,1,0,0,0,1,0,1,0,0,0
3368,1,1,0,0,1,0,1,0,0,0
1383,1,1,0,0,1,0,1,0,0,0
3584,1,1,0,0,1,1,0,0,0,0


## 3.1 Encoding Status
Transform the target label from categorical to numerical

In [83]:
y_train_prepared =  y_train.apply(lambda x: 1 if x in ["Alive"] else 0)
y_test_prepared =  y_test.apply(lambda x: 1 if x in ["Alive"] else 0)

## 4. Save transformed data for model training

In [90]:
X_train_prepared = pd.concat([P, X_train_cat_ord_transformed, X_train_oneHot],axis=1)
X_test_prepared = pd.concat([R, X_test_cat_ord_transformed, X_test_oneHot],axis=1)

In [94]:
#Save the transformed data into data/transform folder

import numpy as np

np.savetxt(config.traintest_path + "X_train_prepared.csv", X_train_prepared_m, delimiter=",")
np.savetxt(config.traintest_path + "X_train_prepared.csv", X_train_prepared, delimiter=",")
np.savetxt(config.traintest_path + "X_train.csv", X_train, delimiter=",")
np.savetxt(config.traintest_path + "X_test.csv", X_test, delimiter=",")
np.savetxt(config.traintest_path + "y_train_prepared.csv", y_train_prepared, delimiter=",")
np.savetxt(config.traintest_path + "y_test_prepared.csv", y_test_prepared, delimiter=",")

ModuleNotFoundError: No module named 'np'