Note: This is the refined-version of `feature_engineering_detailed.ipynb`, for more detailed analysis of data, check out the file.

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from feature_creation import cate_colName, Group_Statistics, Target_Encode
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import OrdinalEncoder

data_train = pd.read_csv('../../data/train.csv')
data_test = pd.read_csv('../../data/test.csv')

In [2]:
category_cols = ['Pclass', 'Sex', 'SibSp', 'Parch',
                'Embarked', 'Cabin']

numeric_cols = ['Age', 'Fare']

target = 'Survived'

discard_cols = ['PassengerId', 'Ticket', 'Name']

assert len(category_cols) + len(numeric_cols) + len(discard_cols) + 1 == data_train.shape[1]

In [3]:
if 'Cabin' in category_cols: category_cols.remove('Cabin')
discard_cols.append('Cabin')

data_train['Sex'].replace(to_replace='female', value=0, inplace=True)
data_train['Sex'].replace(to_replace='male',  value=1, inplace=True)

data_test['Sex'].replace(to_replace='female', value=0, inplace=True)
data_test['Sex'].replace(to_replace='male',  value=1, inplace=True)

data_train['Embarked'].replace(to_replace='C', value=0, inplace=True)
data_train['Embarked'].replace(to_replace='Q',  value=1, inplace=True)
data_train['Embarked'].replace(to_replace='S',  value=2, inplace=True)

data_test['Embarked'].replace(to_replace='C', value=0, inplace=True)
data_test['Embarked'].replace(to_replace='Q',  value=1, inplace=True)
data_test['Embarked'].replace(to_replace='S',  value=2, inplace=True)

features_train = data_train.drop(columns=discard_cols + [target]).copy()
features_test = data_test.drop(columns=discard_cols).copy()
labels = data_train[target].copy()

In [4]:
features_train['Age'] = pd.cut(np.array(features_train['Age']), bins=5).codes
intervals = pd.cut(np.array(features_train['Age']), bins=5).categories
features_test['Age'] = pd.cut(np.array(features_test['Age']), bins=intervals).codes

dummy_features_train = features_train.copy()
dummy_features_test = features_train.copy()

# fill missing value on training data
dummy_features_train.sort_values('Pclass', ascending=False, inplace=True)

for i in range(dummy_features_train.shape[0]):
    if i == 0 and dummy_features_train['Age'][i] == -1:
        for j in range(i, dummy_features_train.shape[0]):
            if dummy_features_train['Age'][j] != -1:
                dummy_features_train['Age'][i] = dummy_features_train['Age'][j]
                break
    if dummy_features_train['Age'][i] == -1:
        for j in range(i, -1, -1):
            if dummy_features_train['Age'][j] != -1:
                dummy_features_train['Age'][i] = dummy_features_train['Age'][j]
                break

# apply training filling to test sample(one by one)
for i in range(features_test.shape[0]):
    if dummy_features_test['Age'][i] == -1:
        approx_key = dummy_features_test['Pclass'][i]   # get its pclass as "approximate key"
        dummy_features_test['Age'][i] = dummy_features_train['Pclass'][dummy_features_train.shape[0]-1]
        for j in range(features_train.shape[0]):        # and fill 'Age' in test set with data from training set
            if dummy_features_train['Pclass'][j] == approx_key:
                dummy_features_test['Age'][i] = dummy_features_train['Pclass'][j]                   

# change training feature order back
dummy_features_train_reset = dummy_features_train.reset_index()
dummy_features_train_reset.sort_values('index', ascending=True, inplace=True)
dummy_features_train_reset = dummy_features_train_reset.drop(columns='index')
dummy_features_train = dummy_features_train_reset.reset_index().drop(columns='index')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dummy_features_train['Age'][i] = dummy_features_train['Age'][j]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dummy_features_test['Age'][i] = dummy_features_train['Pclass'][dummy_features_train.shape[0]-1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dummy_features_test['Age'][i] = dummy_features_train['Pclass'][j]


In [5]:
# training set: fill all rest empties
for j in range(dummy_features_train.shape[1]):
    for i in range(dummy_features_train.shape[0]):
        if np.isnan(dummy_features_train.iloc[i, j]):
            dummy_features_train.iloc[i, j] = dummy_features_train.iloc[:, j].mode()

# test set: fill all rest empties with training set
for j in range(dummy_features_test.shape[1]):
    for i in range(dummy_features_test.shape[0]):
        if np.isnan(dummy_features_test.iloc[i, j]):
            dummy_features_test.iloc[i, j] = dummy_features_train.iloc[:, j].mode()

In [6]:
dummy_features_train.to_csv('../../data/X_train_process_checkpoint0.csv', index=False)
dummy_features_test.to_csv('../../data/X_test_process_checkpoint0.csv', index=False)

In [7]:
features_train = pd.read_csv("../../data/X_train_process_checkpoint0.csv")
features_test = pd.read_csv("../../data/X_test_process_checkpoint0.csv")

In [8]:
colNames = list(features_train.columns)    
colNames.remove("Fare")     # Fare is continuous, remove it

colNames_new_l = []
features_train_new_l = []   # new list
features_test_new_l = []   # new list

for col_index, col_name in enumerate(colNames):
    for col_sub_index in range(col_index+1, len(colNames)):
        newNames = col_name + '&' + colNames[col_sub_index]

for col_index, col_name in enumerate(colNames):
    for col_sub_index in range(col_index+1, len(colNames)):
        newNames = col_name + '&' + colNames[col_sub_index]
        colNames_new_l.append(newNames)
        newDF_train = pd.Series(features_train[col_name].astype('str') 
                          + '&'
                          + features_train[colNames[col_sub_index]].astype('str'), 
                          name=col_name)
        newDF_test = pd.Series(features_test[col_name].astype('str') 
                          + '&'
                          + features_test[colNames[col_sub_index]].astype('str'), 
                          name=col_name)
        features_train_new_l.append(newDF_train)
        features_test_new_l.append(newDF_test)

features_train_new = pd.concat(features_train_new_l, axis=1)
features_test_new = pd.concat(features_test_new_l, axis=1)
features_train_new.columns = colNames_new_l
features_test_new.columns = colNames_new_l

enc = preprocessing.OneHotEncoder()
enc.fit_transform(features_train_new)

cate_colName(enc, colNames_new_l, drop=None)
features_train_new_af = pd.DataFrame(enc.fit_transform(features_train_new).toarray(), 
                               columns = cate_colName(enc, colNames_new_l, drop=None))
features_test_new_af = pd.DataFrame(enc.fit_transform(features_test_new).toarray(), 
                               columns = cate_colName(enc, colNames_new_l, drop=None))
features_train_temp = pd.concat([features_train, features_train_new_af], axis=1)
features_test_temp = pd.concat([features_test, features_test_new_af], axis=1)

# only train set goes to filtering to obtain column names
sel = VarianceThreshold()
sel.fit(features_train_temp)
CrossComb_cols = features_train_temp.columns[sel.variances_ > 0.01 * 0.99]

chi2_p = chi2(features_train_temp[CrossComb_cols], labels)[1]
chi2_CrossComb_cols = []
for pValue, colname in zip(chi2_p, CrossComb_cols):
    if pValue < 0.01:
        chi2_CrossComb_cols.append(colname)

MI = mutual_info_classif(features_train_temp[CrossComb_cols], labels, discrete_features=True, random_state=22)
MI_threshold = MI.mean() * 0.1
MI_CrossComb_cols = []
for MIvalue, colname in zip(MI, CrossComb_cols):
    if MIvalue > MI_threshold:
        MI_CrossComb_cols.append(colname)

CrossComb_cols_select = list(set(chi2_CrossComb_cols) & set(MI_CrossComb_cols))

# apply selected column names to test data
features_train_temp_cb = features_train_temp[CrossComb_cols_select]
features_test_temp_cb = features_test_temp[CrossComb_cols_select]



In [9]:
features_train_temp_cb.to_csv('../../data/X_train_process_checkpoint1.csv', index=False)
features_test_temp_cb.to_csv('../../data/X_test_process_checkpoint1.csv', index=False)

In [10]:
ord_enc = OrdinalEncoder()
ord_enc.fit(features_train[category_cols])

X_train_OE = pd.DataFrame(ord_enc.transform(features_train[category_cols]), columns=category_cols)
X_train_OE.index = features_train.index
X_train_OE = pd.concat([X_train_OE, features_train[numeric_cols]], axis=1)

X_test_OE = pd.DataFrame(ord_enc.transform(features_test[category_cols]), columns=category_cols)
X_test_OE.index = features_test.index
X_test_OE = pd.concat([X_test_OE, features_test[numeric_cols]], axis=1)

# obtain key cols on training set
chi2_p = chi2(X_train_OE[category_cols], labels)[1]
chi2_select_cols = []
for pValue, colname in zip(chi2_p, category_cols):
    if pValue < 0.01: chi2_select_cols.append(colname)

MI = mutual_info_classif(X_train_OE[category_cols], labels, discrete_features=True, random_state=22)
MI_select_cols = []
MI_threshold = MI.mean() * 0.1
for MIvalue, colname in zip(MI, category_cols):
    if MIvalue > MI_threshold: MI_select_cols.append(colname)

keycols = list(set(chi2_select_cols) & set(MI_select_cols))
cat_rest = []   # categorical variables that are not keyCols
for col in category_cols:
    if col not in keycols: cat_rest.append(col)

col_temp = keycols.copy()
GroupStat_train = pd.DataFrame()
GroupStat_test = pd.DataFrame()

# apply key cols to test set
for i in range(len(col_temp)):
    keyCol = col_temp.pop(i)
    gp_features_train_new, gp_features_test_new, gp_colNames_train_new, gp_colNames_test_new = \
        Group_Statistics(keyCol, X_train_OE, X_test_OE, numeric_cols, col_temp+cat_rest)
    
    GroupStat_train = pd.concat([GroupStat_train, gp_features_train_new], axis=1)
    GroupStat_test = pd.concat([GroupStat_test, gp_features_test_new], axis=1)

    col_temp = keycols.copy()

# obtain GroupStat_cols_select on training set
sel = VarianceThreshold()
sel.fit(GroupStat_train)

GroupStat_cols = list(GroupStat_train.columns[sel.variances_ > 0])
temp_df_train = GroupStat_train[GroupStat_cols]
temp_df_test = GroupStat_test[GroupStat_cols]

for j in range(temp_df_train.shape[1]):
    for i in range(temp_df_train.shape[0]):
        if np.isnan(temp_df_train.iloc[i, j]):
            temp_df_train.iloc[i, j] = temp_df_train.iloc[:, j].mode()

for j in range(temp_df_test.shape[1]):
    for i in range(temp_df_test.shape[0]):
        if np.isnan(temp_df_test.iloc[i, j]):
            temp_df_test.iloc[i, j] = temp_df_train.iloc[:, j].mode()

f_classif_p = f_classif(temp_df_train, labels)[1]
f_classif_GroupStat_cols = []
for pValue, colname in zip(f_classif_p, GroupStat_cols):
    if pValue < 0.01:
        f_classif_GroupStat_cols.append(colname)
        
MI = mutual_info_classif(temp_df_train, labels, random_state=22)
MI_threshold = MI.mean() * 0.1
MI_GroupStat_cols = []
for MIvalue, colname in zip(MI, GroupStat_cols):
    if MIvalue > MI_threshold:
        MI_GroupStat_cols.append(colname)


GroupStat_cols_select = list(set(f_classif_GroupStat_cols) & set(MI_GroupStat_cols))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df_test.iloc[i, j] = temp_df_train.iloc[:, j].mode()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df_test.iloc[i, j] = temp_df_train.iloc[:, j].mode()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df_test.iloc[i, j] = temp_df_train.iloc[:, j].mode()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

In [11]:
temp_df_train[GroupStat_cols_select].to_csv('../../data/X_train_process_checkpoint2.csv', index=False)
temp_df_test[GroupStat_cols_select].to_csv('../../data/X_test_process_checkpoint2.csv', index=False)

In [12]:
X_test = features_test

X_test_OE = pd.DataFrame(ord_enc.transform(X_test[category_cols]), columns=category_cols)
X_test_OE.index = X_test.index
X_test_OE = pd.concat([X_test_OE, X_test[numeric_cols]], axis=1)

In [13]:
col_cat = [target]

col_temp = category_cols.copy()
TarEnc_train = pd.DataFrame()
TarEnc_test = pd.DataFrame()

for keyCol in col_temp:
    features_train1, features_test1, colNames_train_new, colNames_test_new = Target_Encode(keyCol, 
                                                                                           X_train_OE, 
                                                                                           labels,
                                                                                           X_test_OE, 
                                                                                           col_cat=col_cat, 
                                                                                           extension=True)
    
    TarEnc_train = pd.concat([TarEnc_train, features_train1],axis=1)
    TarEnc_test = pd.concat([TarEnc_test, features_test1],axis=1)
    
    col_temp = category_cols.copy()

sel = VarianceThreshold()
sel.fit(TarEnc_train)
TarEnc_cols = list(TarEnc_train.columns[sel.variances_ > 0])
TarEnc_train_temp = TarEnc_train[TarEnc_cols]
TarEnc_test_temp = TarEnc_test[TarEnc_cols]

for j in range(TarEnc_train_temp.shape[1]):
    for i in range(TarEnc_train_temp.shape[0]):
        if np.isnan(TarEnc_train_temp.iloc[i, j]):
            TarEnc_train_temp.iloc[i, j] = temp_df_train.iloc[:, j].mean()

for j in range(TarEnc_test_temp.shape[1]):
    for i in range(TarEnc_test_temp.shape[0]):
        if np.isnan(TarEnc_test_temp.iloc[i, j]):
            TarEnc_test_temp.iloc[i, j] = temp_df_train.iloc[:, j].mean()
            
f_classif_p = f_classif(TarEnc_train_temp, labels)[1]
f_classif_TarEnc_cols = []
for pValue, colname in zip(f_classif_p, TarEnc_cols):
    if pValue < 0.01:
        f_classif_TarEnc_cols.append(colname)
        
MI = mutual_info_classif(TarEnc_train_temp, labels, random_state=22)
MI_threshold = MI.mean() * 0.01
MI_TarEnc_cols = []
for MIvalue, colname in zip(MI, TarEnc_cols):
    if MIvalue > MI_threshold: MI_TarEnc_cols.append(colname)

TarEnc_cols_select = list(set(f_classif_TarEnc_cols) & set(MI_TarEnc_cols))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TarEnc_train_temp.iloc[i, j] = temp_df_train.iloc[:, j].mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TarEnc_test_temp.iloc[i, j] = temp_df_train.iloc[:, j].mean()


In [14]:
TarEnc_train[TarEnc_cols_select].to_csv("../../data/X_train_process_checkpoint3.csv", index = False)
TarEnc_test[TarEnc_cols_select].to_csv("../../data/X_test_process_checkpoint3.csv", index = False)

In [15]:
# target encoding might lead to overfitting
# so if it happens, turn it off 
target_encoding_flag = True

labels = pd.read_csv("../../data/train.csv")["Survived"]
df0 = pd.read_csv("../../data/X_train_process_checkpoint0.csv")
df1 = pd.read_csv("../../data/X_train_process_checkpoint1.csv")
df2 = pd.read_csv("../../data/X_train_process_checkpoint2.csv")
if(target_encoding_flag):
    df3 = pd.read_csv("../../data/X_train_process_checkpoint3.csv")
    features_final = pd.concat([labels, df0, df1, df2, df3], axis = 1)
else:
    features_final = pd.concat([labels, df0, df1, df2], axis = 1)

features_final.to_csv("../../data/train_new.csv", index=False)

df0 = pd.read_csv("../../data/X_test_process_checkpoint0.csv")
df1 = pd.read_csv("../../data/X_test_process_checkpoint1.csv")
df2 = pd.read_csv("../../data/X_test_process_checkpoint2.csv")
if target_encoding_flag is True:
    df3 = pd.read_csv("../../data/X_test_process_checkpoint3.csv")
    features_final = pd.concat([df0, df1, df2, df3], axis = 1)
else:
    features_final = pd.concat([df0, df1, df2], axis = 1)

features_final.to_csv("../../data/test_new.csv", index=False)