# Part 2: Feature Engineering
Now that we have completed exploratory data analysis, we are ready to begin feature engineering! Here are the activities we'll be completing as part of this notebook.

- Importing the proper Python libraries and raw dataset
- Using the proper Python libaries to properly engineer new features from the raw dataset
- Saving the "cleansed" feature engineered dataset back into the "/data/clean" directory

In [30]:
# Import the required Python libaries here

import pandas as pd
import matplotlib.pyplot as plt
from category_encoders.one_hot import OneHotEncoder

In [31]:
# Preliminary data cleanup from notebook 01

df = pd.read_csv('../data/raw/titanic-train-raw.csv')
df.drop(columns = ['PassengerId', 'Ticket', 'Cabin', 'Name'], inplace = True) # these are probably not useful
df.head(50)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
5,0,3,male,,0,0,8.4583,Q
6,0,1,male,54.0,0,0,51.8625,S
7,0,3,male,2.0,3,1,21.075,S
8,1,3,female,27.0,0,2,11.1333,S
9,1,2,female,14.0,1,0,30.0708,C


In [32]:
# fill NaN values in Age with the average ages of both genders
df = df.groupby(['Sex'], group_keys=False).apply(lambda x: x.fillna(round(x.mean())))
df.head(25)

  df = df.groupby(['Sex'], group_keys=False).apply(lambda x: x.fillna(round(x.mean())))


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
5,0,3,male,31.0,0,0,8.4583,Q
6,0,1,male,54.0,0,0,51.8625,S
7,0,3,male,2.0,3,1,21.075,S
8,1,3,female,27.0,0,2,11.1333,S
9,1,2,female,14.0,1,0,30.0708,C


In [33]:
sex_ohe_encoder = OneHotEncoder(use_cat_names = True, handle_unknown = 'ignore')
sex_dummies = sex_ohe_encoder.fit_transform(df['Sex'])

# sex_dummies = pd.get_dummies(df['Sex'], drop_first=False)
df = pd.concat([df, sex_dummies], axis = 1).drop('Sex', axis = 1)
#df = df[['Survived', 'Pclass', 'female', 'male', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
df


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_male,Sex_female
0,0,3,22.0,1,0,7.2500,S,1,0
1,1,1,38.0,1,0,71.2833,C,0,1
2,1,3,26.0,0,0,7.9250,S,0,1
3,1,1,35.0,1,0,53.1000,S,0,1
4,0,3,35.0,0,0,8.0500,S,1,0
...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,S,1,0
887,1,1,19.0,0,0,30.0000,S,0,1
888,0,3,28.0,1,2,23.4500,S,0,1
889,1,1,26.0,0,0,30.0000,C,1,0


In [34]:
embarked_ohe_encoder = OneHotEncoder(use_cat_names = True, handle_unknown = 'ignore')
embarked_dummies = embarked_ohe_encoder.fit_transform(df['Embarked'])
df = pd.concat([df, embarked_dummies], axis = 1).drop('Embarked', axis = 1)


# dummy2 = pd.get_dummies(df['Embarked'], drop_first=False, dummy_na = True)
# df = pd.concat([df, dummy2], axis = 1).drop('Embarked', axis = 1)
df.head(65)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Sex_female,Embarked_S,Embarked_C,Embarked_Q,Embarked_nan
0,0,3,22.0,1,0,7.2500,1,0,1,0,0,0
1,1,1,38.0,1,0,71.2833,0,1,0,1,0,0
2,1,3,26.0,0,0,7.9250,0,1,1,0,0,0
3,1,1,35.0,1,0,53.1000,0,1,1,0,0,0
4,0,3,35.0,0,0,8.0500,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
60,0,3,22.0,0,0,7.2292,1,0,0,1,0,0
61,1,1,38.0,0,0,80.0000,0,1,0,0,0,1
62,0,1,45.0,1,0,83.4750,1,0,1,0,0,0
63,0,3,4.0,3,2,27.9000,1,0,1,0,0,0


In [35]:
bin_labels = ['child', 'teen', 'young_adult', 'adult', 'elder']
bin_values = [-1, 12, 19, 30, 60, 100]

age_bins = pd.cut(df['Age'], bins = bin_values, labels = bin_labels)
df_age_bins = pd.DataFrame(age_bins)

age_ohe_encoder = OneHotEncoder(use_cat_names = True, handle_unknown = 'ignore')
age_dummies = age_ohe_encoder.fit_transform(df_age_bins)
df = pd.concat([df, age_dummies], axis = 1).drop('Age', axis = 1)
df

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,Sex_male,Sex_female,Embarked_S,Embarked_C,Embarked_Q,Embarked_nan,Age_child,Age_teen,Age_young_adult,Age_adult,Age_elder
0,0,3,1,0,7.2500,1,0,1,0,0,0,0,0,1,0,0
1,1,1,1,0,71.2833,0,1,0,1,0,0,0,0,0,1,0
2,1,3,0,0,7.9250,0,1,1,0,0,0,0,0,1,0,0
3,1,1,1,0,53.1000,0,1,1,0,0,0,0,0,0,1,0
4,0,3,0,0,8.0500,1,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,0,13.0000,1,0,1,0,0,0,0,0,1,0,0
887,1,1,0,0,30.0000,0,1,1,0,0,0,0,1,0,0,0
888,0,3,1,2,23.4500,0,1,1,0,0,0,0,0,1,0,0
889,1,1,0,0,30.0000,1,0,0,1,0,0,0,0,1,0,0


In [36]:
df.to_csv('../data/clean/new_data.csv', index = False)
