In [9]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



In [10]:
# Read the raw training split, then print its first five rows as plain text.
train_df = pd.read_csv(filepath_or_buffer='../data/train.csv')
print(train_df.head(n=5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [11]:
# Read the raw test split, then print its first five rows as plain text.
test_df = pd.read_csv(filepath_or_buffer='../data/test.csv')
print(test_df.head(n=5))

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  


In [12]:
def transform_features(df):
    """Return a copy of ``df`` with engineered passenger features appended.

    The input frame is not modified; all new columns are added to a copy,
    so calling this on a frame that was displayed earlier does not change it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SibSp``, ``Parch``, ``Sex``, ``Age``,
        ``Fare``, ``Pclass`` and ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus, in this order:
        ``FamilySize``, ``IsAlone``, ``WomanOrChild``, ``IsRich``,
        ``IsFirstClass``, ``CabinKnown``, ``AgeGroup`` (categorical) and
        ``AgeGroup_encoded`` (integer category code).

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    child_max_age = 12
    rich_fare_threshold = 50
    age_bins = [0, 12, 18, 30, 45, 60, 80]
    age_labels = ['Child', 'Teen', 'Young Adult', 'Adult', 'Middle-aged', 'Senior']

    # Work on a copy so the caller's frame is never mutated as a side effect.
    df = df.copy()

    # Base features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    # A missing Age compares False, so such rows count only when Sex is female.
    df['WomanOrChild'] = ((df['Sex'] == 'female') | (df['Age'] <= child_max_age)).astype(int)
    # A missing Fare compares False and therefore yields 0.
    df['IsRich'] = (df['Fare'] > rich_fare_threshold).astype(int)
    df['IsFirstClass'] = (df['Pclass'] == 1).astype(int)
    df['CabinKnown'] = df['Cabin'].notnull().astype(int)

    # Age features. include_lowest=True closes the first bin on the left so
    # Age == 0 is labelled 'Child', consistent with WomanOrChild above.
    df['AgeGroup'] = pd.cut(
        df['Age'], bins=age_bins, labels=age_labels, include_lowest=True
    )

    # Category codes follow label order (Child=0 ... Senior=5); rows whose Age
    # is missing or outside the bins have no category and are encoded as -1.
    df['AgeGroup_encoded'] = df['AgeGroup'].cat.codes

    return df


In [13]:
# Run the same feature engineering over both splits, train first, then test.
train_df, test_df = map(transform_features, (train_df, test_df))


In [14]:
# Preview the first five engineered training rows (rich display).
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone,WomanOrChild,IsRich,IsFirstClass,CabinKnown,AgeGroup,AgeGroup_encoded
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,0,0,0,0,0,Young Adult,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,0,1,1,1,1,Adult,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,1,1,0,0,0,Young Adult,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,0,1,1,1,1,Adult,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,1,0,0,0,0,Adult,3


In [15]:
# Persist the engineered training frame; the row index is not written.
train_df.to_csv(path_or_buf='../data/train_engineered_1.csv', index=False)


In [16]:
# Preview the first five engineered test rows (rich display).
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone,WomanOrChild,IsRich,IsFirstClass,CabinKnown,AgeGroup,AgeGroup_encoded
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1,1,0,0,0,0,Adult,3
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,2,0,1,0,0,0,Middle-aged,4
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1,1,0,0,0,0,Senior,5
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1,1,0,0,0,0,Young Adult,2
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,3,0,1,0,0,0,Young Adult,2


In [None]:
# Persist the engineered test frame; the row index is not written.
test_df.to_csv(path_or_buf='../data/test_engineered_1.csv', index=False)
