In [1]:
import pandas as pd 
import numpy as np 
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for axis labels and tick labels, applied in a single rcParams update.
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

from src.features.build_features import load_data
from src.features.build_features import add_rel_features

# Load Data 

In [2]:
# Read the training set through the project's `load_data` helper, then preview its first rows.
train_data = load_data("train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Transform data

## Add new columns

In [None]:
# Relatives on board
# SibSp and Parch are the two columns handed to the project's `add_rel_features` helper.
# NOTE(review): the return value is discarded, so this presumably updates
# `train_data` in place — confirm against the helper's implementation.
relatives = ['SibSp' ,'Parch']
add_rel_features(train_data,relatives)

In [None]:
# AgeBucket
# Floor each Age to the lower edge of its 15-wide bucket (0, 15, 30, ...),
# then show the mean of Survived within each bucket.
train_data["AgeBucket"] = train_data["Age"].floordiv(15).mul(15)
train_data.groupby("AgeBucket")[["Survived"]].mean()

In [12]:
def add_AgeBucket_feature(df, column_name='Age', bin_size=15, add=True):
    '''Add an ``AgeBucket`` column that bins a numeric column into fixed-width buckets.

    Each value is floored to the lower edge of its bucket, i.e.
    ``value // bin_size * bin_size`` (with the default ``bin_size=15``:
    22 -> 15, 38 -> 30, 5 -> 0).

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        dataframe to modify; the new column is written to it in place
    column_name : string (optional)
        column holding the values to bin
    bin_size : int (optional)
        width of each bucket (not the number of buckets)
    add : boolean (optional)
        trigger for feature; when false ``df`` is returned untouched

    Returns
    -------
    df : pandas.core.frame.DataFrame
        the same dataframe object, with ``AgeBucket`` added when ``add`` is true.
    '''
    # Resolve the logger locally: no `logger` is defined in the visible notebook
    # cells, so relying on a global would raise NameError.
    import logging
    logger = logging.getLogger(__name__)

    logger.debug('Adding Age column')
    if add:
        df['AgeBucket'] = df[column_name] // bin_size * bin_size
    # Always hand the frame back so callers can chain the call even when add is false.
    return df

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
# Numeric column names; presumably the inputs meant for num_pipeline, although
# the visible cells never pass them to it — confirm.
num_attribs = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]

# One-step pipeline whose only stage is a median-strategy SimpleImputer.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ]
)

In [7]:
# Preview the frame again; the output recorded for this cell includes a RelativeOnboard column.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,RelativeOnboard
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [5]:
# Scratch check: sum SibSp and Parch per row, reshape the values into a single
# row, and display the resulting shape.
train_data[['SibSp' ,'Parch']].sum(axis=1).values.reshape(1,-1).shape

(1, 891)

In [6]:
# NOTE(review): this cell repeats the earlier imputer-pipeline cell (same imports,
# `num_attribs`, and `num_pipeline`); consider keeping only one copy.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
num_attribs = ["Age", "SibSp", "Parch", "Fare"]

# One-step pipeline whose only stage is a median-strategy SimpleImputer.
num_pipeline = Pipeline([ ("imputer", SimpleImputer(strategy="median")) ])