### <font color='purple'> Part 3 of the Titanic Case -- Feature Engineering</font>

Here comes to the most tricky part. As many Kagglers said, Kaggle competition is more like competing in feature engineering and in most cases, applying a right model isn't quite a big deal. Although it's only a startup project, the lessons we can learn in feature engineering is pretty typical.

#### 1. Title in the same field

Maybe people in some title tend to be saved. OMG, I'm just guessing......

In [1]:
import numpy as np
import pandas as pd

In [8]:
# Find a substring in a string; will be used when finding title
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if str.find(big_string, substring) != -1:
            return substring
    return np.nan

In [146]:
from sklearn import preprocessing

# Used for transform features later
# LabelEncoder will label distinct value but will not split into 0-1 columns like one-hot
le = preprocessing.LabelEncoder()
enc = preprocessing.OneHotEncoder()

# All the feature works in a function
def clean_data(df):
    #df.Fare = df.Fare.map(lambda x:np.nan if x==0)
    title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                'Don', 'Jonkheer']
    df['Title'] = df['Name'].map(lambda x:substrings_in_string(x,title_list))
    # Process speical titles into four kinds
    # Miss, Mrs, Mr, Master
    def replace_titles(x):
        title=x['Title']
        if title in ['Mr','Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Master']:
            return 'Master'
        elif title in ['Countess', 'Mme','Mrs']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms','Miss']:
            return 'Miss'
        elif title =='Dr':
            if x['Sex']=='Male':
                return 'Mr'
            else:
                return 'Mrs'
        elif title =='':
            if x['Sex']=='Male':
                return 'Master'
            else:
                return 'Miss'
        else:
            return title
    df['Title'] = df.apply(replace_titles,axis=1) # every column means row by row
    
    # Define family attribute
    df['Family_Size'] = df['SibSp'] + df['Parch']
    df['Family'] = df['SibSp']*df['Parch']
    
    # Transform sex to int value
    df['Gender'] = df['Sex'].map({'female':0, 'male':1})
    
    # Fill null value in age with mean of same title
    df['AgeFill'] = df['Age']
    mean_ages = np.zeros(4)
    mean_ages[0] = np.average(df[df.Title == 'Miss']['Age'].dropna())
    mean_ages[1] = np.average(df[df.Title == 'Mrs']['Age'].dropna())
    mean_ages[2] = np.average(df[df.Title == 'Mr']['Age'].dropna())
    mean_ages[3] = np.average(df[df.Title == 'Master']['Age'].dropna())
    df.loc[(df.Title == 'Miss') & (df.Age.isnull()), 'AgeFill'] = mean_ages[0]
    df.loc[(df.Title == 'Mrs') & (df.Age.isnull()), 'AgeFill'] = mean_ages[1]
    df.loc[(df.Title == 'Mr') & (df.Age.isnull()), 'AgeFill'] = mean_ages[2]
    df.loc[(df.Title == 'Master') & (df.Age.isnull()), 'AgeFill'] = mean_ages[3]
    
    # Set category for different age ranges
    df['AgeCat'] = df['AgeFill']
    df.loc[(df.AgeFill <= 10), 'AgeCat'] = 'child'
    df.loc[(df.AgeFill > 60), 'AgeCat'] = 'aged'
    df.loc[(df.AgeFill > 10) & (df.AgeFill <= 30),'AgeCat'] = 'adult'
    df.loc[(df.AgeFill > 30) & (df.AgeFill <= 60),'AgeCat'] = 'senior'
    
    df.Embarked = df.Embarked.fillna('S')
    
    df.loc[ df.Cabin.isnull()==True,'Cabin'] = 0
    df.loc[ df.Cabin.isnull()==False,'Cabin'] = 1
    df.loc[(df.Fare.isnull()),'Fare'] = df.loc[(df.Pclass == 3),'Fare'].mean()
    # Calculate the fare by family size the larger the family size, the higher the fare
    df['Fare_Per_Person']=df['Fare']/(df['Family_Size']+1)
    
    # Combine features
    df['AgeClass']=df['AgeFill']*df['Pclass']
    df['ClassFare']=df['Pclass']*df['Fare_Per_Person']
    
    df['HighLow']='Low'
    #df.Fare_Per_Person[df.Fare_Per_Person.isnull()] = 
    df.loc[ (df.Fare_Per_Person<8),'HighLow'] = 'Low'
    df.loc[ (df.Fare_Per_Person>=8) ,'HighLow'] = 'High'
    print(df.HighLow.value_counts())
    
    dummies_Title = pd.get_dummies(df.Title, prefix='Title')
    dummies_AgeCat = pd.get_dummies(df.AgeCat, prefix='AgeCat')
    dummies_Embarked = pd.get_dummies(df.Embarked, prefix='Embarked')
    le.fit(df['HighLow'])
    x_hl = le.transform(df['HighLow'])
    df['HighLow'] = x_hl.astype(np.int32)
    
    df = pd.concat([df, dummies_Title, dummies_AgeCat,dummies_Embarked], axis=1)
    df.drop(['PassengerId', 'Name', 'Age', 'Ticket', 'Title', 'AgeCat', 'AgeFill', 'Embarked',
            'Sex','Fare'],axis=1,inplace=True)
    
    return df
    
    

In [147]:
train_data = pd.read_csv("train.csv")
train_df = clean_data(train_data)

High    504
Low     387
Name: HighLow, dtype: int64


In [41]:
train_df.head(10)

Unnamed: 0,Survived,Pclass,SibSp,Parch,Cabin,Family_Size,Family,Gender,Fare_Per_Person,AgeClass,...,Title_Miss,Title_Mr,Title_Mrs,AgeCat_adult,AgeCat_aged,AgeCat_child,AgeCat_senior,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,0,1,1,0,1,3.625,66.0,...,0,1,0,1,0,0,0,0,0,1
1,1,1,1,0,1,1,0,0,35.64165,38.0,...,0,0,1,0,0,0,1,1,0,0
2,1,3,0,0,1,0,0,0,7.925,78.0,...,1,0,0,1,0,0,0,0,0,1
3,1,1,1,0,1,1,0,0,26.55,35.0,...,0,0,1,0,0,0,1,0,0,1
4,0,3,0,0,1,0,0,1,8.05,105.0,...,0,1,0,0,0,0,1,0,0,1
5,0,3,0,0,1,0,0,1,8.4583,98.675971,...,0,1,0,0,0,0,1,0,1,0
6,0,1,0,0,1,0,0,1,51.8625,54.0,...,0,1,0,0,0,0,1,0,0,1
7,0,3,3,1,1,4,3,1,4.215,6.0,...,0,0,0,0,0,1,0,0,0,1
8,1,3,0,2,1,2,0,0,3.7111,81.0,...,0,0,1,1,0,0,0,0,0,1
9,1,2,1,0,1,1,0,0,15.0354,28.0,...,0,0,1,1,0,0,0,1,0,0


In [42]:
train_df.columns

Index(['Survived', 'Pclass', 'SibSp', 'Parch', 'Cabin', 'Family_Size',
       'Family', 'Gender', 'Fare_Per_Person', 'AgeClass', 'ClassFare',
       'HighLow', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs',
       'AgeCat_adult', 'AgeCat_aged', 'AgeCat_child', 'AgeCat_senior',
       'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [84]:
from sklearn.cross_validation import train_test_split
filt_regex = "Survived|Pclass|SibSp|Parch|Cabin|Family|Family_Size|Gender|Fare_Per_Person|AgeClass|ClassFare|\
              HigLow|Title_.*|AgeCat_.*|Embarked_.*"
df_array = train_df.filter(regex=filt_regex).as_matrix()
y = list(df_array[:,0])
X = list(df_array[:,1:])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [67]:
clf.get_params()

{'bootstrap': False,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 5,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 1,
 'min_samples_split': 1,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [89]:
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [157]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from operator import itemgetter
from sklearn.cross_validation import cross_val_score

clf=RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=5, min_samples_split=2,
  min_samples_leaf=1, max_features='auto',bootstrap=False, oob_score=False, n_jobs=1, random_state=1,
  verbose=0)
#param_grid = dict()
param_grid = dict({'max_depth':[2,4,6], 'bootstrap':[True, False], 'min_samples_split':[2,4,6]} )
pipeline=Pipeline([ ('clf',clf) ])
grid_search = GridSearchCV(clf, param_grid=param_grid, verbose=3,scoring='accuracy',\
cv=StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.2, train_size=None,  \
random_state=1)).fit(X_train, y_train)
print("Best score: %0.3f" % grid_search.best_score_)
print(grid_search.best_estimator_)
report(grid_search.grid_scores_)

print('-----grid search end------------')
print ('on all train set')
scores = cross_val_score(grid_search.best_estimator_, X_train, y_train,cv=3,scoring='accuracy')
print(scores.mean(),scores)
print ('on test set')
scores = cross_val_score(grid_search.best_estimator_, X_test, y_test,cv=3,scoring='accuracy')
print(scores.mean(),scores)

# 对结果打分

print(classification_report(y_train, grid_search.best_estimator_.predict(X_train) ))
print('test data')
print(classification_report(y_test, grid_search.best_estimator_.predict(X_test) ))


Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] bootstrap=True, max_depth=2, min_samples_split=2 ................
[CV]  bootstrap=True, max_depth=2, min_samples_split=2, score=0.797203 -   0.8s
[CV] bootstrap=True, max_depth=2, min_samples_split=2 ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV]  bootstrap=True, max_depth=2, min_samples_split=2, score=0.790210 -   0.7s
[CV] bootstrap=True, max_depth=2, min_samples_split=2 ................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.5s remaining:    0.0s


[CV]  bootstrap=True, max_depth=2, min_samples_split=2, score=0.818182 -   0.6s
[CV] bootstrap=True, max_depth=2, min_samples_split=2 ................
[CV]  bootstrap=True, max_depth=2, min_samples_split=2, score=0.825175 -   0.6s
[CV] bootstrap=True, max_depth=2, min_samples_split=2 ................
[CV]  bootstrap=True, max_depth=2, min_samples_split=2, score=0.853147 -   0.6s
[CV] bootstrap=True, max_depth=2, min_samples_split=4 ................
[CV]  bootstrap=True, max_depth=2, min_samples_split=4, score=0.797203 -   0.6s
[CV] bootstrap=True, max_depth=2, min_samples_split=4 ................
[CV]  bootstrap=True, max_depth=2, min_samples_split=4, score=0.790210 -   0.7s
[CV] bootstrap=True, max_depth=2, min_samples_split=4 ................
[CV]  bootstrap=True, max_depth=2, min_samples_split=4, score=0.818182 -   0.6s
[CV] bootstrap=True, max_depth=2, min_samples_split=4 ................
[CV]  bootstrap=True, max_depth=2, min_samples_split=4, score=0.825175 -   0.6s
[CV] bootstrap

[CV]  bootstrap=False, max_depth=2, min_samples_split=6, score=0.818182 -   0.5s
[CV] bootstrap=False, max_depth=2, min_samples_split=6 ...............
[CV]  bootstrap=False, max_depth=2, min_samples_split=6, score=0.825175 -   0.6s
[CV] bootstrap=False, max_depth=2, min_samples_split=6 ...............
[CV]  bootstrap=False, max_depth=2, min_samples_split=6, score=0.853147 -   0.6s
[CV] bootstrap=False, max_depth=4, min_samples_split=2 ...............
[CV]  bootstrap=False, max_depth=4, min_samples_split=2, score=0.804196 -   0.6s
[CV] bootstrap=False, max_depth=4, min_samples_split=2 ...............
[CV]  bootstrap=False, max_depth=4, min_samples_split=2, score=0.818182 -   0.6s
[CV] bootstrap=False, max_depth=4, min_samples_split=2 ...............
[CV]  bootstrap=False, max_depth=4, min_samples_split=2, score=0.846154 -   0.6s
[CV] bootstrap=False, max_depth=4, min_samples_split=2 ...............
[CV]  bootstrap=False, max_depth=4, min_samples_split=2, score=0.832168 -   0.6s
[CV] bo

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 25.6min finished


Best score: 0.841
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=1,
            verbose=0, warm_start=False)
Model with rank: 1
Mean validation score: 0.841 (std: 0.027)
Parameters: {'bootstrap': True, 'max_depth': 6, 'min_samples_split': 2}

Model with rank: 2
Mean validation score: 0.841 (std: 0.027)
Parameters: {'bootstrap': True, 'max_depth': 6, 'min_samples_split': 4}

Model with rank: 3
Mean validation score: 0.841 (std: 0.027)
Parameters: {'bootstrap': True, 'max_depth': 6, 'min_samples_split': 6}

-----grid search end------------
on all train set
0.830081185016 [ 0.80252101  0.8487395   0.83898305]
on test set
0.743076780587 [ 0.73770492  0.76271186  0.72881356]
             precision    recal

In [158]:
test_data = pd.read_csv('test.csv')
test_df = clean_data(test_data)

High    234
Low     184
Name: HighLow, dtype: int64


In [123]:
test_df.head()

Unnamed: 0,Pclass,SibSp,Parch,Cabin,Family_Size,Family,Gender,Fare_Per_Person,AgeClass,ClassFare,...,Title_Miss,Title_Mr,Title_Mrs,AgeCat_adult,AgeCat_aged,AgeCat_child,AgeCat_senior,Embarked_C,Embarked_Q,Embarked_S
0,3,0,0,1,0,0,1,7.8292,103.5,23.4876,...,0,1,0,0,0,0,1,0,1,0
1,3,1,0,1,1,0,0,3.5,141.0,10.5,...,0,0,1,0,0,0,1,0,0,1
2,2,0,0,1,0,0,1,9.6875,124.0,19.375,...,0,1,0,0,1,0,0,0,1,0
3,3,0,0,1,0,0,1,8.6625,81.0,25.9875,...,0,1,0,1,0,0,0,0,0,1
4,3,1,1,1,2,1,0,4.095833,66.0,12.2875,...,0,0,1,1,0,0,0,0,0,1


In [161]:
test_array = test_df.filter(regex=filt_regex).as_matrix()
a = grid_search.best_estimator_.predict(test_array)


In [162]:
a

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0,

In [163]:
result_df = pd.DataFrame({'PassengerId':test_data.PassengerId, 'Survived':a})
result_df.to_csv("logistic_regression_predictions3.csv", index=False)

In [164]:
result_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
