https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python

In [1]:
import re
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

warnings.filterwarnings('ignore')

In [2]:
# Read train.csv (looked up relative to the working directory) into a DataFrame.
df_train = pd.read_csv(filepath_or_buffer='train.csv')

# Show the first five rows.
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# (rows, columns) of the training frame.
df_train.shape

(891, 12)

In [4]:
# Read test.csv (looked up relative to the working directory) into a DataFrame.
df_test = pd.read_csv(filepath_or_buffer='test.csv')

# Show the first five rows.
df_test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# (rows, columns) of the test frame.
df_test.shape

(418, 11)

In [6]:
# Share of each value of Survived in the training frame, rounded to two decimals.
# 0 = No | 1 = Yes
df_train['Survived'].value_counts(normalize=True).round(2)

0    0.62
1    0.38
Name: Survived, dtype: float64

## Feature Engineering

In [7]:
# Column dtypes of the training frame before any feature engineering.
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
# Keep both frames in one mapping so each feature step can loop over train and test alike.
dict_df = dict(key_df_train=df_train, key_df_test=df_test)

In [9]:
def get_title(Name):
    """Extract the honorific (e.g. ``Mr``, ``Miss``) from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period.

    Parameters
    ----------
    Name : str
        Full passenger name, e.g. ``"Braund, Mr. Owen Harris"``.

    Returns
    -------
    str
        The matched title without its trailing period, or ``''`` when the
        name contains no such pattern.
    """
    # Raw string: '\.' inside a plain literal is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', Name)
    if title_search:
        return title_search.group(1)
    return ''

In [10]:
# Derive an integer Title code and the raw name length for both frames.
for df in dict_df.values():
    titles = df['Name'].apply(get_title)
    # Collapse the infrequent honorifics into a single 'Rare' bucket.
    titles = titles.replace(['Lady','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'],'Rare')
    # Fold spelling variants into their common form.
    titles = titles.replace({'Mlle':'Miss','Ms':'Miss','Mme':'Mrs'})
    # Titles outside the mapping (including '' from get_title) would become NaN
    # and turn the column into float; code them as 0 so Title stays an integer.
    df['Title'] = titles.map({'Mr':1,'Miss':2,'Mrs':3,'Master':4,'Rare':5}).fillna(0).astype(int)
    df['Name_length'] = df['Name'].apply(len)

In [11]:
for df in dict_df.values():
    df['Sex'] = df['Sex'].map({'female':0,'male':1} ).astype(int)

In [12]:
# Fill missing ages with random integers drawn from [mean - std, mean + std)
# of the training ages, then store Age as int.
age_avg = df_train['Age'].mean()
age_std = df_train['Age'].std()
# Integer bounds for the draw, made explicit rather than left to implicit conversion.
age_low, age_high = int(age_avg - age_std), int(age_avg + age_std)

# Fixed seed so a fresh Restart & Run All reproduces the same imputed ages.
age_rng = np.random.default_rng(0)

for df in dict_df.values():
    age_missing = df['Age'].isnull()
    age_null_count = age_missing.sum()
    age_null_random_list = age_rng.integers(age_low, age_high, size=age_null_count)
    # .loc assigns into the frame itself; the chained df['Age'][mask] = ... form
    # may write to a temporary copy and leave the NaNs in place.
    df.loc[age_missing, 'Age'] = age_null_random_list
    df['Age'] = df['Age'].astype(int)

In [13]:
# Split the training ages into five equal-width intervals to see where the band edges fall.
age_bands = pd.cut(x=df_train['Age'], bins=5)
df_train['CategoricalAge'] = age_bands

# Passengers per interval, listed in interval order.
df_train['CategoricalAge'].value_counts().sort_index()

(-0.08, 16.0]    114
(16.0, 32.0]     453
(32.0, 48.0]     244
(48.0, 64.0]      69
(64.0, 80.0]      11
Name: CategoricalAge, dtype: int64

In [14]:
# Replace each age by the ordinal code of its band:
# <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4.
for df in dict_df.values():
    df['Age'] = pd.cut(
        df['Age'],
        bins=[-np.inf, 16.0, 32.0, 48.0, 64.0, np.inf],
        labels=False,  # return the integer band index instead of an Interval
    )

In [15]:
# FamilySize counts siblings/spouses + parents/children + the passenger;
# IsAlone is 1 exactly when that total is 1.
for df in dict_df.values():
    family_size = df['SibSp'] + df['Parch'] + 1
    df['FamilySize'] = family_size
    df['IsAlone'] = (family_size == 1).astype(int)

In [16]:
# Fill missing fares in both frames with the median fare of the training frame.
train_fare_median = df_train['Fare'].median()
for df in dict_df.values():
    df['Fare'] = df['Fare'].fillna(train_fare_median)

In [17]:
# Split the training fares into quartiles to see where the band edges fall.
fare_bands = pd.qcut(x=df_train['Fare'], q=4)
df_train['CategoricalFare'] = fare_bands

# Passengers per quartile, listed in interval order.
df_train['CategoricalFare'].value_counts().sort_index()

(-0.001, 7.91]     223
(7.91, 14.454]     224
(14.454, 31.0]     222
(31.0, 512.329]    222
Name: CategoricalFare, dtype: int64

In [18]:
# Replace each fare by the ordinal code of its band:
# <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, >31 -> 3.
for df in dict_df.values():
    fare_codes = pd.cut(
        df['Fare'],
        bins=[-np.inf, 7.91, 14.454, 31.0, np.inf],
        labels=False,  # return the integer band index instead of an Interval
    )
    df['Fare'] = fare_codes.astype(int)

In [19]:
# Has_Cabin is 1 when a cabin value is present and 0 when it is missing.
for df in dict_df.values():
    # notna() tests for missing values directly, instead of inferring them
    # row by row from the Python type of each cell.
    df['Has_Cabin'] = df['Cabin'].notna().astype(int)

In [20]:
# Treat a missing port as 'S', then encode the port as an integer: S -> 0, C -> 1, Q -> 2.
for df in dict_df.values():
    ports = df['Embarked'].fillna('S')
    df['Embarked'] = ports.map({'S':0,'C':1,'Q':2}).astype(int)

In [21]:
# Take the engineered training frame and drop the raw and helper columns
# that are not passed on as features.
df_train = dict_df['key_df_train'].drop(
    columns=['PassengerId','Name','Ticket','SibSp','Cabin','CategoricalFare','CategoricalAge']
)

df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked,Title,Name_length,FamilySize,IsAlone,Has_Cabin
0,0,3,1,1,0,0,0,1,23,2,0,0
1,1,1,0,2,0,3,1,3,51,2,0,1
2,1,3,0,1,0,1,0,2,22,1,1,0
3,1,1,0,2,0,3,0,3,44,2,0,1
4,0,3,1,2,0,1,0,1,24,1,1,0


In [22]:
# Take the engineered test frame, set PassengerId aside, then drop the raw columns.
df_test = dict_df['key_df_test']
PassengerId = df_test['PassengerId']

df_test = df_test.drop(columns=['PassengerId','Name','Ticket','SibSp','Cabin'])

df_test.head()

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,Embarked,Title,Name_length,FamilySize,IsAlone,Has_Cabin
0,3,1,2,0,0,2,1,16,1,1,0
1,3,0,2,0,0,0,3,32,2,0,0
2,2,1,3,0,1,2,1,25,1,1,0
3,3,1,1,0,1,0,1,16,1,1,0
4,3,0,1,1,1,0,3,44,3,0,0


In [26]:
# Row counts of both frames and the cross-validation setup used for out-of-fold predictions.
ntrain = df_train.shape[0]
ntest = df_test.shape[0]
SEED = 0
NFOLDS = 5
# KFold takes no sample count (passing ntrain positionally collided with n_splits);
# shuffle=True is needed for random_state to have any effect.
kf = KFold(n_splits = NFOLDS,shuffle = True,random_state = SEED)

In [27]:
class SklearnHelper(object):
    """Thin wrapper that builds an estimator and exposes a uniform train/predict API.

    Parameters
    ----------
    model : callable
        Estimator class (or factory) called as ``model(**params)``.
    seed : int, default 0
        Value passed to the estimator as ``random_state``.
    params : dict or None, default None
        Keyword arguments for the estimator. ``None`` means no extra arguments.
    """

    def __init__(self,model,seed = 0,params = None):
        # Work on a copy: the caller's dict is left untouched, and the
        # default params=None no longer fails on item assignment.
        params = dict(params) if params else {}
        params['random_state'] = seed
        self.model = model(**params)

    def train(self,X_train,y_train):
        """Fit the wrapped estimator on ``X_train`` / ``y_train``; returns nothing."""
        self.model.fit(X_train,y_train)

    def fit(self,X,y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.model.fit(X,y)

    def predict(self,X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.model.predict(X)

    def feature_importances(self,X,y):
        """Fit on ``X`` / ``y`` and print the fitted estimator's ``feature_importances_``."""
        print(self.model.fit(X,y).feature_importances_)

In [None]:
def get_oof(model,X_train,y_train,X_test,cv = None):
    """Compute out-of-fold predictions for one base model.

    For every fold the model is trained on the fold's training rows, then used
    to predict the held-out rows of ``X_train`` and all rows of ``X_test``.

    Parameters
    ----------
    model : object
        Must provide ``train(X, y)`` and ``predict(X)``.
    X_train, y_train : numpy.ndarray
        Training features and labels, indexed by the fold index arrays.
    X_test : numpy.ndarray
        Test features; predicted once per fold.
    cv : object or None, default None
        Splitter providing ``split(X)`` that yields ``(train_index, test_index)``
        pairs. ``None`` falls back to the notebook-level ``kf``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Column vectors of shape ``(n_train, 1)`` and ``(n_test, 1)``: the
        held-out predictions for the training rows, and the per-fold test
        predictions averaged over folds.
    """
    splitter = kf if cv is None else cv
    # Splitters are not iterable themselves; the folds come from split().
    folds = list(splitter.split(X_train))

    oof_train = np.zeros((X_train.shape[0],))
    oof_test_skf = np.empty((len(folds),X_test.shape[0]))
    for i,(train_index,test_index) in enumerate(folds):
        X_tr = X_train[train_index]
        y_tr = y_train[train_index]
        X_te = X_train[test_index]
        # Train the model that was passed in (the original referenced an undefined `clf`).
        model.train(X_tr,y_tr)
        oof_train[test_index] = model.predict(X_te)
        oof_test_skf[i,:] = model.predict(X_test)
    oof_test = oof_test_skf.mean(axis = 0)
    return oof_train.reshape(-1,1),oof_test.reshape(-1,1)

In [28]:
# Keyword arguments for the random forest base model.
rf_params = dict(
    n_jobs=-1,
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=2,
)

In [None]:
# Wrap a random forest, configured by rf_params and seeded with SEED, as a base model.
rf = SklearnHelper(
    RandomForestClassifier,
    seed=SEED,
    params=rf_params,
)

In [None]:
# Convert the frames to plain arrays: features without the target, the target, and the test features.
y_train = np.array(df_train['Survived'])
X_train = np.array(df_train.drop(columns=['Survived']))
X_test = np.array(df_test)

In [None]:
# Out-of-fold predictions of the random forest for the training and test rows.
rf_oof_train, rf_oof_test = get_oof(
    model=rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
)

In [None]:
# One column per base model, holding its flattened out-of-fold predictions for the training rows.
base_predictions_train = pd.DataFrame({'RandomForest':rf_oof_train.ravel()})

In [None]:
# Second-level features: the base models' out-of-fold predictions, one column per model.
# The one-element tuples need the trailing comma; a bare (arr) is just the array,
# which concatenate would treat as a sequence of rows.
X_train = np.concatenate((rf_oof_train,),axis = 1)

# The test features come from the test-side predictions, not the training ones.
X_test = np.concatenate((rf_oof_test,),axis = 1)

In [None]:
# Fit the second-level gradient-boosted classifier on X_train / y_train, then predict X_test.
gbm = xgb.XGBClassifier(n_estimators=2000, max_depth=4)
gbm.fit(X_train, y_train)
predictions = gbm.predict(X_test)