In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import StratifiedKFold
import string
import warnings
import numpy as np
import pickle
SEED = 42

In [2]:
train = pd.read_csv('data/train.csv', index_col=0)

In [3]:
test = pd.read_csv('data/test.csv', index_col=0)

In [142]:
class DataTransformer:

    def __init__(self):
        self.fare_median = None
        self.age_median = None
        self.embarked_most_frequent = None
        self.fare_intervals = None
        self.age_intervals = None
        self.encoders = None

    def fit(self, dataset):
        self.fare_median = dataset['Fare'].median()
        self.age_median = dataset['Age'].median()
        self.embarked_most_frequent = dataset['Embarked'].mode()[0]

        self.apply_imputations(dataset)

        self.fare_intervals = pd.qcut(dataset['Fare'], 4, precision=4).unique()

        self.age_intervals = pd.cut(dataset['Age'], 5).unique()

        self.calculate_new_features(dataset)

        self.encoders = self.get_encoders(dataset)
        
        self.apply_label_encoders(dataset)
        
        self.onehot_encoders = self.get_onehot_encoders(dataset)

        return self

    def apply_imputations(self, dataset):
        feature_imputations = self.get_feature_imputations()
        for feature in feature_imputations:
            feature_imputations[feature](dataset)

    def calculate_new_features(self, dataset):
        new_features = self.get_new_features()
        for feature in new_features:
            dataset[feature] = new_features[feature](dataset)

    @staticmethod
    def get_encoders(dataset):

        categorical_variables = ['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin']

        encoders = {}
        for cat in categorical_variables:
            encoder = LabelEncoder().fit(dataset[[cat]])
            encoders[cat] = encoder
 
        return encoders

    
    def get_onehot_encoders(self, dataset):
        categories = ['Sex_Code', 'Embarked_Code', 'Title_Code']
        
        onehot_encoders = {}
        
        for cat in categories:
            encoder = OneHotEncoder(sparse=False).fit(dataset[[cat]])
            onehot_encoders[cat] = encoder
        return onehot_encoders

    def get_feature_imputations(self):
        age_inputation_fn = lambda dataset: dataset['Age'].fillna(self.age_median, inplace=True)

        embarked_inputation_fn = lambda dataset: dataset['Embarked'].fillna(self.embarked_most_frequent, inplace=True)

        fare_inputation_fn = lambda dataset: dataset['Fare'].fillna(self.fare_median, inplace=True)

        feature_inputation = {'Age': age_inputation_fn, 'Embarked': embarked_inputation_fn, 'Fare': fare_inputation_fn}

        return feature_inputation

    @staticmethod
    def get_interval(value, intervals):
        for interval in intervals:
            if value in interval:
                return interval
        else:
            print(value)

    def get_new_features(self):

        family_size_feature_fn = lambda dataset: dataset['SibSp'] + dataset['Parch'] + 1

        is_alone_feature_fn = lambda dataset: (dataset.FamilySize == 1).astype(int)

        is_married_fn = lambda dataset: (dataset.Title == 'Mrs').astype(int)

        title_feature_fn = lambda dataset: dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

        fare_bin_feature_fn = lambda dataset: dataset.Fare.apply(
            lambda fare: self.get_interval(fare, self.fare_intervals))

        age_bin_feature_fn = lambda dataset: dataset.Age.apply(lambda age: self.get_interval(age, self.age_intervals))

        new_features = {'FamilySize': family_size_feature_fn, 'IsAlone': is_alone_feature_fn, 'Title': title_feature_fn,
                        'FareBin': fare_bin_feature_fn, 'AgeBin': age_bin_feature_fn, 'IsMarried': is_married_fn}

        return new_features
    
    def apply_label_encoders(self, dataset):
        dataset['Sex_Code'] = self.encoders['Sex'].transform(dataset['Sex'])
        dataset['Embarked_Code'] = self.encoders['Embarked'].transform(dataset['Embarked'])
        dataset['Title_Code'] = self.encoders['Title'].transform(dataset['Title'])
        dataset['AgeBin_Code'] = self.encoders['AgeBin'].transform(dataset['AgeBin'])
        dataset['FareBin_Code'] = self.encoders['FareBin'].transform(dataset['FareBin'])
        

    def transform(self, df):
        dataset = df.copy(deep=True)

        self.apply_imputations(dataset)

        self.calculate_new_features(dataset)


        self.apply_label_encoders(dataset)
        
        
        onehot_sex = self.onehot_encoders['Sex_Code'].transform(dataset[['Sex_Code']])
        onehot_embarked = self.onehot_encoders['Embarked_Code'].transform(dataset[['Embarked_Code']])
        onehot_title = self.onehot_encoders['Title_Code'].transform(dataset[['Title_Code']])
        
        onehot_features = pd.DataFrame(data=np.concatenate([onehot_sex, onehot_embarked, onehot_title], axis=1), index=dataset.index)
        
        features = ['Pclass', 'SibSp', 'Parch', 'AgeBin_Code', 'FareBin_Code', 'FamilySize','IsAlone', 'IsMarried']

        return dataset[features].join(onehot_features)

    def save(self, filename):
        with open(filename, "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, filename):
        with open(filename, "rb") as f:
            return pickle.load(f)

In [143]:
transformer = DataTransformer().fit(pd.concat([train,test]))

  y = column_or_1d(y, warn=True)


In [144]:
X_train = transformer.transform(train)
y_train = train.Survived

In [146]:
model = RandomForestClassifier(criterion='gini',
                                           n_estimators=1750,
                                           max_depth=7,
                                           min_samples_split=6,
                                           min_samples_leaf=6,
                                           max_features='auto',
                                           oob_score=True,
                                           random_state=SEED,
                                           n_jobs=-1,
                                           verbose=1) 

In [148]:
N = 5
oob = 0
fprs, tprs, scores, acc_scores = [], [], [], []

skf = StratifiedKFold(n_splits=N, random_state=N, shuffle=True)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    print('Fold {}\n'.format(fold))
    
    # Fitting the model
    model.fit(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
    
    # Computing Train AUC score
    trn_fpr, trn_tpr, trn_thresholds = roc_curve(y_train.iloc[trn_idx], model.predict_proba(X_train.iloc[trn_idx])[:, 1])
    trn_auc_score = auc(trn_fpr, trn_tpr)
    # Computing Validation AUC score
    val_fpr, val_tpr, val_thresholds = roc_curve(y_train.iloc[val_idx], model.predict_proba(X_train.iloc[val_idx])[:, 1])
    val_auc_score = auc(val_fpr, val_tpr)  
      
    scores.append(val_auc_score)
    fprs.append(val_fpr)
    tprs.append(val_tpr)
    
    # X_test probabilities
    
    acc_scores.append(accuracy_score(y_train.iloc[val_idx], model.predict(X_train.iloc[val_idx])))
        
    oob += model.oob_score_ / N
    print('Fold {} OOB Score: {}\n'.format(fold, model.oob_score_))   
    
print('Average OOB Score: {}'.format(oob))

Fold 1



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1750 out of 1750 | elapsed:    0.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent worker

Fold 1 OOB Score: 0.8356741573033708

Fold 2



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1750 out of 1750 | elapsed:    0.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent worker

Fold 2 OOB Score: 0.8260869565217391

Fold 3



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1750 out of 1750 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent worker

Fold 3 OOB Score: 0.8387096774193549

Fold 4



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1750 out of 1750 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent worker

Fold 4 OOB Score: 0.8260869565217391

Fold 5



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    2.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1750 out of 1750 | elapsed:    0.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent worker

Fold 5 OOB Score: 0.8092566619915849

Average OOB Score: 0.8271628819515576


[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1750 out of 1750 | elapsed:    0.3s finished


In [149]:
auc_mean = np.mean(scores)

In [150]:
acc_mean = np.mean(acc_scores)

In [151]:
auc_mean

0.8677726536819899

In [152]:
acc_mean

0.8249513527085556