In [145]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import StratifiedKFold
import string
import warnings
import numpy as np
import pickle
SEED = 42

In [2]:
train = pd.read_csv('data/train.csv', index_col=0)

In [3]:
test = pd.read_csv('data/test.csv', index_col=0)

In [174]:
class DataTransformer():
    
    def fit(self, dataset):
        self.fare_median = dataset['Fare'].median()
        self.age_median = dataset['Age'].median()
        self.embarked_most_frecuent = dataset['Embarked'].mode()[0]
        
        self.apply_inputations(dataset)
            
        self.fare_intervals = pd.qcut(dataset['Fare'], 4, precision=4).unique()
        
        self.age_intervals = pd.cut(dataset['Age'], 5).unique()
            
        self.calculate_new_features(dataset)
            
        self.encoders = self.get_encoders(dataset)
        
        return self
        
    def apply_inputations(self, dataset):
        feature_inputations = self.get_feature_inputations()
        for feature in feature_inputations:
            feature_inputations[feature](dataset)

            
    def calculate_new_features(self, dataset):
        new_features = self.get_new_features()
        for feature in new_features:
            dataset[feature] = new_features[feature](dataset)


    def get_encoders(self, dataset):
        
        dataset.to_pickle('dataset.pkl')

        categorical_variables = ['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin']

        encoders = {}
        for cat in categorical_variables:
            encoder = LabelEncoder().fit(dataset[cat])
            encoders[cat] = encoder

        return encoders
        
        
    def get_feature_inputations(self):
        age_inputation_fn = lambda dataset : dataset['Age'].fillna(self.fare_median, inplace = True)

        embarked_inputation_fn = lambda dataset : dataset['Embarked'].fillna(self.embarked_most_frecuent, inplace = True)

        fare_inputation_fn = lambda dataset : dataset['Fare'].fillna(self.age_median, inplace = True)

        feature_inputation = {}
        feature_inputation['Age'] = age_inputation_fn
        feature_inputation['Embarked'] = embarked_inputation_fn
        feature_inputation['Fare'] = fare_inputation_fn
        
        return feature_inputation
    
    def get_interval(self, value, intervals):
        for interval in intervals:
            if value in interval:
                return interval
        else:
            print(value)
            
    
    def get_new_features(self):
        
        family_size_feature_fn = lambda dataset : dataset ['SibSp'] + dataset['Parch'] + 1

        is_alone_feature_fn = lambda dataset : (dataset.FamilySize == 1).astype(int)
        
        is_married_fn = lambda dataset : (dataset.Title == 'Mrs').astype(int)

        title_feature_fn = lambda dataset : dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

        fare_bin_feature_fn = lambda dataset : dataset.Fare.apply(lambda fare: self.get_interval(fare, self.fare_intervals))

        age_bin_feature_fn = lambda dataset : dataset.Age.apply(lambda age : self.get_interval(age, self.age_intervals))

        new_features = {}
        new_features['FamilySize'] = family_size_feature_fn
        new_features['IsAlone'] = is_alone_feature_fn
        new_features['Title'] = title_feature_fn
        new_features['FareBin'] = fare_bin_feature_fn
        new_features['AgeBin'] = age_bin_feature_fn
        new_features['IsMarried'] = is_married_fn
        
        return new_features
    
    
    def transform(self, df):
        
        dataset = df.copy(deep=True)
        
        self.apply_inputations(dataset)
            
        self.calculate_new_features(dataset)
            
        dataset['Sex_Code'] = self.encoders['Sex'].transform(dataset['Sex'])
        dataset['Embarked_Code'] = self.encoders['Embarked'].transform(dataset['Embarked'])
        dataset['Title_Code'] = self.encoders['Title'].transform(dataset['Title'])
        dataset['AgeBin_Code'] = self.encoders['AgeBin'].transform(dataset['AgeBin'])
        dataset['FareBin_Code'] = self.encoders['FareBin'].transform(dataset['FareBin'])
        
        features = ['Sex','Pclass', 'Embarked', 'Title','SibSp', 'Parch', 'AgeBin_Code', 'FareBin_Code', 'FamilySize', 'IsAlone', 'IsMarried'] 
        
        return pd.get_dummies(dataset[features])
    
    
    def save(self, filename):
        with open(filename, "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, filename):
        with open(filename, "rb") as f:
            return pickle.load(f)
        

In [175]:
transformer = DataTransformer().fit(pd.concat([train,test]))

In [176]:
transformer.save('transformer.pkl')

In [177]:
transformer = DataTransformer.load('transformer.pkl')

In [178]:
X_train = transformer.transform(train)
y_train = train.Survived

In [167]:
model = RandomForestClassifier(criterion='gini',
                                           n_estimators=1750,
                                           max_depth=7,
                                           min_samples_split=6,
                                           min_samples_leaf=6,
                                           max_features='auto',
                                           oob_score=True,
                                           random_state=SEED,
                                           n_jobs=-1,
                                           verbose=1) 

In [168]:
N = 5
oob = 0
probs = pd.DataFrame(np.zeros((len(X_test), N * 2)), columns=['Fold_{}_Prob_{}'.format(i, j) for i in range(1, N + 1) for j in range(2)])
importances = pd.DataFrame(np.zeros((X_train.shape[1], N)), columns=['Fold_{}'.format(i) for i in range(1, N + 1)], index=dataset.columns)
fprs, tprs, scores, acc_scores = [], [], [], []

skf = StratifiedKFold(n_splits=N, random_state=N, shuffle=True)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    print('Fold {}\n'.format(fold))
    
    # Fitting the model
    model.fit(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
    
    # Computing Train AUC score
    trn_fpr, trn_tpr, trn_thresholds = roc_curve(y_train.iloc[trn_idx], model.predict_proba(X_train.iloc[trn_idx])[:, 1])
    trn_auc_score = auc(trn_fpr, trn_tpr)
    # Computing Validation AUC score
    val_fpr, val_tpr, val_thresholds = roc_curve(y_train.iloc[val_idx], model.predict_proba(X_train.iloc[val_idx])[:, 1])
    val_auc_score = auc(val_fpr, val_tpr)  
      
    scores.append(val_auc_score)
    fprs.append(val_fpr)
    tprs.append(val_tpr)
    
    # X_test probabilities
    probs.loc[:, 'Fold_{}_Prob_0'.format(fold)] = model.predict_proba(X_test)[:, 0]
    probs.loc[:, 'Fold_{}_Prob_1'.format(fold)] = model.predict_proba(X_test)[:, 1]
    importances.iloc[:, fold - 1] = model.feature_importances_
    
    acc_scores.append(accuracy_score(y_train.iloc[val_idx], model.predict(X_train.iloc[val_idx])))
        
    oob += model.oob_score_ / N
    print('Fold {} OOB Score: {}\n'.format(fold, model.oob_score_))   
    
print('Average OOB Score: {}'.format(oob))

Fold 1



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    6.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 1750 out of 1750 | elapsed:    0.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent worker

Fold 1 OOB Score: 0.8398876404494382

Fold 2



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    8.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 1750 out of 1750 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent worker

Fold 2 OOB Score: 0.8288920056100981

Fold 3



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    8.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 1750 out of 1750 | elapsed:    0.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent worker

Fold 3 OOB Score: 0.8330995792426368

Fold 4



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    8.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 1750 out of 1750 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent worker

Fold 4 OOB Score: 0.8176718092566619

Fold 5



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    8.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 1750 out of 1750 | elapsed:    1.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent worker

Fold 5 OOB Score: 0.8120617110799438

Average OOB Score: 0.8263225491277558


[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 1750 out of 1750 | elapsed:    0.9s finished


In [169]:
auc_mean = np.mean(scores)

In [170]:
acc_mean = np.mean(acc_scores)

In [171]:
auc_mean

0.8708125341734766

In [172]:
acc_mean

0.8249513527085556