In [152]:
!kaggle competitions download -c titanic

titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [153]:
from zipfile import ZipFile
with ZipFile('titanic.zip', 'r') as zipObj:
    zipObj.extractall()

In [275]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
gender_submission = pd.read_csv('gender_submission.csv')

In [276]:
class Explorer:
    def __init__(self):
        pass
    
    @staticmethod
    def describe(df):
        nunique, nunique.name = df.nunique(), 'nunique'
        dtypes, dtypes.name = df.dtypes, 'dtypes'
        isna, isna.name = df.isna().sum(), 'isna'
        length = pd.concat([train[col].apply(lambda x: len(str(x))) for col in train.columns],
                           axis=1)
        max_length, max_length.name = length.max(), 'max_length'
        min_length, min_length.name = length.min(), 'min_length'
        return pd.concat([nunique, dtypes, isna, min_length, max_length], axis=1, sort=True)
    
    def show_dist(self, df, cat, num, agg):
        df.groupby(cat)[num].agg(agg)

In [277]:
train.groupby('Sex')['Age'].agg('mean')

Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64

In [278]:
explorer = Explorer()

In [279]:
explorer.describe(train)

Unnamed: 0,nunique,dtypes,isna,min_length,max_length
Age,88,float64,177,3,4
Cabin,147,object,687,1,15
Embarked,3,object,2,1,3
Fare,248,float64,0,3,8
Name,891,object,0,12,82
Parch,7,int64,0,1,1
PassengerId,891,int64,0,1,3
Pclass,3,int64,0,1,1
Sex,2,object,0,4,6
SibSp,7,int64,0,1,1


In [280]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [281]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.preprocessing import OneHotEncoder
from copy import deepcopy
from pdb import set_trace as breakpoint

class Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, impute_strategy='mean', cat_features=None, num_features=None):
        self.imputer = SimpleImputer(strategy=impute_strategy)
        self.missing_indicator = MissingIndicator()
        self.cat_features = cat_features
        self.num_features = num_features


    def fit_transform(self, X_train):
        X_train = deepcopy(X_train)
        X_train['Name'] = X_train['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())
#         grp = X_train.groupby(['Sex', 'Pclass'])
#         grp['Age'].apply(lambda x: x.fillna(x.median()))
        X_train['Fare_Category'] = pd.cut(X_train['Fare'],
                                          bins=[0,7.90,14.45,31.28,120],
                                          labels=['Low','Mid', 'High_Mid','High'])
        breakpoint()
        self.detect_features(X_train)
        X_train[self.num_features] = self.imputer.fit_transform(X_train[self.num_features]).copy()
        X_train = pd.get_dummies(X_train, columns=self.cat_features)
        self.columns = X_train.columns
        return X_train


    def transform(self, X_test):
        X_test = deepcopy(X_test)
        X_test['Name'] = X_test['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())
        X_test['Fare_Category'] = pd.cut(X_test['Fare'],
                                         bins=[0,7.90,14.45,31.28,120],
                                         labels=['Low','Mid', 'High_Mid','High'])
        X_test[self.num_features] = self.imputer.transform(X_test[self.num_features])
        X_test = pd.get_dummies(X_test, columns=self.cat_features)
        X_test = self.align(X_test)
        return X_test


    def align(self, X_test):
        X_test = deepcopy(X_test)
        cols_to_drop = list(set(X_test.columns) - set(self.columns))
        cols_to_add = list(set(self.columns) - set(X_test.columns))
        X_test = X_test.drop(cols_to_drop, axis=1)
        X_test = X_test.assign(**dict(zip(cols_to_add, [0] * len(cols_to_add))))
        X_test = X_test[self.columns]
        return X_test


    def detect_features(self, X_train):
        self.num_features = [col for col in X_train.columns if X_train[col].dtype in (np.int64, np.float64)]
        self.cat_features = [col for col in X_train.columns if col not in self.num_features]


In [283]:
# transformer.imputer

In [284]:
# transformer.imputer.fit_transform(train[transformer.num_features])

In [285]:
y = train['Survived']
X = train.drop('Survived', axis=1)
X = X.set_index('PassengerId')
X_test = test.set_index('PassengerId')

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y)

transformer = Transformer()
X_train = transformer.fit_transform(X_train)
# X_val = transformer.transform(X_val)
X_test = transformer.transform(X_test)
X = transformer.transform(X)

> <ipython-input-281-2e3f4288099d>(24)fit_transform()
-> self.detect_features(X_train)
(Pdb) c


In [286]:
list(map(lambda x: x.shape, [X, X_train, X_test]))

[(891, 672), (668, 672), (418, 672)]

In [215]:
rf = RandomForestClassifier()

In [164]:
rf.fit(X, y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [165]:
from xgboost import XGBClassifier

In [287]:
xgb = XGBClassifier()

In [288]:
cross_val_score(xgb, X, y)



array([0.80808081, 0.83501684, 0.84175084])

In [289]:
xgb.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [306]:
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold

In [308]:
random_state = 42
n_splits = 5

kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

param_grid = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5],
    'n_estimators': [50, 100, 200, 400]
}

gs = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=kfold, scoring="neg_log_loss",
                  verbose=True, return_train_score=True, n_jobs=-1)

In [None]:
gs.fit(X, y)

Fitting 5 folds for each of 1620 candidates, totalling 8100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   45.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 10.6min


In [290]:
gender_submission['Survived'] = xgb.predict(X_test)
gender_submission.to_csv('submission.csv', index=False)

In [291]:
!kaggle competitions submit -c titanic -f submission.csv -m "Message"

100%|████████████████████████████████████████| 2.77k/2.77k [00:04<00:00, 581B/s]
Successfully submitted to Titanic: Machine Learning from Disaster