In [70]:
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
#import plotly.express as px

from pandas_profiling import ProfileReport
from xgboost import XGBClassifier
from xgboost import plot_importance

# Scikit-learn packages
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer

## Import helper functions
from ipynb.fs.defs.utils import *

In [71]:
# Name of the competition's loss function, spelled the way xgboost expects it.
KAGGLE_EVAL_METRIC = "logloss"

In [72]:
# Locations of the input CSV files (relative paths, resolved against the working directory).
TRAIN_PATH, TEST_PATH = "data/train.csv", "data/test.csv"

In [73]:
# Label column(s) to predict. Kept as a list, so frame[TARGET] selects a
# one-column DataFrame rather than a Series.
TARGET = ["Survived"]

# Load Data

In [254]:
# Read the raw training and test sets from their configured CSV paths.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [255]:
## Bring in Age Predictions

In [256]:
## Create a combined data set so that train and test receive the same pre-processing.
## The 'label' column records which set each row came from, so they can be separated again later.
train['label'] = 'train'
test['label'] = 'test'
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat is the supported replacement and, like append, keeps each frame's
# original index by default.
combined = pd.concat([train, test])

# Pre-Processing

In [107]:
# Apply the name-feature transform first, then the ticket transform.
# Each helper is handed the frame and its result replaces `combined`.
combined = combined.pipe(nlp_transforms).pipe(ticket_transforms)

In [108]:
# Collapse Nickname into a 0/1 flag: 1 where a value is present, 0 where it is missing.
combined["Nickname"] = np.where(combined["Nickname"].notnull(), 1, 0)

In [109]:
# Collapse Junior into a 0/1 flag: 1 where a value is present, 0 where it is missing.
combined["Junior"] = np.where(combined["Junior"].notnull(), 1, 0)

In [110]:
# Reduce each cabin value to its first character. Values are stringified
# first, so a missing cabin (NaN) becomes "nan" and is reduced to "n".
combined["Cabin"] = combined["Cabin"].astype(str).map(lambda cabin: cabin[0])

In [111]:
# Family size = siblings/spouses + parents/children + 1
# (the extra 1 presumably counts the passenger themself).
combined["family_size"] = combined["SibSp"].add(combined["Parch"]).add(1)

## Define Features and Target
Categorical and numeric features are listed manually, because a column's dtype alone is not a reliable guide: some float/int columns should be treated as categorical, and vice versa.

In [112]:
# Show the columns available at this point, as a reference for the feature lists below.
combined.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'label', 'First Name',
       'Title', 'Nickname', 'Last Name', 'Junior', 'Middle Names',
       'Ticket Number', 'Ticket Type', 'family_size'],
      dtype='object')

In [153]:
## Helper lists used to build the final feature set.
## Every candidate column is classified as categorical or numeric by hand; `to_drop`
## names the columns that must not reach the model (identifiers, raw text, target, bookkeeping).
# 'Cabin' used to appear twice in this list; each column is now listed exactly once
# so it cannot be fed to the preprocessor as two identical features.
all_categorical_features = [
    'Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title',
    'Junior', 'Nickname', 'Ticket Type', 'family_size',
]
all_numeric_features = ['Age', 'SibSp', 'Parch', 'Fare', 'Ticket Number']
to_drop = [
    "PassengerId", "Name", "Last Name", "Survived", "Ticket", "Ticket Number",
    "Middle Names", "label", "First Name", "SibSp", "Parch",
]

In [154]:
# NOTE(review): feature_selection comes from the utils helpers; presumably it removes
# the `to_drop` columns from each candidate list — confirm against its definition.
features, categorical_features, numeric_features = feature_selection(
    to_drop,
    all_categorical_features,
    all_numeric_features,
)

In [155]:
# Separate the combined frame again: rows tagged 'train' supply the features and
# target, rows tagged 'test' supply the features to score. The `to_drop` columns
# are removed from both feature frames.
X = combined.loc[combined['label'].eq('train')].drop(columns=to_drop)
y = combined.loc[combined['label'].eq('train'), TARGET]
test = combined.loc[combined['label'].eq('test')].drop(columns=to_drop)

# Create Pipeline

In [247]:
def _build_pipeline(numeric_cols, categorical_cols):
    """Assemble the preprocessing + XGBoost pipeline fitted by ``split``.

    Args:
        numeric_cols: column names routed through median imputation and scaling.
        categorical_cols: column names routed through constant imputation and
            ordinal encoding.

    Returns:
        An unfitted scikit-learn ``Pipeline`` with a ``preprocessor`` step
        followed by a ``classifier`` step.
    """
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        # Categories not seen during fit are encoded as NaN, which XGBoost
        # treats as a missing value by default.
        ('ordinal', OrdinalEncoder(unknown_value=np.nan, handle_unknown='use_encoded_value'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)])

    classifier = XGBClassifier(
        learning_rate=0.01,
        n_estimators=100,
        max_depth=4,
        min_child_weight=1,
        gamma=0.9,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=-1,
        use_label_encoder=False,
        scale_pos_weight=1)

    return Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])


def split(data, to_drop, submission=False):
    """Fit the model pipeline on ``data`` and return class predictions.

    Despite its name, this function splits the data, builds and fits the
    pipeline, and predicts. It reads the notebook-level ``TARGET``,
    ``numeric_features``, ``categorical_features`` and ``KAGGLE_EVAL_METRIC``.

    Args:
        data: frame holding both train and test rows, distinguished by a
            ``'label'`` column whose values are ``'train'`` / ``'test'``.
        to_drop: columns removed from the feature frames before fitting.
        submission: when true, fit on every ``'train'`` row and predict the
            ``'test'`` rows. Otherwise hold out 10% of the ``'train'`` rows,
            predict them, and print log-loss and accuracy for that hold-out.

    Returns:
        The array of predicted classes for the test rows (submission mode)
        or for the hold-out rows (evaluation mode).
    """
    # Use the frame that was passed in; previously the `data` argument was
    # ignored and the global `combined` was read instead.
    train_rows = data[data['label'] == 'train']
    X = train_rows.drop(to_drop, axis=1)
    y = train_rows[TARGET]

    if submission:
        print("Submission")
        X_train, y_train = X, y
        X_test = data[data['label'] == 'test'].drop(to_drop, axis=1)
        y_test = None  # no ground truth is available for the submission rows
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=40)

    pipe = _build_pipeline(numeric_features, categorical_features)

    # NOTE(review): newer xgboost releases expect eval_metric on the estimator
    # constructor rather than on fit() — confirm against the installed version.
    pipe.fit(
        X_train,
        y_train,
        classifier__eval_metric=KAGGLE_EVAL_METRIC,
    )

    y_pred = pipe.predict(X_test)

    if not submission:
        # Log-loss must be scored on predicted probabilities. Feeding it hard
        # 0/1 predictions (as before) makes every misclassification count as a
        # maximally confident error and misstates the metric.
        probs = pipe.predict_proba(X_test)
        y_true = np.ravel(y_test)
        print(metrics.log_loss(y_true, probs))
        print(metrics.accuracy_score(y_true, y_pred))

    return y_pred

In [248]:
# Fit on all training rows and predict the held-back test rows for submission.
y_pred = split(data=combined, to_drop=to_drop, submission=True)

Submission


In [258]:
# Re-read the raw test file and attach the predictions as a new column.
# This relies on y_pred being in the same row order as the file.
test = pd.read_csv(TEST_PATH).assign(prediction=y_pred)

In [263]:
# Keep only the identifier and prediction columns for the submission file.
submit = test.loc[:, ['PassengerId', 'prediction']]

In [None]:
#submit.to_csv('data/submission.csv', index=False)
#! kaggle competitions submit -c sliced-s01e03-DcSXes -f 'data/submission.csv' -m "No feature engineering"