# Titanic Model Simple

#### Retrieve the dependencies

In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score

#### Load the data

In [2]:
df = pd.read_csv('train.csv')
X = df.drop('Survived', axis=1) 
y = df.Survived

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.1, random_state=1234) 

#### Preprocess the data

In [3]:
class PrepProcesor(BaseEstimator, TransformerMixin): 
    def fit(self, X, y=None): 
        self.ageImputer = SimpleImputer()
        self.ageImputer.fit(X[['Age']])        
        return self 
        
    def transform(self, X, y=None):
        X['Age'] = self.ageImputer.transform(X[['Age']])
        X['CabinClass'] = X['Cabin'].fillna('M').apply(lambda x: str(x).replace(" ", "")).apply(lambda x: re.sub(r'[^a-zA-Z]', '', x))
        X['CabinNumber'] = X['Cabin'].fillna('M').apply(lambda x: str(x).replace(" ", "")).apply(lambda x: re.sub(r'[^0-9]', '', x)).replace('', 0) 
        X['Embarked'] = X['Embarked'].fillna('M')
        X = X.drop(['PassengerId', 'Name', 'Ticket','Cabin'], axis=1)
        return X

In [4]:
preproc = PrepProcesor()
numeric_pipeline = Pipeline([('Scaler', StandardScaler())]) 
categorical_pipeline = Pipeline([('OneHot', OneHotEncoder(handle_unknown='ignore'))])
transformer = ColumnTransformer([('num', numeric_pipeline, ['Pclass','Age','SibSp','Parch','Fare','CabinNumber']), ('cat', categorical_pipeline, ['Sex', 'Embarked'])])

In [5]:
mlpipe = Pipeline([('InitialPreproc', PrepProcesor()), ('Transformer',transformer), ('xgb', XGBClassifier())])

In [6]:
mlpipe.fit(X_train,y_train)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CabinClass,CabinNumber
267,268,3,"Persson, Mr. Ernst Ulrik",male,25.000000,1,0,347083,7.7750,,S,M,0
635,636,2,"Davis, Miss. Mary",female,28.000000,0,0,237668,13.0000,,S,M,0
473,474,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23.000000,0,0,SC/AH Basle 541,13.7917,D,C,D,0
207,208,3,"Albimona, Mr. Nassef Cassem",male,26.000000,0,0,2699,18.7875,,C,M,0
290,291,1,"Barber, Miss. Ellen ""Nellie""",female,26.000000,0,0,19877,78.8500,,S,M,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,205,3,"Cohen, Mr. Gurshon ""Gus""",male,18.000000,0,0,A/5 3540,8.0500,,S,M,0
53,54,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkin...",female,29.000000,1,0,2926,26.0000,,S,M,0
294,295,3,"Mineff, Mr. Ivan",male,24.000000,0,0,349233,7.8958,,S,M,0
723,724,2,"Hodges, Mr. Henry Price",male,50.000000,0,0,250643,13.0000,,S,M,0


In [8]:
yhat = mlpipe.predict(X_test) 

precision_score(y_test, yhat) 

0.8108108108108109

In [9]:
import joblib

In [10]:
joblib.dump(mlpipe, 'xgbpipe.joblib') 

['xgbpipe.joblib']

In [11]:
model = joblib.load('xgbpipe.joblib')

In [12]:
test = pd.read_csv('test.csv')

In [14]:
yhat = model.predict(test)
yhat


array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,