In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
warnings.filterwarnings("ignore")

In [3]:
# Load the training set from the local data folder and preview the first rows.
data = pd.read_csv("data/train.csv")
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [22]:
### Train/test split

# Columns fed to the model. "Name" is kept because the title feature is
# derived from it later in the preprocessing pipeline.
# NOTE(review): `vars` shadows the Python builtin of the same name for the rest
# of the kernel session; the name is kept because other cells may reference it.
vars = ["Pclass", "Sex", "Age", "Fare", "Name"]

x = data[vars]
y = data["Survived"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=123,
)

print(x_train.shape)
print(x_test.shape)

(712, 5)
(179, 5)


In [26]:
### Custom transformer: derive a title feature from the passenger name

from sklearn.base import BaseEstimator, TransformerMixin

class CleaningName(TransformerMixin, BaseEstimator):
    """Replace a name column with a coarse ``title`` category.

    The title is the text between the first comma and the following period of
    each name (e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``). It is then
    mapped onto a small set of groups via ``_TITLE_GROUPS``. Titles that are
    not in the table, and names that are missing or do not contain the
    ``", <title>."`` pattern, yield NaN in the ``title`` column.

    The transformer is stateless: ``fit`` learns nothing, and the input frame
    is never modified.

    Parameters
    ----------
    name_column : str
        Name of the DataFrame column holding the raw names. The column is
        removed from the output.
    """

    # Raw title -> group label. Kept as a class constant so it is built once
    # rather than on every transform call.
    # NOTE(review): "Master" was historically a form of address for young boys,
    # not an academic title -- confirm that grouping it with "Dr" is intended.
    _TITLE_GROUPS = {
        "Miss": "Ms",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Ms": "Ms",
        "Sir": "Mr",
        "Mme": "Mrs",
        "Master": "Academic",
        "Dr": "Academic",
        "Rev": "Royalty",
        "Col": "Royalty",
        "Major": "Royalty",
        "Lady": "Ms",
        "Don": "Mr",
        "Mlle": "Ms",
        "Capt": "Royalty",
        "the Countess": "Royalty",
        "Jonkheer": "Royalty",
    }

    def __init__(
            self,
            name_column
            ):
        self.name_column = name_column

    @staticmethod
    def _extract_title(text):
        """Return the text between the first comma and the next period.

        Returns None when ``text`` is not a string or lacks either delimiter,
        so that the subsequent mapping turns it into NaN instead of raising.
        """
        if not isinstance(text, str):
            return None
        start = text.find(",")
        if start == -1:
            return None
        # Search for the period only after the comma, so a period inside the
        # surname part cannot be mistaken for the end of the title.
        end = text.find(".", start + 1)
        if end == -1:
            return None
        return text[start + 1:end].strip()

    def fit(self, x, y = None):
        """No-op fit; accepts ``y`` so the transformer works inside pipelines.

        Returns
        -------
        self
        """
        return self

    def transform(self, X, y = None):
        """Return a copy of ``X`` with a ``title`` column and without the name column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing ``self.name_column``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            New frame; ``X`` itself is left untouched.
        """
        titles = X[self.name_column].apply(self._extract_title).map(self._TITLE_GROUPS)
        # assign() builds a new frame, so the caller's data is not mutated.
        return X.assign(title=titles).drop(columns=self.name_column)

    def fit_transform(self, x, y = None):
        """Fit (no-op) and transform ``x`` in one call; see ``transform``."""
        return self.fit(x, y).transform(x)

In [23]:
# Stand-alone instance of the name transformer for ad-hoc inspection.
cleaning_name = CleaningName(name_column="Name")

In [28]:


# Numeric columns: fill missing values with the column median.
numeric_features = ["Age", "Pclass", "Fare"]
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Plain categorical columns: one-hot encode, ignoring categories that were
# not seen during fit.
categorical_features = ["Sex"]
categorical_transformer = Pipeline([("encoder", OneHotEncoder(handle_unknown="ignore"))])

# Name column: reduce it to a title group with CleaningName, then one-hot
# encode the resulting category.
custom_features = ["Name"]
custom_transformer = Pipeline(
    [
        ("cleaning_name", CleaningName("Name")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("custom", custom_transformer, custom_features),
    ]
)

# Sanity check: fit on the training features and display the encoded matrix.
preprocessor.fit_transform(x_train)

array([[16.    ,  1.    , 57.9792, ...,  0.    ,  1.    ,  0.    ],
       [31.    ,  3.    ,  7.75  , ...,  0.    ,  0.    ,  0.    ],
       [45.5   ,  3.    ,  7.225 , ...,  0.    ,  0.    ,  0.    ],
       ...,
       [32.    ,  3.    ,  7.925 , ...,  0.    ,  0.    ,  0.    ],
       [30.    ,  3.    ,  7.25  , ...,  0.    ,  0.    ,  0.    ],
       [29.    ,  3.    ,  7.75  , ...,  0.    ,  0.    ,  0.    ]],
      shape=(712, 10))

In [36]:
# Full model: preprocessing followed by a LightGBM classifier with a fixed
# seed and its logging silenced.
classifier_step = ("classifier", LGBMClassifier(random_state = 123, verbose = -1))
final_model = Pipeline([("preprocessor", preprocessor), classifier_step])


In [37]:
# Fit the preprocessing steps and the classifier on the training split.
final_model.fit(X=x_train, y=y_train)

In [39]:
from sklearn.metrics import accuracy_score

# Score the fitted pipeline on the held-out split.
y_pred = final_model.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8770949720670391

In [41]:
import pickle

# Persist the fitted pipeline. The context manager guarantees the file handle
# is flushed and closed even if pickling raises part-way through.
# NOTE(review): pickle stores classes by reference, and CleaningName is defined
# in this notebook -- whatever loads this file will need that class importable
# under the same module path; confirm the consumer in ../src provides it.
with open("../src/final_model.pickle", "wb") as model_file:
    pickle.dump(final_model, model_file)