In [150]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pickle
warnings.filterwarnings('ignore')

In [132]:
### Load data

# A single CSV is read and a hold-out split is carved out of it, so accuracy
# can be measured against known "Survived" labels.
data = pd.read_csv("data/train.csv")

# Later cells add columns to both splits in place. Taking explicit copies makes
# each split an independent frame, so pandas has no reason to treat those
# assignments as writes to a derived object.
train, test = (
    split.copy()
    for split in train_test_split(data, test_size=0.2, random_state=123)
)
print(train.shape)
print(test.shape)


(712, 12)
(179, 12)


In [78]:
### Check null values

# Share of missing entries per column of the training split.
train_missing_share = train.isna().mean()
train_missing_share

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.207865
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.780899
Embarked       0.002809
dtype: float64

In [127]:
### Imputation strategy

#### Mean imputation

# Column means are learned from the training split only and then reused for
# the hold-out split, so no hold-out statistics enter the imputed values.
raw_columns = ["Age", "Fare"]
imputed_columns = ["Age_mean_imputed", "Fare_mean_imputed"]

mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
mean_imputer.fit(train[raw_columns])

train[imputed_columns] = mean_imputer.transform(train[raw_columns])
test[imputed_columns] = mean_imputer.transform(test[raw_columns])




In [137]:
### Getting titles from name

class TitleCreation(TransformerMixin, BaseEstimator):
    """Replace a name column with a coarse title group.

    The title is the text between the first comma of the name and the first
    period after it (e.g. ``"Hippach, Miss. Jean Gertrude"`` -> ``"Miss"``),
    which is then mapped through ``TITLE_GROUPS``.

    The transformer is stateless: ``fit`` learns nothing.

    Parameters
    ----------
    column : str
        Name of the DataFrame column holding the full name.
    """

    # Raw title -> group. Built once instead of on every call.
    # NOTE(review): "Master" is grouped with "Dr" as "Academic" — confirm that
    # this is intended.
    TITLE_GROUPS = {
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Ms",
        "Master": "Academic",
        "Don": "Royalty",
        "Rev": "Royalty",
        "Dr": "Academic",
        "Mme": "Mrs",
        "Ms": "Ms",
        "Major": "Military",
        "Lady": "Royalty",
        "Sir": "Royalty",
        "Mlle": "Ms",
        "Col": "Military",
        "Capt": "Military",
        "the Countess": "Royalty",
        "Jonkheer": "Royalty",
    }

    def __init__(self, column):
        self.column = column

    @staticmethod
    def _extract_title(text):
        """Return the raw title found in ``text``, or ``None`` if there is none.

        ``None`` is returned for non-string values (e.g. NaN) and for strings
        without a comma followed by a period, rather than raising or slicing
        an arbitrary part of the string.
        """
        if not isinstance(text, str):
            return None
        comma = text.find(",")
        if comma == -1:
            return None
        period = text.find(".", comma + 1)
        if period == -1:
            return None
        return text[comma + 1:period].strip()

    def fit(self, x: pd.DataFrame, y = None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, x: pd.DataFrame, y = None):
        """Return a one-column DataFrame of title groups.

        The result keeps the index of ``x`` and is named ``self.column``.
        Titles that are missing or not listed in ``TITLE_GROUPS`` come out as
        NaN. ``x`` itself is not modified.
        """
        titles = x[self.column].apply(self._extract_title).map(self.TITLE_GROUPS)
        return titles.to_frame(name=self.column)

    def fit_transform(self, x: pd.DataFrame, y = None):
        """Equivalent to ``fit(x, y).transform(x)``."""
        return self.fit(x, y).transform(x)


transformer = TitleCreation("Name")
# The one-hot encoding cell further down reads a "title" column from both
# splits, so it has to be created here for a fresh-kernel run to succeed.
# The transformer returns a one-column frame named "Name"; take that column.
train["title"] = transformer.fit_transform(train)["Name"]
test["title"] = transformer.transform(test)["Name"]
transformer.fit_transform(train)

Unnamed: 0,Name
329,Ms
749,Mr
203,Mr
421,Mr
97,Mr
...,...
98,Mrs
322,Ms
382,Mr
365,Mr


In [81]:
### Handling categorical variables

def _dummy_frame(encoder, frame, column, fit=False):
    """One-hot encode ``frame[[column]]`` into a dense DataFrame.

    The encoder is fitted first when ``fit`` is true; otherwise it must
    already be fitted. Columns are named after the encoder's categories and
    the index of ``frame`` is kept so the result can be concatenated back.
    """
    encode = encoder.fit_transform if fit else encoder.transform
    dense = encode(frame[[column]]).toarray()
    return pd.DataFrame(dense, columns=encoder.categories_[0], index=frame.index)


# Each encoder is fitted on the training split and reused for the hold-out split.
enc_sex = OneHotEncoder(handle_unknown="ignore")
train_sex_cat = _dummy_frame(enc_sex, train, "Sex", fit=True)
test_sex_cat = _dummy_frame(enc_sex, test, "Sex")

enc_title = OneHotEncoder(handle_unknown="ignore")
train_title_cat = _dummy_frame(enc_title, train, "title", fit=True)
test_title_cat = _dummy_frame(enc_title, test, "title")

# Append the indicator columns to the original frames (aligned on the index).
train_final = pd.concat([train, train_sex_cat, train_title_cat], axis = 1)
test_final = pd.concat([test, test_sex_cat, test_title_cat], axis = 1)
train_final.head()



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Fare_mean_imputed,title,female,male,Academic,Military,Mr,Mrs,Ms,Royalty
329,330,1,1,"Hippach, Miss. Jean Gertrude",female,16.0,0,1,111361,57.9792,...,57.9792,Ms,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
749,750,0,3,"Connaghton, Mr. Michael",male,31.0,0,0,335097,7.75,...,7.75,Mr,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
203,204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,...,7.225,Mr,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
421,422,0,3,"Charters, Mr. David",male,21.0,0,0,A/5. 13032,7.7333,...,7.7333,Mr,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
97,98,1,1,"Greenfield, Mr. William Bertram",male,23.0,0,1,PC 17759,63.3583,...,63.3583,Mr,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [82]:
# Share of missing entries per column of the assembled hold-out frame.
test_missing_share = test_final.isna().mean()
test_missing_share

PassengerId          0.000000
Survived             0.000000
Pclass               0.000000
Name                 0.000000
Sex                  0.000000
Age                  0.162011
SibSp                0.000000
Parch                0.000000
Ticket               0.000000
Fare                 0.000000
Cabin                0.731844
Embarked             0.000000
Age_mean_imputed     0.000000
Fare_mean_imputed    0.000000
title                0.000000
female               0.000000
male                 0.000000
Academic             0.000000
Military             0.000000
Mr                   0.000000
Mrs                  0.000000
Ms                   0.000000
Royalty              0.000000
dtype: float64

In [88]:
### Simple machine learning

# NOTE: `vars` shadows the Python builtin of the same name. It is kept because
# the prediction cell refers to it.
vars = [
    "Fare_mean_imputed",
    "SibSp",
    "Parch",
    "Pclass",
    "Age_mean_imputed",
    "female",
    "male",
    "Academic",
    "Military",
    "Mr",
    "Mrs",
    "Ms",
    "Royalty"
]

target = ["Survived"]

# `target` is a list, so train_final[target] is a one-column DataFrame.
# Flatten it to a 1-D label vector before handing it to the estimators.
y_train = train_final[target].to_numpy().ravel()

model = LogisticRegression()
model_rf = RandomForestClassifier(random_state=123)
model.fit(train_final[vars], y_train)
model_rf.fit(train_final[vars], y_train)

In [84]:
# Display the assembled hold-out frame: original columns plus the imputed and
# one-hot encoded ones.
test_final

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Fare_mean_imputed,title,female,male,Academic,Military,Mr,Mrs,Ms,Royalty
172,173,1,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,...,11.1333,Ms,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
524,525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,...,7.2292,Mr,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
452,453,0,1,"Foreman, Mr. Benjamin Laventall",male,30.0,0,0,113051,27.7500,...,27.7500,Mr,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
170,171,0,1,"Van der hoef, Mr. Wyckoff",male,61.0,0,0,111240,33.5000,...,33.5000,Mr,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
620,621,0,3,"Yasbeck, Mr. Antoni",male,27.0,1,0,2659,14.4542,...,14.4542,Mr,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,...,7.7292,Mr,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
338,339,1,3,"Dahl, Mr. Karl Edwart",male,45.0,0,0,7598,8.0500,...,8.0500,Mr,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
827,828,1,2,"Mallet, Master. Andre",male,1.0,0,2,S.C./PARIS 2079,37.0042,...,37.0042,Academic,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
773,774,0,3,"Elias, Mr. Dibo",male,,0,0,2674,7.2250,...,7.2250,Mr,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [89]:
### Prediction

# Score both fitted models on the hold-out split using the same feature list.
features_test = test_final[vars]
labels_test = test_final[target]

preds = model.predict(features_test)
preds_rf = model_rf.predict(features_test)

accuracy = accuracy_score(labels_test, preds)
accuracy_rf = accuracy_score(labels_test, preds_rf)

# Logistic regression first, random forest second.
for score in (accuracy, accuracy_rf):
    print(score)

0.8491620111731844
0.8212290502793296


In [147]:
### Preprocessing as pipelines

# Numeric columns: missing values are replaced by the column mean.
numeric_features = ["Fare", "Age", "SibSp", "Parch", "Pclass"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean"))]
)

# Categorical columns: one-hot encoded; categories unseen during fit are ignored.
categorical_features = ["Sex"]
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Name column: reduced to a title group first, then one-hot encoded.
to_be_cleaned_features = ["Name"]
cleaning_transformer = Pipeline(
    steps=[
        ("clean", TitleCreation("Name")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("clean", cleaning_transformer, to_be_cleaned_features),
    ]
)

final_solution = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

# Fit on the raw training split: the pipeline does all feature engineering
# itself, so it should not depend on the hand-built train_final frame.
# A separate preprocessor.fit_transform(train) call is not needed, because
# fitting the pipeline fits its "preprocessor" step as well.
final_solution.fit(train, train["Survived"])

In [148]:
# Predict on the raw hold-out split; the pipeline applies its own
# preprocessing step before the classifier.
final_solution.predict(test)

array([1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0])

In [149]:
new_data = pd.read_csv("data/new_data.csv")
# The pipeline runs its preprocessor step internally, so a separate
# preprocessor.transform(new_data) call (whose result was discarded) is not
# needed before predicting.
final_solution.predict(new_data)

array([1, 0, 0, 0, 0, 0, 1, 1, 1, 1])

In [152]:
## Export as pickle

# Use a context manager so the file handle is closed (and the data flushed)
# even if pickling fails part-way.
# NOTE(review): the pipeline contains TitleCreation, which is defined in this
# notebook. pickle stores classes by reference, so the code that loads this
# file presumably needs that class importable under the same module path —
# confirm against the consumer in ../src.
with open("../src/my_final_solution.pickle", 'wb') as pickle_file:
    pickle.dump(final_solution, pickle_file)
