In [973]:
import os
import urllib.request
import numpy as np

TITANIC_PATH = os.path.join("datasets", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"

def fetch_titanic_information(url=DOWNLOAD_URL, path=TITANIC_PATH):
    if not os.path.isdir(TITANIC_PATH):
        os.makedirs(TITANIC_PATH)
    for filename in ("train.csv", "test.csv"):
        filepath = os.path.join(path, filename)
        if not os.path.isfile(filepath):
            urllib.request.urlretrieve(DOWNLOAD_URL+filename, filepath)

fetch_titanic_information()

In [974]:
import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

In [975]:
#Set the index to passenger ID instead

train_data = train_data.set_index("PassengerId")
test_data = test_data.set_index("PassengerId")

train_data = train_data.drop(['Ticket', 'Cabin'], axis=1)
test_data = test_data.drop(['Ticket', 'Cabin'], axis=1)

In [976]:
from sklearn.impute import SimpleImputer
titanic_train = train_data.copy()
titanic_test = test_data.copy()

In [977]:
titanic_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [978]:
data_num = titanic_train.drop(["Name", "Sex", "Embarked"], axis=1)
test_num = titanic_test.drop(["Name", "Sex", "Embarked"], axis=1)
data_cat = titanic_train.drop(["Survived", "Pclass", "Age", "SibSp", "Parch", "Fare"], axis=1)
test_cat = titanic_test.drop(["Pclass", "Age", "SibSp", "Parch", "Fare"], axis=1)
imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(data_num)
data_num = pd.DataFrame(X, columns=data_num.columns, index=data_num.index)

In [979]:
def age_bucket(data_num):
    data_num["age_cat"] = pd.cut(data_num["Age"], bins=[0, 15, 30, 45, 60, 70, np.inf], labels=[1,2,3,4,5,6])
    data_num.drop("Age", axis=1, inplace=True)
    data_num["age_cat"] = pd.to_numeric(data_num["age_cat"])
    return data_num

def fare_bucket(data_num):
    data_num["fare_bucket"] = pd.cut(data_num["Fare"], bins=[0., 7.91, 14.45, 31.0, 512.32, np.inf], labels=[1,2,3,4,5])
    data_num.drop("Fare", axis=1, inplace=True)
    data_num["fare_bucket"] = pd.to_numeric(data_num["fare_bucket"])
    return data_num

In [980]:
data_num = age_bucket(data_num)
data_num = fare_bucket(data_num)
data_prepared = pd.concat([data_num, data_cat], axis=1)

In [981]:
data_prepared.head()

Unnamed: 0_level_0,Survived,Pclass,SibSp,Parch,age_cat,fare_bucket,Name,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.0,3.0,1.0,0.0,2,1.0,"Braund, Mr. Owen Harris",male,S
2,1.0,1.0,1.0,0.0,3,4.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,C
3,1.0,3.0,0.0,0.0,2,2.0,"Heikkinen, Miss. Laina",female,S
4,1.0,1.0,1.0,0.0,3,4.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,S
5,0.0,3.0,0.0,0.0,3,2.0,"Allen, Mr. William Henry",male,S


In [982]:
test_num = age_bucket(test_num)
test_num = fare_bucket(test_num)
test_prepared = pd.concat([test_num, test_cat], axis=1)

In [983]:
data_prepared.describe()

Unnamed: 0,Survived,Pclass,SibSp,Parch,age_cat,fare_bucket
count,891.0,891.0,891.0,891.0,891.0,876.0
mean,0.383838,2.308642,0.523008,0.381594,2.395062,2.534247
std,0.486592,0.836071,1.102743,0.806057,0.888717,1.116102
min,0.0,1.0,0.0,0.0,1.0,1.0
25%,0.0,2.0,0.0,0.0,2.0,2.0
50%,0.0,3.0,0.0,0.0,2.0,3.0
75%,1.0,3.0,1.0,0.0,3.0,4.0
max,1.0,3.0,8.0,6.0,6.0,5.0


Seems to be some null values in some features, we need to construct a pipeline that fill the null values with an Imputer and scale some features with StandardScalar

In [984]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

In [985]:

sp_index, parch_index = 1, 3

class dataCombiner(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        relatives_onboard = X[:, sp_index] + X[:, parch_index] + 1
        for num in relatives_onboard:
            if num > 1:
                num = 0
        final = np.c_[X, relatives_onboard]
        final = np.delete(final, [sp_index, parch_index], 1)
        return final


In [986]:
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("combiner", dataCombiner()),
    ("scaler", StandardScaler())
])

In [987]:
from sklearn.preprocessing import OneHotEncoder
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat", OneHotEncoder(handle_unknown='ignore'))
])

In [988]:
from sklearn.compose import ColumnTransformer

num_attributes = ["age_cat", "Parch", "fare_bucket", "SibSp"]
cat_attributes = ["Pclass", "Sex", "Embarked"]

preprocessing_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attributes),
    ("cat", cat_pipeline, cat_attributes)
])

In [989]:
train_prepared = preprocessing_pipeline.fit_transform(data_prepared[num_attributes + cat_attributes])

In [990]:
train_prepared[0].shape

(11,)

In [991]:
y_train = titanic_train["Survived"]

In [992]:
X_test = preprocessing_pipeline.fit_transform(test_prepared[num_attributes + cat_attributes])

In [993]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

cv_method = RepeatedStratifiedKFold(n_splits=5,  n_repeats=3, random_state=999)
test = [i for i in range(3,10)]
test_poly = [i for i in range(3,8)]
test_c = [0.1, 0.3, 1, 3, 10, 30]

params = [
    {"kernel": ["rbf", "poly", "linear"], "gamma": ["auto"], "degree": test_poly, "C": test_c, "random_state": [42]}
]
params2 = [
    {"weights" : ['uniform', 'distance'], "n_neighbors" : test}
]
params3 = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 5, 10, 30, 50, 100], 'max_features': [2, 4, 6, 8, 10]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 4, 6, 8, 10]},
]
params4 =[
    {'max_features' :['sqrt'],'max_depth' : [4,5,6,7,8,9,10],'criterion': ['gini','entropy']}
]

gnb = GaussianNB()
svc_clf = SVC()
knn = KNeighborsClassifier()
rf = RandomForestClassifier()
dt = DecisionTreeClassifier()


In [994]:
grid_search = GridSearchCV(svc_clf, param_grid=params, cv=10)
grid_search2 = GridSearchCV(knn, param_grid=params2, cv=10)
grid_search3 = GridSearchCV(rf, param_grid=params3, cv=10)
grid_search4 = GridSearchCV(dt, param_grid=params4, cv=10)

In [995]:
grid_search.fit(train_prepared, y_train)
grid_search2.fit(train_prepared, y_train)
grid_search3.fit(train_prepared, y_train)
grid_search4.fit(train_prepared, y_train)

In [996]:
svm_clf = SVC(**grid_search.best_params_)
knn = KNeighborsClassifier(**grid_search2.best_params_)
rf = RandomForestClassifier(**grid_search3.best_params_)
dt = DecisionTreeClassifier(**grid_search4.best_params_)

forest_score = cross_val_score(rf, train_prepared, y_train, cv=10)
dt_score = cross_val_score(dt, train_prepared, y_train, cv=10)
svm_scores = cross_val_score(svm_clf, train_prepared, y_train, cv=10)
knn_scores = cross_val_score(knn, train_prepared, y_train, cv=10)
gnb_scores = cross_val_score(gnb, train_prepared, y_train, cv=10)

print(svm_scores.mean())
print(knn_scores.mean())
print(gnb_scores.mean())
print(dt_score.mean())
print(forest_score.mean())

0.8316729088639201
0.8271910112359551
0.7677278401997503
0.813732833957553
0.8137827715355804


In [1000]:
rf.fit(train_prepared, y_train)
answer = np.reshape(rf.predict(X_test),(418,1))
passenger_id = [i for i in range(892, 892+answer.shape[0])]
arr = np.reshape(np.transpose(np.array(passenger_id)), (418,1))
answer = np.append(arr, answer, axis=1)

In [1001]:
answer_pd = pd.DataFrame(answer, columns=["PassengerId", "Survived"], index=None)

In [1002]:
answer_pd.to_csv('answers3.csv', index=False)