# Content

<font color="blue"></font>

1. [Load Data](#1)
2. [Variable Description](#2)
    * [Univariate Variable Analysis](#3)
        * [Categorical Variable](#4)
        * [Numerical Variable](#5)
3. [Basic Data Analysis](#6)
4. [Outlier Detection](#7)
5. [Missing Value](#8)
    * [Find Missing Value](#9)
    * [Fill Missing Value](#10)
6. [Visualization](#11)
    * [Correlation Between Sibsp -- Parch -- Age -- Fare -- Survived](#12)
    * [SibSp -- Survived](#13)
    * [Parch -- Survived](#14)
    * [Pclass -- Survived](#15)
    * [Age -- Survived](#16)
    * [Pclass -- Survived -- Age](#17)
    * [Embarked -- Sex -- Pclass -- Survived](#18)
    * [Embarked -- Sex -- Fare -- Survived](#19)
    * [Fill Missing: Age Feature](#20)
7. [Feature Engineering](#21)
    * [Name-Title](#22)
    * [Family Size](#23)
    * [Embarked](#24)
    * [Ticket](#25)
    * [Pclass](#26)
    * [Sex](#27)
    * [Cabin](#28)
    * [Drop Passenger ID](#29)
8. [Modeling](#30)
    * [Train-Test Split](#31)
    * [Simple Logistic Regression](#32)
    * [Hyperparameter Tuning -- Grid Search -- Cross Validation](#33)
    * [Ensemble Modeling](#34)
9. [Prediction and Submission](#35)
    

In [535]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
#plt.style.use('seaborn-whitegrid')

import seaborn as sns

from collections import Counter
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found under the read-only Kaggle input directory.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id='1'></a>
# Load Data

In [536]:
# Load the Titanic train and test CSVs from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')
# Keep the test PassengerId column aside: PassengerId is dropped from the feature
# frame later on and is needed again when the submission file is written.
test_PassengerId = test_df["PassengerId"]

In [537]:
# Observing the features that belong to train dataset.
train_df.columns

In [538]:
# Get a first look at the data with head()
train_df.head()

In [539]:
train_df.describe()

<a id='2'></a>
# Variable Description

1. PassengerId : unique id number to each passenger
2. Survived : whether the passenger survived (1) or died (0)
3. Pclass : passenger class
4. Name : name
5. Sex : gender of passenger
6. SibSp : number of siblings/spouses
7. Parch :  number of parents/children
8. Ticket : ticket number
9. Fare : amount of money spent on ticket
10. Cabin : cabin category
11. Embarked: port where passenger embarked (C = Cherbourg, Q = Queenstown, S = Southampton)

In [540]:
train_df.info()

* float64 (2) : Age, Fare
* int64 (5) : PassengerId, Survived, Pclass, SibSp, Parch
* object (5) : Name, Sex, Ticket, Cabin, Embarked

<a id='3'></a>
## Univariate Variable Analysis
 
* Categorical Variable : Survived, Sex, Pclass, Embarked, Cabin, Name, Ticket, SibSp and Parch
* Numerical Variable : Fare, Age and PassengerId


<a id='4'></a>
### Categorical Variable



In [541]:
def bar_plot(variable):
    """Draw a bar chart of the value counts of one column of the global ``train_df``.

    Args:
        variable: name of the (categorical) column to summarise.

    Shows the figure and then prints the value counts; returns nothing.
    """
    # Frequency of every distinct value, most frequent first.
    counts = train_df[variable].value_counts()
    categories = counts.index

    plt.figure(figsize=(9, 3))
    plt.bar(categories, counts)
    plt.xticks(categories, categories.values)
    plt.ylabel('frequency')
    plt.title(variable)
    plt.show()
    print("{}: \n {}".format(variable, counts))

In [542]:
# Low-cardinality features: one bar chart per feature.
category1 = ["Survived", "Sex", "Pclass", "Embarked", "SibSp", "Parch"]
for feature in category1:
    bar_plot(feature)

In [543]:
# High-cardinality features: print the value counts as text instead of plotting them.
category2 = ["Cabin", "Name", "Ticket"]
for feature in category2:
    feature_counts = train_df[feature].value_counts()
    print("{} \n".format(feature_counts))

<a id='5'></a>
### Numerical Variable

In [544]:
def plot_hist(variable):
    """Plot a 50-bin histogram of one numerical column of the global ``train_df``.

    Args:
        variable: name of the numerical column to plot.

    Shows the figure; returns nothing.
    """
    values = train_df[variable]

    plt.figure(figsize=(9, 3))
    plt.hist(values, bins=50)
    plt.title("{} distribution with hist".format(variable))
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.show()

In [545]:
# PassengerId is only an identifier, so its distribution is not plotted.
numericVar = ["Fare", "Age"]
for feature in numericVar:
    plot_hist(feature)

<a id='6'></a>
# Basic Data Analysis

* Pclass - Survived
* Sex - Survived
* SibSp - Survived
* Parch - Survived

In [546]:
# Pclass - Survived: mean survival rate per passenger class, highest rate first
(
    train_df.groupby("Pclass", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)

In [547]:
# Sex - Survived: mean survival rate per sex, highest rate first
(
    train_df.groupby("Sex", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)

In [548]:
# SibSp - Survived: mean survival rate per number of siblings/spouses, highest rate first
(
    train_df.groupby("SibSp", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)

In [549]:
# Parch - Survived: mean survival rate per number of parents/children, highest rate first
(
    train_df.groupby("Parch", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)

<a id='7'></a>
# Outlier Detection

In [550]:
def detect_outliers(df, features, min_outlier_features=2):
    """Return the index labels of rows that are IQR outliers in several features.

    A value is an outlier for a feature when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of that feature.

    Args:
        df: DataFrame holding the feature columns.
        features: iterable of numeric column names to inspect.
        min_outlier_features: a row is returned only when it is an outlier in
            MORE than this many features (default 2, i.e. at least three).

    Returns:
        List of index labels, in order of first detection.
    """
    outlier_counts = Counter()

    for c in features:
        column = df[c]
        # nanpercentile ignores missing values. Plain np.percentile returns NaN
        # as soon as the column contains a single NaN, which makes both
        # comparisons below False and silently disables the check for that column.
        Q1, Q3 = np.nanpercentile(column, [25, 75])
        # Outlier step: 1.5 times the inter-quartile range.
        outlier_step = 1.5 * (Q3 - Q1)
        is_outlier = (column < Q1 - outlier_step) | (column > Q3 + outlier_step)
        # Tally one hit per feature for every offending row.
        outlier_counts.update(df[is_outlier].index)

    return [idx for idx, hits in outlier_counts.items() if hits > min_outlier_features]

In [551]:
train_df.loc[detect_outliers(train_df,["Age","SibSp","Parch","Fare"])]

In [552]:
# drop outliers and renumber the remaining rows from 0
outlier_rows = detect_outliers(train_df, ["Age", "SibSp", "Parch", "Fare"])
train_df = train_df.drop(index=outlier_rows).reset_index(drop=True)

<a id='8'></a>
# Missing Value
* Find Missing Value
* Fill Missing Value

In [553]:
# Remember how many rows belong to the training set so the combined frame can be
# split back into train and test parts in the modeling section.
train_df_len = len(train_df)
# From here on train_df holds the training rows followed by the test rows, so every
# fill/encoding step below is applied to both at once.
# NOTE(review): the test CSV presumably has no Survived column, which would leave
# Survived as NaN for the appended rows — confirm.
train_df = pd.concat([train_df,test_df],axis=0).reset_index(drop=True)

In [554]:
train_df.head()

<a id='9'></a>
## Find Missing Value

In [555]:
train_df.columns[train_df.isnull().any()]

In [556]:
train_df.isnull().sum()

<a id='10'></a>
## Fill Missing Value

In [557]:
train_df[train_df["Embarked"].isnull()]

In [558]:
train_df.boxplot(column="Fare",by = "Embarked")
plt.show()

In [559]:
# Fill the missing Embarked values with port "C".
# NOTE(review): the choice of "C" is presumably read off the Fare-by-Embarked boxplot
# in the previous cell — verify against that plot.
train_df["Embarked"] = train_df["Embarked"].fillna("C")
# Sanity check: this should now display an empty frame.
train_df[train_df["Embarked"].isnull()]

In [560]:
train_df[train_df["Fare"].isnull()]

In [561]:
# Fill missing Fare values with the mean fare of third-class passengers
# (presumably the class of the passenger shown in the previous cell — verify).
third_class_fares = train_df.loc[train_df["Pclass"] == 3, "Fare"]
train_df["Fare"] = train_df["Fare"].fillna(third_class_fares.mean())

In [562]:
train_df[train_df["Fare"].isnull()]

<a id='11'></a>
# Visualization

<a id='12'></a>
## Correlation Between Sibsp -- Parch -- Age -- Fare -- Survived

In [563]:
# Pairwise correlations between the numeric features and the target.
list1 = ["SibSp", "Parch", "Age", "Fare", "Survived"]
corr_matrix = train_df[list1].corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f")
plt.show()

Fare feature seems to have correlation with survived feature (0.26).



<a id='13'></a>
## SibSp -- Survived


In [564]:
# sns.factorplot was renamed to catplot (and its `size` argument to `height`) in
# seaborn 0.9; the old names are no longer available in current seaborn releases.
g = sns.catplot(x="SibSp", y="Survived", data=train_df, kind="bar", height=6)
g.set_ylabels("Survived Probability")
plt.show()

* Passengers with many siblings/spouses (SibSp) have less chance to survive.
* If SibSp is 0, 1 or 2, the passenger has more chance to survive.
* We can consider a new feature describing these categories.

<a id='14'></a>
## Parch -- Survived


In [565]:
# sns.factorplot was renamed to catplot (and its `size` argument to `height`) in
# seaborn 0.9; the old names are no longer available in current seaborn releases.
g = sns.catplot(x="Parch", y="Survived", data=train_df, kind="bar", height=6)
g.set_ylabels("Survived Probability")
plt.show()

* SibSp and Parch can be used for new feature extraction with a threshold of 3.
* Small families have more chance to survive.
* There is a large standard deviation in the survival of passengers with Parch = 3.

<a id='15'></a>
## Pclass -- Survived


In [566]:
# sns.factorplot was renamed to catplot (and its `size` argument to `height`) in
# seaborn 0.9; the old names are no longer available in current seaborn releases.
g = sns.catplot(x="Pclass", y="Survived", data=train_df, kind="bar", height=6)
g.set_ylabels("Survived Probability")
plt.show()

<a id='16'></a>
## Age -- Survived


In [567]:
# One Age distribution panel per value of Survived.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases (histplot /
# displot are the replacements) — revisit when upgrading seaborn.
g = sns.FacetGrid(train_df,col="Survived")
g.map(sns.distplot,"Age",bins=25)
plt.show()

* age <= 10 has a high survival rate,
* oldest passengers (80) survived,
* large number of 20 years old did not survive,
* most passengers are in 15-35 age range,
* use age feature in training
* use age distribution for missing value of age

<a id='17'></a>
## Pclass -- Survived -- Age

In [568]:
# Age histograms on a Pclass (rows) x Survived (columns) grid.
# `height` is the current name of FacetGrid's former `size` argument (renamed in seaborn 0.9).
g = sns.FacetGrid(train_df, col="Survived", row="Pclass", height=2)
g.map(plt.hist, "Age", bins=25)
g.add_legend()
plt.show()

* Pclass is important feature for model training.

<a id='18'></a>
## Embarked -- Sex -- Pclass -- Survived

In [569]:
# Survival rate by Pclass, split by Sex, one row per port of embarkation.
# `height` is the current name of FacetGrid's former `size` argument (renamed in seaborn 0.9).
g = sns.FacetGrid(train_df, row='Embarked', height=2)
# Categorical plots mapped over facets need explicit orders: otherwise each facet
# derives its own category order from the subset of data it receives, and the
# facets (and the shared legend) may not be comparable.
g.map(sns.pointplot, "Pclass", "Survived", "Sex",
      order=[1, 2, 3], hue_order=["male", "female"])
g.add_legend()
plt.show()

* embarked and sex features will be used in training.


<a id='19'></a>
## Embarked -- Sex -- Fare -- Survived

In [570]:
# Mean Fare by Sex on an Embarked (rows) x Survived (columns) grid.
# `height` is the current name of FacetGrid's former `size` argument (renamed in seaborn 0.9).
g = sns.FacetGrid(train_df, row="Embarked", col="Survived", height=2.3)
# Fix the bar order explicitly: otherwise each facet derives its own category order
# from the subset of data it receives and the facets may not be comparable.
g.map(sns.barplot, "Sex", "Fare", order=["male", "female"])
g.add_legend()
plt.show()

* Fare feature can be used for categorical variable for training

<a id='20'></a>
## Fill Missing: Age Feature

In [571]:
train_df[train_df["Age"].isnull()]

In [572]:
# Age distribution by Sex and Pclass.
# sns.factorplot was renamed to catplot in seaborn 0.9 and is no longer available
# in current seaborn releases.
sns.catplot(x="Sex", y="Age", hue="Pclass", data=train_df, kind="box")
plt.show()

* First-class (Pclass 1) passengers are older than second-class passengers.
* Second-class (Pclass 2) passengers are older than third-class passengers.

In [573]:
# Age distribution by SibSp and by Parch.
# sns.factorplot was renamed to catplot in seaborn 0.9 and is no longer available
# in current seaborn releases.
sns.catplot(x="SibSp", y="Age", data=train_df, kind="box")
sns.catplot(x="Parch", y="Age", data=train_df, kind="box")
plt.show()

In [574]:
# Sex is still a string column at this point, so it has to be encoded before it can
# take part in .corr(): depending on the pandas version a string column is either
# silently left out of the matrix or makes .corr() raise. The encoding (male = 1,
# female = 0) is applied to a temporary copy only; train_df["Sex"] is left untouched
# for the one-hot encoding done later.
age_corr_features = train_df[["Age", "Sex", "SibSp", "Parch", "Pclass"]].assign(
    Sex=lambda frame: (frame["Sex"] == "male").astype(int)
)
sns.heatmap(age_corr_features.corr(), annot=True)
plt.show()

* Age feature is not correlated with Sex feature but Age feature is correlated with Parch, SibSp,Pclass features.

In [575]:
# Impute each missing Age with the median Age of passengers sharing the same SibSp,
# Parch and Pclass; fall back to the overall mean Age when that group has no known Age.
# Rows are filled one at a time, so already-imputed values feed into later medians/means.
index_nan_age = list(train_df.index[train_df["Age"].isnull()])
for i in index_nan_age:
    row = train_df.loc[i]
    same_group = (
        (train_df["SibSp"] == row["SibSp"])
        & (train_df["Parch"] == row["Parch"])
        & (train_df["Pclass"] == row["Pclass"])
    )
    age_pred = train_df.loc[same_group, "Age"].median()
    age_mean = train_df["Age"].mean()
    # Write through .loc on the frame itself: the chained form
    # train_df["Age"].iloc[i] = ... assigns to an intermediate Series, which triggers
    # SettingWithCopyWarning and is not guaranteed to update train_df.
    train_df.loc[i, "Age"] = age_mean if np.isnan(age_pred) else age_pred

In [576]:
train_df[train_df["Age"].isnull()]

<a id='21'></a>
# Feature Engineering

<a id='22'></a>
## Name-Title

In [577]:
name = train_df["Name"]
# The title is the text before the first "." with everything up to the last ","
# of that part removed, e.g. "Surname, Mr. First" -> "Mr".
train_df["Title"] = [
    full_name.partition(".")[0].rpartition(",")[2].strip() for full_name in name
]
train_df["Title"].head(10)

In [578]:
sns.countplot(x="Title",data=train_df)
plt.xticks(rotation=60)
plt.show()

In [579]:
# convert to categorical codes: 0 = Master, 1 = Miss/Ms/Mlle/Mrs, 2 = Mr, 3 = any other title.
# Rare titles (Lady, Capt, Dr, ...) need no separate relabelling step: every title
# that is missing from the mapping receives code 3.
title_codes = {"Master": 0, "Miss": 1, "Ms": 1, "Mlle": 1, "Mrs": 1, "Mr": 2}
train_df["Title"] = [title_codes.get(title, 3) for title in train_df["Title"]]
train_df["Title"].head(20)

In [580]:
sns.countplot(x="Title",data=train_df)
plt.xticks(rotation=90)
plt.show()

In [581]:
# Survival rate per encoded title group.
# sns.factorplot was renamed to catplot in seaborn 0.9 and is no longer available
# in current seaborn releases.
g = sns.catplot(x="Title", y="Survived", data=train_df, kind="bar")
g.set_xticklabels(["Master","Mrs.","Mr.","Others"])
g.set_ylabels("Survival Probability")
plt.show()

In [582]:
# The raw Name column is no longer needed once Title has been extracted from it.
train_df = train_df.drop(columns=["Name"])

In [583]:
train_df.head()

In [584]:
train_df = pd.get_dummies(train_df,columns=["Title"])
train_df.head()

<a id='23'></a>
## Family Size

In [585]:
train_df.head()

In [586]:
# Family size = siblings/spouses + parents/children + 1 for the passenger themself.
train_df["Fsize"] = train_df["SibSp"] + train_df["Parch"] + 1

In [587]:
train_df.head()

In [588]:
# Survival rate per family size.
# sns.factorplot was renamed to catplot in seaborn 0.9 and is no longer available
# in current seaborn releases.
g = sns.catplot(x="Fsize", y="Survived", data=train_df, kind="bar")
g.set_ylabels("Survival")
plt.show()

### 1st Choice
To convert this feature into a categorical type, three different categories are constructed:

- 1st category includes (Fsize = 1 or Fsize = 7)
- 2nd category includes (Fsize = 2 or Fsize = 3 or Fsize = 4)
- 3rd category includes (Fsize = 5 or Fsize = 6)

In [589]:
# Bucket Fsize: 0 -> size 1 or 7, 1 -> sizes 2 to 4, 2 -> every other size.
family_bucket = {1: 0, 7: 0, 2: 1, 3: 1, 4: 1}
train_df["family_size"] = [family_bucket.get(size, 2) for size in train_df["Fsize"]]

In [590]:
sns.countplot(x="family_size",data=train_df)
plt.show()

In [591]:
# Survival rate per family-size bucket.
# sns.factorplot was renamed to catplot in seaborn 0.9 and is no longer available
# in current seaborn releases.
g = sns.catplot(x="family_size", y="Survived", data=train_df, kind="bar")
g.set_ylabels("Survival")
plt.show()

In [592]:
train_df["family_size"] = train_df["family_size"].astype('float64')

In [593]:
train_df = pd.get_dummies(data=train_df,columns=["family_size"])
train_df.head()

#### 2nd Choice

To convert this feature into a categorical type, the threshold can be chosen as 5, as seen in the figure.


In [594]:
#train_df['family_size'] = [1 if i<5 else 0 for i in train_df["Fsize"]]

In [595]:
 #train_df.head(20)

2nd choice configuration (process for converting family size feature into categorical data with threshold level 5), the distribution can be seen in the graph below.

In [596]:
#sns.countplot(x="family_size",data=train_df)
#plt.show()

In [597]:
#g = sns.factorplot(x="family_size",y="Survived",data= train_df,kind="bar")
#g.set_ylabels("Survival")
#plt.show()

Small families have more chance to survive than big families.

In [598]:
#train_df = pd.get_dummies(train_df,columns=["family_size"])
#train_df.head()

<a id='24'></a>
## Embarked

In [599]:
train_df["Embarked"].head()

In [600]:
sns.countplot(x="Embarked",data=train_df)
plt.show()

In [601]:
train_df = pd.get_dummies(data=train_df,columns=["Embarked"])
train_df.head()

<a id='25'></a>
## Ticket

In [602]:
# Reduce every ticket to its prefix: "." and "/" are removed and the first
# space-separated token is kept; purely numeric tickets become 'x'.
tickets = [
    'x' if ticket.isdigit()
    else ticket.replace(".", "").replace("/", "").strip().split(" ")[0]
    for ticket in train_df["Ticket"]
]
train_df["Ticket"] = tickets

In [603]:
train_df.head(10)

In [604]:
train_df = pd.get_dummies(data = train_df,columns=["Ticket"],prefix="T")
train_df.head(10)

<a id='26'></a>
## Pclass

In [605]:
sns.countplot(x="Pclass",data=train_df)
plt.show()

In [606]:
train_df["Pclass"] = train_df["Pclass"].astype("category")
train_df = pd.get_dummies(train_df,columns=["Pclass"])
train_df.head(10)

<a id='27'></a>
## Sex

In [607]:
train_df["Sex"] = train_df["Sex"].astype("category")
train_df = pd.get_dummies(data = train_df,columns=["Sex"])
train_df.head(10)

<a id='28'></a>
## Cabin

In [608]:
train_df.Cabin = train_df.Cabin.fillna('Unknown')

In [609]:
# Collapse Cabin to a two-level flag: the "Unknown" placeholder stays, anything else is "Known".
train_df["Cabin"] = [
    "Unknown" if cabin == "Unknown" else "Known" for cabin in train_df["Cabin"]
]

In [610]:
train_df["Cabin"].value_counts()

In [611]:
sns.countplot(x="Cabin",data=train_df)
plt.show()

In [612]:
# Survival rate for known vs unknown cabins.
# sns.factorplot was renamed to catplot in seaborn 0.9 and is no longer available
# in current seaborn releases.
g = sns.catplot(x="Cabin", y="Survived", data=train_df, kind="bar")
g.set_ylabels("Survival")
plt.show()

In [613]:
train_df["Cabin"] = train_df["Cabin"].astype("category")
train_df = pd.get_dummies(data = train_df,columns=["Cabin"],prefix = "C")
train_df.head(10)

<a id='29'></a>
## Drop Passenger ID

In [614]:
# PassengerId is only an identifier, so it is removed from the feature frame.
train_df = train_df.drop(columns=["PassengerId"])

In [615]:
train_df.head()

<a id='30'></a>
# Modeling

In [616]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

<a id='31'></a>
## Train-Test Split

In [617]:
train_df_len

In [618]:
# Rows from position train_df_len onwards are the Kaggle test set that was appended
# to train_df earlier. drop() returns a new frame here: dropping in place on a slice
# of train_df triggers SettingWithCopyWarning and operates on a temporary object.
test = train_df[train_df_len:].drop(labels=["Survived"], axis=1)

In [619]:
# The first train_df_len rows are the labelled training data.
train = train_df[:train_df_len]
features, target = train.drop(columns="Survived"), train["Survived"]
# Hold out 33% of the labelled rows to evaluate the models.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.33, random_state=42
)
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
    ("test", test),
):
    print(label, len(part))

<a id='32'></a>
## Simple Logistic Regression

In [620]:
# Baseline: untuned logistic regression, scored on the training and the held-out split.
logreg = LogisticRegression(solver='liblinear').fit(X_train, y_train)
acc_log_train = round(100 * logreg.score(X_train, y_train), 2)
acc_log_test = round(100 * logreg.score(X_test, y_test), 2)
for message, accuracy in (
    ("Training Accuracy: % {}", acc_log_train),
    ("Testing Accuracy: % {}", acc_log_test),
):
    print(message.format(accuracy))

<a id='33'></a>
## Hyperparameter Tuning -- Grid Search -- Cross Validation

I will compare 5 ML classifiers and evaluate the mean accuracy of each of them with stratified cross-validation.

- Decision Tree
- SVM
- Random Forest
- Logistic Regression
- KNN

In [621]:
# Estimators to tune and, in the same order, the hyperparameter grid of each one.
random_state = 42
classifier = [DecisionTreeClassifier(random_state = random_state),
             SVC(random_state = random_state),
             RandomForestClassifier(random_state = random_state),
             # liblinear supports both penalties searched in logreg_param_grid; the
             # default solver in current scikit-learn (lbfgs) cannot fit "l1", so
             # half of the grid would fail to fit.
             LogisticRegression(random_state = random_state, solver = "liblinear"),
             KNeighborsClassifier()]

dt_param_grid = {"min_samples_split" : range(10,500,20),
                "max_depth": range(1,20,2)}

svc_param_grid = {"kernel" : ["rbf"],
                 "gamma": [0.001, 0.01, 0.1, 1],
                 "C": [1,10,50,100,200,300,1000]}

rf_param_grid = {"max_features": [1,3,10],
                "min_samples_split":[2,3,10],
                "min_samples_leaf":[1,3,10],
                "bootstrap":[False],
                "n_estimators":[100,300],
                "criterion":["gini"]}

# C spans 1e-3 .. 1e3 in powers of ten.
logreg_param_grid = {"C":np.logspace(-3,3,7),
                    "penalty": ["l1","l2"]}

# n_neighbors: the odd values 1, 3, ..., 19.
knn_param_grid = {"n_neighbors": np.linspace(1,19,10, dtype = int).tolist(),
                 "weights": ["uniform","distance"],
                 "metric":["euclidean","manhattan"]}
# Must stay in the same order as `classifier`.
classifier_param = [dt_param_grid,
                   svc_param_grid,
                   rf_param_grid,
                   logreg_param_grid,
                   knn_param_grid]

In [622]:
# Tune every classifier with a 10-fold stratified grid search; keep the best
# cross-validated accuracy and the refitted best estimator of each.
cv_result = []
best_estimators = []
for estimator, param_grid in zip(classifier, classifier_param):
    clf = GridSearchCV(
        estimator,
        param_grid=param_grid,
        cv=StratifiedKFold(n_splits=10),
        scoring="accuracy",
        n_jobs=-1,
        verbose=1,
    )
    clf.fit(X_train, y_train)
    cv_result.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)
    print(clf.best_score_)

In [623]:
# Best cross-validated accuracy per model, listed in the same order as `classifier`.
cv_results = pd.DataFrame({"Cross Validation Means":cv_result, "ML Models":["DecisionTreeClassifier", "SVM","RandomForestClassifier",
             "LogisticRegression",
             "KNeighborsClassifier"]})

# x and y are passed by keyword: current seaborn releases no longer accept them positionally.
g = sns.barplot(x="Cross Validation Means", y="ML Models", data = cv_results)
g.set_xlabel("Mean Accuracy")
g.set_title("Cross Validation Scores")

<a id='34'></a>
## Ensemble Modeling

In [624]:
# Soft-voting ensemble of the tuned decision tree, random forest and logistic
# regression (positions 0, 2 and 3 of best_estimators).
voting_members = [
    ("dt", best_estimators[0]),
    ("rfc", best_estimators[2]),
    ("lr", best_estimators[3]),
]
votingC = VotingClassifier(estimators=voting_members, voting="soft", n_jobs=-1)
votingC = votingC.fit(X_train, y_train)
ensemble_predictions = votingC.predict(X_test)
print(accuracy_score(ensemble_predictions, y_test))

<a id='35'></a>
# Prediction and Submission


In [625]:
# Predict the Kaggle test rows with the ensemble and write the submission file.
predicted = votingC.predict(test)
test_survived = pd.Series(predicted, name="Survived").astype(int)
df_result = pd.concat([test_PassengerId, test_survived], axis=1)
df_result.to_csv("submission.csv", index=False)