# Abdullah Ergul's Titanic EDA

This is my first artificial intelligence project, so if you see a bug or find a better way, feel free to let me know.

1. [Load and Check Data](#1)
1. [Variable Description](#2)
   * [Univariate Variable Analysis](#3)
        * [Categorical Variable Analysis](#4)
        * [Numerical Variable Analysis](#5)
1. [Basic Data Analysis](#6)
1. [Outlier Detection](#7)
1. [Missing Value](#8)
   * [Find Missing Value](#9)
   * [Fill Missing Value](#10)
1. [Visualization](#11)
   * [Correlation Between Values](#12)
       * SibSp -- Survived
       * Parch -- Survived
       * Pclass -- Survived
       * Age -- Survived
       * Pclass -- Survived -- Age
       * Embarked -- Sex -- Pclass -- Survived
       * Embarked -- Sex -- Fare -- Survived
   * [Fill Missing: Age](#13)
1. [Feature Engineering](#14)
    * [Name -- Title](#15)
    * [Family Size](#16)
    * [Embarked](#17)
    * [Ticket](#18)
    * [Pclass](#19)
    * [Sex](#20)
    * [Drop Passenger ID and Cabin](#21)
1. [Modeling](#22)
    * [Train Test Split](#23)
    * [Simple Logistic Regression](#24)
    * [Hyperparameter Tuning -- Grid Search -- Cross Validation](#25)
    * [Ensemble Modeling](#26)
    * [Prediction and Submission](#27)
1. [Could I Survive on the Titanic ?](#28)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles ("seaborn-whitegrid" became
# "seaborn-v0_8-whitegrid") and newer releases no longer accept the old name,
# so use whichever spelling this installation provides and fall back to the
# default style if neither is available.
plt.style.use(next(
    (s for s in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid") if s in plt.style.available),
    "default",
))
import seaborn as sns
from collections import Counter

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

<a id= '1'> </a>
# Load and Check Data

In [None]:
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")
test_PassengerId = test_df["PassengerId"]

In [None]:
train_df.columns

In [None]:
train_df.head()

In [None]:
train_df.describe()

<a id= '2'> </a>
# Variable Description

Columns:
1. 'PassengerId'  : int
1. 'Survived'     : int
1. 'Pclass'       : int
1. 'Name'         : object
1. 'Sex'          : object
1. 'Age'          : float
1. 'SibSp'        : int
1. 'Parch'        : int
1. 'Ticket'       : object
1. 'Fare'         : float
1. 'Cabin'        : object
1. 'Embarked'     : object

In [None]:
train_df.info()

<a id= '3'> </a>
## Univariate Variable Analysis
   * Categorical Variable: Survived, Sex, Pclass, Embarked, Cabin, Name, Ticket, SibSp and Parch
   * Numerical Variable: PassengerId, Age, Fare

<a id= '4'> </a>
### Categorical Variable Analysis

In [None]:
def bar_plot(variable):
    """Plot and print the category frequencies of one column of ``train_df``.

    Args:
        variable: Name of the column in the module-level ``train_df`` whose
            value counts are drawn as a bar chart and printed.
    """
    var = train_df[variable]
    varValue = var.value_counts()

    plt.figure(figsize=(9, 3))
    plt.bar(varValue.index, varValue)
    plt.xticks(varValue.index, varValue.index.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    # Render the figure now, as plot_hist does, so that each chart is shown
    # together with its own printed counts instead of all figures being
    # left open until the end of the calling loop.
    plt.show()
    print("{}: \n {}".format(variable, varValue))

In [None]:
category1 = ["Survived", "Sex", "Pclass", "Embarked", "SibSp", "Parch"]
for c in category1:
    bar_plot(c)

In [None]:
category2 = ["Cabin", "Name", "Ticket"]
for c in category2:
    print("{} \n".format(train_df[c].value_counts()))

<a id= '5'> </a>
### Numerical Variable Analysis

In [None]:
def plot_hist(variable):
    """Show a 50-bin histogram of one numeric column of ``train_df``.

    Args:
        variable: Name of the numeric column in the module-level ``train_df``.
    """
    # Drop missing values explicitly: "Age" still contains NaN at this point,
    # and how plt.hist treats NaN depends on the matplotlib/numpy versions
    # installed (some combinations fail on a non-finite data range).
    values = train_df[variable].dropna()

    plt.figure(figsize=(9, 3))
    plt.hist(values, bins=50)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.title("{} distrubution with hist".format(variable))
    plt.show()

In [None]:
numericVar = ["Fare", "Age", "PassengerId"]
for n in numericVar:
    plot_hist(n)

<a id= '6'> </a>
# Basic Data Analysis

In [None]:
# Pclass and Survived
train_df[["Pclass","Survived"]].groupby(["Pclass"], as_index=False).mean().sort_values(by="Survived", ascending=False)

In [None]:
# Sex and Survived
train_df[["Sex","Survived"]].groupby(["Sex"], as_index=False).mean().sort_values(by="Survived", ascending=False)

In [None]:
# SibSp and Survived
train_df[["SibSp","Survived"]].groupby(["SibSp"], as_index=False).mean().sort_values(by="Survived", ascending=False)

In [None]:
# Parch and Survived
train_df[["Parch","Survived"]].groupby(["Parch"], as_index=False).mean().sort_values(by="Survived", ascending=False)

<a id= '7'> </a>
# Outlier Detection

In [None]:
def detect_outliers(df, features, n=2):
    """Return the index labels of rows that are outliers in more than ``n`` features.

    A value is an outlier for a feature when it lies more than 1.5 * IQR
    below the first quartile or above the third quartile of that feature.

    Args:
        df: DataFrame to inspect.
        features: Names of the numeric columns to check.
        n: A row is reported only when it is an outlier in strictly more
            than ``n`` of the given features (default 2, the original
            hard-coded threshold).

    Returns:
        List of index labels, in order of first detection.
    """
    hits = Counter()

    for c in features:
        col = df[c]
        # nanpercentile instead of percentile: with missing values present
        # (e.g. "Age"), np.percentile returns NaN for both quartiles, every
        # comparison below is then False and the feature is silently ignored.
        q1, q3 = np.nanpercentile(col, [25, 75])
        outlier_step = (q3 - q1) * 1.5
        # NaN entries compare False on both sides, so they are never flagged.
        is_outlier = (col < q1 - outlier_step) | (col > q3 + outlier_step)
        hits.update(df[is_outlier].index)

    return [idx for idx, count in hits.items() if count > n]

In [None]:
train_df.loc[detect_outliers(train_df, ["Age", "SibSp", "Parch", "Fare"])]

In [None]:
# drop outliers
train_df = train_df.drop(detect_outliers(train_df, ["Age", "SibSp", "Parch", "Fare"]), axis=0).reset_index(drop=True)

<a id= '8'> </a>
# Missing Value
* Find Missing Value
* Fill Missing Value

In [None]:
train_df_len = len(train_df)
train_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

<a id= '9'> </a>
## Find Missing Value

In [None]:
train_df.columns[train_df.isnull().any()]

In [None]:
train_df.isnull().sum()

<a id= '10'> </a>
## Fill Missing Value
* Embarked -> 2 missing value
* Fare -> 1 missing value

In [None]:
# Embarked
train_df[train_df["Embarked"].isnull()]

In [None]:
train_df.boxplot(column="Fare", by="Embarked")

In [None]:
train_df["Embarked"] = train_df["Embarked"].fillna("C")
train_df[train_df["Embarked"].isnull()]

In [None]:
# Fare
train_df[train_df["Fare"].isnull()]

In [None]:
np.mean(train_df[train_df['Pclass']==3]['Fare'])

In [None]:
train_df["Fare"] = train_df["Fare"].fillna(np.mean(train_df[train_df['Pclass']==3]['Fare']))

In [None]:
train_df[train_df["Fare"].isnull()]

<a id= '11'> </a>
# Visualization

<a id= '12'> </a>
## Correlation Between Values

In [None]:
list1 = ["SibSp", "Parch", "Age", "Fare", "Survived"]
sns.heatmap(train_df[list1].corr(), annot=True, fmt=".2f")
plt.show()

### SibSp -- Survived

In [None]:
g = sns.catplot(x="SibSp", y="Survived",data=train_df, kind="bar", height=6)
g.set_ylabels("Survived Prob")
plt.show()

* Passengers with many siblings/spouses (SibSp) have less chance to survive.
* If SibSp == 0, 1 or 2, the passenger has more chance to survive.
* We can consider a new feature describing these categories.

### Parch -- Survived

In [None]:
g = sns.catplot(x="Parch", y="Survived",data=train_df, kind="bar", height=6)
g.set_ylabels("Survived Prob")
plt.show()

* SibSp and Parch can be used for new feature extraction with a threshold (th) of 3.
* Small families have more chance to survive.
* The survival estimate for passengers with Parch = 3 has a large standard deviation.

### Pclass -- Survived

In [None]:
g = sns.catplot(x="Pclass", y="Survived",data=train_df, kind="bar", height=6)
g.set_ylabels("Survived Prob")
plt.show()

### Age -- Survived

In [None]:
# Age distribution per survival outcome, one facet per "Survived" value.
# sns.distplot is deprecated; histplot with a density-normalised histogram
# and a KDE overlay is the closest modern equivalent.
g = sns.FacetGrid(train_df, col="Survived")
g.map(sns.histplot, "Age", bins=25, stat="density", kde=True)
plt.show()

* Age <= 10 has a high survival rate
* Oldest passengers (>80) survived
* Large number of 20 years old didn't survive
* Most passengers are in 15-35 age range
* Use age feature in training
* Use the age distribution to fill the missing values of age

### Pclass -- Survived -- Age

In [None]:
g = sns.FacetGrid(train_df, col="Survived", row="Pclass")
g.map(plt.hist, "Age", bins=25)
g.add_legend()
plt.show()

* Pclass is important feature for model training

### Embarked -- Sex -- Pclass -- Survived

In [None]:
# Survival rate by Pclass, split by Sex, with one facet row per port.
# Each facet only sees its own subset of the data, so the category orders
# are fixed explicitly; otherwise a facet lacking one category could place
# its points/colours differently from the other facets.
pclass_order = sorted(train_df["Pclass"].unique())
sex_order = sorted(train_df["Sex"].unique())
g = sns.FacetGrid(train_df, row="Embarked")
g.map(sns.pointplot, "Pclass", "Survived", "Sex", order=pclass_order, hue_order=sex_order)
g.add_legend()
plt.show()

* Female passengers have much higher survival rate than male
* Embarked and Sex will be used in training

### Embarked -- Sex -- Fare -- Survived

In [None]:
# Mean Fare by Sex for every (Embarked, Survived) combination.
# The bar order is fixed explicitly so every facet shows the sexes in the
# same positions even if one facet is missing a category.
sex_order = sorted(train_df["Sex"].unique())
g = sns.FacetGrid(train_df, row="Embarked", col="Survived")
g.map(sns.barplot, "Sex", "Fare", order=sex_order)
# No hue variable is mapped, so there is nothing for a legend to describe.
plt.show()

* Passengers who paid a higher fare were more likely to survive
* Fare can be used as categorical for model training

<a id= '13'> </a>
## Fill Missing: Age

In [None]:
train_df[train_df["Age"].isnull()]

In [None]:
sns.catplot(x="Sex", y="Age", data=train_df, kind="box")
plt.show()

* Sex is not informative for age prediction

In [None]:
sns.catplot(x="Sex", y="Age", hue="Pclass", data=train_df, kind="box")
plt.show()

* 1st class passengers are older than 2nd class passengers, and 2nd class passengers are older than 3rd class passengers

In [None]:
sns.catplot(x="Parch", y="Age", data=train_df, kind="box")
sns.catplot(x="SibSp", y="Age", data=train_df, kind="box")
plt.show()

In [None]:
train_df["Sex"] = [1 if i=="male" else 0 for i in train_df["Sex"]]
sns.heatmap(train_df[["Age","Sex","SibSp","Parch","Pclass"]].corr(), annot=True)
plt.show()

* Age is not correlated with Sex, but it is correlated with Parch, SibSp and Pclass

In [None]:
# Impute every missing Age with the median age of the passengers that share
# the same SibSp, Parch and Pclass; when that group has no known age, fall
# back to the overall median age.
index_nan_age = train_df[train_df["Age"].isnull()].index.tolist()
for i in index_nan_age:
    row = train_df.loc[i]
    same_group = (
        (train_df["SibSp"] == row["SibSp"])
        & (train_df["Parch"] == row["Parch"])
        & (train_df["Pclass"] == row["Pclass"])
    )
    age_pred = train_df.loc[same_group, "Age"].median()
    if np.isnan(age_pred):
        age_pred = train_df["Age"].median()
    # Assign through .loc on the frame itself: the chained form
    # train_df["Age"].iloc[i] = ... raises SettingWithCopyWarning and, with
    # pandas Copy-on-Write enabled, never reaches train_df at all.
    train_df.loc[i, "Age"] = age_pred

In [None]:
train_df[train_df["Age"].isnull()]

<a id= '14'> </a>
# Feature Engineering

<a id= '15'> </a>
## Name -- Title

In [None]:
train_df["Name"].head(10)

In [None]:
s = "Braund, Mr. Owen Harris"
s.split(".")[0].split(",")[-1].strip()

In [None]:
name = train_df["Name"]
train_df["Title"] = [i.split(".")[0].split(",")[-1].strip() for i in name]

In [None]:
train_df["Title"].head(10)

In [None]:
sns.countplot(x="Title", data=train_df)
plt.xticks(rotation=60)
plt.show()

In [None]:
#convert to categorical
# Title codes: 0 = Master, 1 = Miss/Mrs, 2 = Mr, 3 = any other title.
# "Mme" (French for "Mrs") is grouped with Miss/Mrs, consistent with "Mlle";
# previously it fell through to "other". Rare titles (Capt, Col, Dr, Rev, ...)
# need no separate replacement step because every unlisted title maps to 3.
title_codes = {
    "Master": 0,
    "Miss": 1,
    "Ms": 1,
    "Mlle": 1,
    "Mme": 1,
    "Mrs": 1,
    "Mr": 2,
}
train_df["Title"] = [title_codes.get(t, 3) for t in train_df["Title"]]

In [None]:
sns.countplot(x="Title", data=train_df)
plt.xticks(rotation=60)
plt.show()

In [None]:
g = sns.catplot(x="Title", y="Survived", data=train_df, kind="bar")
g.set_xticklabels(["Master","Miss-Mrs","Mr","Other"])
g.set_ylabels("Survival Prob")
plt.show()

In [None]:
train_df.drop(labels=["Name"],axis=1, inplace=True)

In [None]:
train_df.head()

In [None]:
train_df = pd.get_dummies(train_df, columns=["Title"])
train_df.head()

<a id= '16'> </a>
## Family Size

In [None]:
train_df["Fsize"] = train_df["SibSp"] + train_df["Parch"] + 1

In [None]:
g = sns.catplot(x="Fsize", y="Survived", data=train_df, kind="bar")
g.set_ylabels("Survival")
plt.show()

In [None]:
train_df["family_size"] = [1 if i<5 else 0 for i in train_df["Fsize"]]

In [None]:
sns.countplot(x="family_size", data=train_df)
plt.show()

In [None]:
g = sns.catplot(x="family_size", y="Survived", data=train_df, kind="bar")
g.set_ylabels("Survival")
plt.show()

* Small families have more chance to survive than large families

In [None]:
#convert to categorical
train_df = pd.get_dummies(train_df, columns=["family_size"])
train_df.head()

<a id= '17'> </a>
## Embarked

In [None]:
sns.countplot(x="Embarked", data=train_df)
plt.show()

In [None]:
train_df = pd.get_dummies(train_df, columns=["Embarked"])

In [None]:
train_df.head()

<a id= '18'> </a>
## Ticket

In [None]:
train_df["Ticket"].head(20)

In [None]:
# Reduce every ticket to its prefix: remove "." and "/", strip surrounding
# whitespace and keep the first space-separated token. Tickets that consist
# of digits only carry no prefix and are encoded as "x".
tickets = [
    "x" if t.isdigit() else t.replace(".", "").replace("/", "").strip().split(" ")[0]
    for t in train_df.Ticket
]

train_df["Ticket"] = tickets

In [None]:
train_df["Ticket"].head(20)

In [None]:
#convert to categorical
train_df = pd.get_dummies(train_df, columns=["Ticket"], prefix="T")
train_df.head(10)

<a id= '19'> </a>
## Pclass

In [None]:
sns.countplot(x="Pclass", data=train_df)
plt.show()

In [None]:
#convert to categorical
train_df["Pclass"] = train_df["Pclass"].astype("category")
train_df = pd.get_dummies(train_df, columns=["Pclass"])
train_df.head()

<a id= '20'> </a>
## Sex

In [None]:
train_df["Sex"] = train_df["Sex"].astype("category")
train_df = pd.get_dummies(train_df, columns=["Sex"])
train_df.head()

<a id= '21'> </a>
## Drop Passenger ID and Cabin

In [None]:
train_df.drop(labels=["PassengerId","Cabin"], axis=1, inplace=True)
train_df.columns

<a id= '22'> </a>
# Modeling

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

<a id= '23'> </a>
## Train Test Split

In [None]:
train_df_len

In [None]:
# Rows from position train_df_len onwards are the rows of test_df that were
# appended by the earlier concat. Dropping "Survived" with a non-inplace call
# returns a new frame, which avoids mutating a slice of train_df in place
# (the source of SettingWithCopyWarning in the original form).
test = train_df[train_df_len:].drop(labels=["Survived"], axis=1)
test.head()

In [None]:
train = train_df[:train_df_len]
x_train = train.drop(labels="Survived", axis=1)
y_train = train["Survived"]
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.33, random_state=42)
print("x_train: ",len(x_train))
print("x_test: ",len(x_test))
print("y_train: ",len(y_train))
print("y_test: ",len(y_test))
print("test: ",len(test))

<a id= '24'> </a>
## Simple Logistic Regression

In [None]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
acc_log_train = round(logreg.score(x_train, y_train)*100,2)
acc_log_test = round(logreg.score(x_test, y_test)*100,2)
print("Training Accuracy : % {}".format(acc_log_train))
print("Testting Accuracy : % {}".format(acc_log_test))

<a id= '25'> </a>
## Hyperparameter Tuning -- Grid Search -- Cross Validation
We will compare 5 ML classifiers and evaluate the mean accuracy of each of them by stratified cross-validation.

* Decision Tree
* SVM
* Random Forest
* KNN
* Logistic Regression

In [None]:
random_state = 42

# Candidate models. The order here must match the order of classifier_param
# below, because the two lists are paired by position in the grid search.
classifier = [
    DecisionTreeClassifier(random_state=random_state),
    SVC(random_state=random_state),
    RandomForestClassifier(random_state=random_state),
    # liblinear supports both the "l1" and "l2" penalties searched below.
    # The default lbfgs solver rejects "l1", which would make half of the
    # logistic-regression grid fail to fit.
    LogisticRegression(random_state=random_state, solver="liblinear"),
    KNeighborsClassifier(),
]

dt_param_grid = {
    "min_samples_split": range(10, 500, 20),
    "max_depth": range(1, 20, 2),
}

svc_param_grid = {
    "kernel": ["rbf"],
    "gamma": [0.001, 0.01, 0.1, 1],
    "C": [1, 10, 50, 100, 200, 300, 1000],
}

rf_param_grid = {
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [False],
    "n_estimators": [100, 300],
    "criterion": ["gini"],
}

logreg_param_grid = {
    "C": np.logspace(-3, 3, 7),
    "penalty": ["l1", "l2"],
}

knn_param_grid = {
    "n_neighbors": np.linspace(1, 19, 10, dtype=int).tolist(),
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}

# Hyperparameter grids, in the same order as the classifier list.
classifier_param = [
    dt_param_grid,
    svc_param_grid,
    rf_param_grid,
    logreg_param_grid,
    knn_param_grid,
]

In [None]:
# Run a 10-fold stratified grid search for each model/grid pair and keep the
# best cross-validated accuracy and the corresponding fitted estimator.
cv_result = []
best_estimators = []
for estimator, grid in zip(classifier, classifier_param):
    clf = GridSearchCV(
        estimator,
        param_grid=grid,
        cv=StratifiedKFold(n_splits=10),
        scoring="accuracy",
        n_jobs=-1,
        verbose=1,
    )
    clf.fit(x_train, y_train)
    cv_result.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)
    print(clf.best_score_)

In [None]:
cv_results = pd.DataFrame({"Cross Validation Means":cv_result, "ML Models":["DecisionTreeClassifier", "SVM","RandomForestClassifier",
             "LogisticRegression",
             "KNeighborsClassifier"]})

g = sns.barplot(x="Cross Validation Means", y="ML Models", data = cv_results)
g.set_xlabel("Mean Accuracy")
g.set_title("Cross Validation Scores")
plt.show()

<a id= '26'> </a>
## Ensemble Modeling

In [None]:
# Soft-voting ensemble of the tuned decision tree, random forest and logistic
# regression (positions 0, 2 and 3 of best_estimators).
votingC = VotingClassifier(
    estimators=[
        ("dt", best_estimators[0]),
        ("rfc", best_estimators[2]),
        ("lr", best_estimators[3]),
    ],
    voting="soft",
    n_jobs=-1,
)
votingC = votingC.fit(x_train, y_train)
# accuracy_score takes (y_true, y_pred). Accuracy is the same either way
# round, but the documented order keeps this line correct if the metric is
# ever swapped for an asymmetric one.
print(accuracy_score(y_test, votingC.predict(x_test)))

<a id= '27'> </a>
## Prediction and Submission

In [None]:
test_survived = pd.Series(votingC.predict(test), name = "Survived").astype(int)
results = pd.concat([test_PassengerId, test_survived],axis = 1)
results.to_csv("titanic.csv", index = False)

<a id= '28'> </a>
# Could I Survive on the Titanic ?

In [None]:
# Feature values for the four family members, in the order given by
# Ergul_Family_Name below. Only features with at least one non-zero value
# are listed; every other model column is filled with 0 further down.
ergul_features = {
    'Age': [55, 55, 23, 21],
    'SibSp': [1, 1, 1, 1],
    'Parch': [2, 2, 2, 2],
    'Fare': [28, 0, 0, 0],
    'Title_1': [0, 1, 1, 0],
    'Title_2': [1, 0, 0, 1],
    'Fsize': [4, 4, 4, 4],
    'family_size_1': [1, 1, 1, 1],
    'Embarked_S': [1, 1, 1, 1],
    'T_STONO': [1, 1, 1, 1],
    'Pclass_3': [1, 1, 1, 1],
    'Sex_0': [0, 1, 1, 0],
    'Sex_1': [1, 0, 0, 1],
}

# Fail loudly if a feature named above is not one of the training columns,
# rather than letting the reindex below drop it silently.
unknown_features = [c for c in ergul_features if c not in x_train.columns]
if unknown_features:
    raise ValueError("Features not present in the training data: {}".format(unknown_features))

# Align to the exact columns (and column order) the model was fitted on, so
# this frame stays valid when the set of dummy columns changes with the data.
Ergul_Family = pd.DataFrame(ergul_features).reindex(columns=x_train.columns, fill_value=0)

Ergul_Family_Name = ["Adem", "Munevver", "Zulal", "Abdullah"]
Ergul_Family_Name = pd.Series(Ergul_Family_Name, name="Name")

In [None]:
ergul_test_survived = pd.Series(votingC.predict(Ergul_Family), name = "Survived").astype(int)
ergul_results = pd.concat([Ergul_Family_Name, ergul_test_survived],axis = 1)
ergul_results

No, I can't :(