# INTRODUCTION
#### The sinking of the ocean liner RMS Titanic on April 15, 1912 was one of the most dramatic events of the twentieth century. In a mere four hours after striking an iceberg, the largest passenger ship yet built sank while on its maiden voyage, claiming the lives of over 1,500 persons.



<font color='blue'/>

Content:
    
1. [Load and Check Data](#1)    
1. [Variable Description](#2)
 *  [Univariate Description ](#3)
    *         [Categorical Description ](#4)
    *         [Numerical Description ](#5)

1. [Basic Data Analysis](#6)
1. [Outlier Detection](#7)
1. [Missing Value](#8)
  *   [Find Missing Value](#9)
   *  [Fill Missing Value](#10)
1. [Visualization](#11)   
    * [Correlation Between SibSp-Parch-Fare-Age-Survived](#12)
    * [SibSp - Survived](#13)
    * [Parch - Survived](#14)
    * [Pclass - Survived](#15)
    * [Age - Survived](#16)
    * [Pclass - Survived - Age](#17)
    * [Embarked - Sex - Pclass - Survived](#18)
    * [Embarked - Sex - Fare - Survived](#19)
    * [Fill Missing Value : Age Feature](#20)
1. [Feature Engineering](#21)
    * [Name -- Title](#22)

    
    
    
   
    

    

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names, so prefer the new name when this
# matplotlib provides it and fall back to the legacy name otherwise.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)

import seaborn as sns

import warnings
warnings.filterwarnings('ignore') 

from collections import Counter


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the directory tree under /kaggle/input and print the full path of
# every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="1" ></a><br>
# Load and Check Data

In [None]:
# Read the Titanic train and test CSV files from the Kaggle input directory.
train_df=pd.read_csv("/kaggle/input/titanic/train.csv")
test_df=pd.read_csv("/kaggle/input/titanic/test.csv")
# Keep the PassengerId column of the test set in its own variable.
# NOTE(review): this variable name contains a non-ASCII dotted capital "İ"
# (U+0130); any later reference has to spell it exactly the same way.
test_Passengerİd=test_df["PassengerId"]

In [None]:
# Show the column labels of the training frame.
train_df.columns

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Show descriptive statistics for the training frame.
train_df.describe()

<a id="2" ></a><br>
# Variable Description


1. PassengerId : unique id number to each passenger
1. Survived : passenger survive(1) or died(0)
1. Pclass : passenger class
1. Name : name of passenger
1. Sex : gender of passenger
1. Age : age of passenger
1. SibSp : number of siblings/spouses
1. Parch : number of parent/children
1. Ticket : ticket number
1. Fare : amount of money spent on ticket
1. Cabin : cabin category
1. Embarked : port where passenger embarked (C = Cherbourg , Q = Queenstown , S = Southampton)


In [None]:
# Print a summary of the training frame (see the dtype breakdown noted below).
train_df.info()

* **float64(2): Fare and Age**
* **int64(5): Pclass , Parch, SibSp , PassengerId and Survived**
* **object(5): Ticket , Cabin, Name , Embarked and Sex**

<a id="3" ></a><br>
#  Univariate Description Analysis
*         Categorical Description : Pclass , Parch , Survived , Embarked , Sex , Ticket , Cabin , Name , SibSp 
*         Numerical Description : Age , PassengerId , Fare

<a id="4" ></a><br>
## Categorical Description

In [None]:
def bar_plot(variable):
    """
        Draw a bar chart of how often each value of a train_df column occurs
        and print the same counts.

        input: variable -- column name in train_df, ex: "Sex"
        output: bar plot & printed value counts
    """
    # frequency of every distinct value in the chosen column
    counts = train_df[variable].value_counts()
    categories = counts.index

    # one bar per category, with the ticks labelled by the category values
    plt.figure(figsize=(9, 3))
    plt.bar(categories, counts)
    plt.xticks(categories, categories.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()

    print("{}:\n {}".format(variable, counts))
    

In [None]:
# Draw a bar plot (and print the value counts) for each of these columns.
category=["Pclass" , "Parch" , "Survived" , "Embarked" , "Sex"   , "SibSp"]
for c in category:
    bar_plot(c)

<a id="5" ></a><br>
## Numerical Description  

In [None]:
def hist_plot(variable):
    """
        Show a 50-bin histogram of one train_df column.

        input: variable -- column name in train_df, ex: "Fare"
        output: histogram plot
    """
    plt.figure(figsize=(9, 3))

    # histogram of the selected column
    values = train_df[variable]
    plt.hist(values, bins=50)

    # axis labels and title
    heading = "{} distribution with hist ".format(variable)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.title(heading)
    plt.show()

In [None]:
# Draw a histogram for each of these columns.
numericvar=["Fare","Age"]
for i in numericvar:
    hist_plot(i)

<a id="6" ></a><br>
# Basic Data Analysis
* Pclass - Survived
* Sex - Survived
* SibSp - Survived
* Parch - Survived


In [None]:
# Mean of Survived for each Pclass value, highest first.
(
    train_df[["Pclass", "Survived"]]
    .groupby(["Pclass"], as_index=False)
    .mean()
    .sort_values(by="Survived", ascending=False)
)

In [None]:
# Mean of Survived for each Sex value, highest first.
(
    train_df[["Sex", "Survived"]]
    .groupby(["Sex"], as_index=False)
    .mean()
    .sort_values(by="Survived", ascending=False)
)

In [None]:
# Mean of Survived for each SibSp value, highest first.
(
    train_df[["SibSp", "Survived"]]
    .groupby(["SibSp"], as_index=False)
    .mean()
    .sort_values(by="Survived", ascending=False)
)

In [None]:
# Mean of Survived for each Parch value, highest first.
(
    train_df[["Parch", "Survived"]]
    .groupby(["Parch"], as_index=False)
    .mean()
    .sort_values(by="Survived", ascending=False)
)

<a id="7" ></a><br>
# Outlier Detection

In [None]:
def detect_outliers(df,features):
    """
    Return the index labels of rows that are outliers in several features.

    For every column named in ``features`` a value counts as an outlier when
    it lies more than 1.5 * IQR below the first quartile or above the third
    quartile. A row is reported only when it is an outlier in more than two
    of the given features.

    Quartiles are computed with ``np.nanpercentile`` so missing values are
    ignored. With ``np.percentile`` a single NaN makes both quartiles NaN,
    every comparison False, and the column silently stops contributing.

    input: df -- pandas DataFrame
           features -- iterable of numeric column names in df
    output: list of index labels of df, in order of first detection
    """
    outlier_counts = Counter()

    for c in features:
        # 1st and 3rd quartile, ignoring missing values
        Q1, Q3 = np.nanpercentile(df[c], [25, 75])
        # Outlier step is 1.5 times the interquartile range
        outlier_step = (Q3 - Q1) * 1.5
        # rows falling outside the fences for this feature
        is_outlier = (df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)
        # count one hit per flagged row
        outlier_counts.update(df.index[is_outlier])

    # keep only rows flagged in more than two features
    return [idx for idx, hits in outlier_counts.items() if hits > 2]

In [None]:
# Show the rows that detect_outliers flags for these four features.
train_df.loc[detect_outliers(train_df,["Age","SibSp","Parch","Fare"])]

In [None]:
# Remove the rows flagged by detect_outliers, then renumber the index from 0.
train_df = (
    train_df
    .drop(index=detect_outliers(train_df, ["Age", "SibSp", "Parch", "Fare"]))
    .reset_index(drop=True)
)

<a id="8" ></a><br>
# Missing Value
  *  Find Missing Value
  *  Fill Missing Value

In [None]:
# Record the number of training rows before the test rows are appended.
train_df_len = train_df.shape[0]
# Stack train and test rows into one frame with a fresh 0..n-1 index.
train_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

<a id="9" ></a><br>

### Find Missing Value


In [None]:
# Names of the columns that contain at least one missing value.
train_df.columns[train_df.isnull().any()]

In [None]:
# Number of missing values in each column.
train_df.isnull().sum()

<a id="10" ></a><br>

### Fill Missing Value
* Embarked has 2 missing values
* Fare has only one missing value

In [None]:
# Rows where Embarked is missing.
train_df[train_df["Embarked"].isnull()]

In [None]:
# Box plot of Fare grouped by Embarked.
train_df.boxplot(column="Fare",by="Embarked")
plt.show()

In [None]:
# Replace missing Embarked values with "C".
train_df["Embarked"]=train_df["Embarked"].fillna("C")
# List any rows still missing Embarked after the fill.
train_df[train_df["Embarked"].isnull()]

In [None]:
# Rows where Fare is missing.
train_df[train_df["Fare"].isnull()]

In [None]:
# The missing value is in Fare, so Fare (not Pclass) is the column to fill.
# The replacement is the mean Fare of third-class passengers.
train_df["Fare"]=train_df["Fare"].fillna(train_df.loc[train_df["Pclass"]==3,"Fare"].mean())

In [None]:
# List any rows that still have a missing Fare (the column being filled in this section).
train_df[train_df["Fare"].isnull()]

<a id="11" ></a><br>
# Visualization

<a id="12" ></a><br>
## Correlation Between SibSp-Parch-Fare-Age-Survived

In [None]:
# Annotated heatmap of the pairwise correlations between these five columns,
# with the values shown to two decimals.
sns.heatmap(train_df[['SibSp','Parch','Fare','Age','Survived']].corr(),annot=True,fmt='.2f')
plt.show()

<a id="13" ></a><br>
## SibSp - Survived

In [None]:
# Bar plot of mean Survived for each SibSp value.
# catplot is the current name of factorplot, and `height` replaces `size`;
# the old names are no longer accepted by recent seaborn releases.
g=sns.catplot(x='SibSp',y='Survived',data=train_df,kind='bar',height=7)
g.set_ylabels('Survived Probability')
plt.show()

<a id="14" ></a><br>
## Parch - Survived

In [None]:
# Bar plot of mean Survived for each Parch value.
# catplot is the current name of factorplot, and `height` replaces `size`;
# the old names are no longer accepted by recent seaborn releases.
g=sns.catplot(x='Parch',y='Survived',data=train_df,kind='bar',height=7)
g.set_ylabels('Survived Probability')
plt.show()

<a id="15" ></a><br>
## Pclass - Survived

In [None]:
# Bar plot of mean Survived for each Pclass value.
# catplot is the current name of factorplot, and `height` replaces `size`;
# the old names are no longer accepted by recent seaborn releases.
g=sns.catplot(x='Pclass',y='Survived',data=train_df,kind='bar',height=7)
g.set_ylabels('Survived Probability')
plt.show()

<a id="16" ></a><br>
## Age - Survived

In [None]:
# Age distribution (density histogram plus KDE curve), one panel per Survived value.
# histplot replaces the deprecated distplot; kde=True, stat='density' and
# cut=3 reproduce what distplot drew by default.
g=sns.FacetGrid(train_df,col='Survived')
g.map(sns.histplot,'Age',bins=25,kde=True,stat='density',kde_kws=dict(cut=3))
plt.show()

<a id="17" ></a><br>
## Pclass - Survived - Age

In [None]:
# Age histograms in a grid: one row per Pclass, one column per Survived value.
# `height` is the current name of the former `size` argument.
g=sns.FacetGrid(train_df,col='Survived',row='Pclass',height=5)
g.map(sns.histplot,'Age',bins=25)
g.add_legend()
plt.show()

<a id="18" ></a><br>
## Embarked - Sex - Pclass - Survived

In [None]:
# Point plot of Survived against Pclass, split by Sex, one panel per Embarked value.
# `height` is the current name of the former `size` argument.
g=sns.FacetGrid(train_df,col='Embarked',height=4)
# Fix the category order from the whole frame. Otherwise each facet orders
# Pclass and Sex by what its own subset contains, and the shared axis and
# legend can end up mislabelled.
g.map(sns.pointplot,'Pclass','Survived','Sex',
      order=sorted(train_df['Pclass'].dropna().unique()),
      hue_order=sorted(train_df['Sex'].dropna().unique()))
g.add_legend()
plt.show()

<a id="19" ></a><br>
## Embarked - Sex - Fare - Survived

In [None]:
# Bar plot of Fare by Sex in a grid: one row per Embarked, one column per Survived value.
# `height` is the current name of the former `size` argument.
g=sns.FacetGrid(train_df,row='Embarked',col='Survived',height=4)
# Fix the bar order from the whole frame so every facet puts the same Sex
# value at the same x position.
g.map(sns.barplot,'Sex','Fare',order=sorted(train_df['Sex'].dropna().unique()))
g.add_legend()
plt.show()

<a id="20" ></a><br>
## Fill Missing Value : Age Feature

In [None]:
# Box plot of Age by Sex, split by Pclass.
# catplot is the current name of factorplot, and `height` replaces `size`;
# the old names are no longer accepted by recent seaborn releases.
sns.catplot(x='Sex',y='Age',hue='Pclass',data=train_df,kind='box',height=7)
plt.show()

In [None]:
# Box plots of Age by Parch and of Age by SibSp.
# catplot is the current name of factorplot, and `height` replaces `size`;
# the old names are no longer accepted by recent seaborn releases.
sns.catplot(x='Parch',y='Age',data=train_df,kind='box',height=7)
sns.catplot(x='SibSp',y='Age',data=train_df,kind='box',height=7)
plt.show()

In [None]:
# Encode Sex as an integer: 1 for "male", 0 for every other value.
train_df["Sex"] = (train_df["Sex"] == "male").astype(int)

In [None]:
# Annotated heatmap of the pairwise correlations between Age, Sex, SibSp,
# Parch and Pclass.
sns.heatmap(train_df[["Age","Sex","SibSp","Parch","Pclass"]].corr(),annot=True)
plt.show()

In [None]:
# Fill each missing Age with the median Age of the rows that share the same
# SibSp, Parch and Pclass. If that group has no known Age, fall back to the
# median Age of the whole frame.
index_nan_age = list(train_df["Age"][train_df["Age"].isnull()].index)
for i in index_nan_age:
    # index_nan_age holds index labels, so address the row with .loc
    same_group = ((train_df["SibSp"] == train_df.loc[i, "SibSp"])
                  & (train_df["Parch"] == train_df.loc[i, "Parch"])
                  & (train_df["Pclass"] == train_df.loc[i, "Pclass"]))
    age_pred = train_df.loc[same_group, "Age"].median()
    # Recomputed on every pass on purpose: earlier fills feed into later medians.
    age_med = train_df["Age"].median()
    # Write through DataFrame.loc. The chained form
    # train_df["Age"].iloc[i] = ... assigns to an intermediate Series and is
    # not guaranteed to update train_df.
    train_df.loc[i, "Age"] = age_med if np.isnan(age_pred) else age_pred

In [None]:
# List any rows still missing Age.
train_df[train_df["Age"].isnull()]

<a id="21"></a><br>
# Feature Engineering

<a id="22"></a><br>
## Name -- Title 

In [None]:
# Preview the first ten Name values.
train_df["Name"].head(10)

In [None]:
# Build the Title column from each name: take the part before the first
# period, keep what follows its last comma, and trim the whitespace.
name = train_df["Name"]
train_df["Title"] = [
    full_name.partition(".")[0].rpartition(",")[2].strip()
    for full_name in name
]

In [None]:
# Preview the first ten extracted Title values.
train_df["Title"].head(10)

In [None]:
# Count of rows per Title value; tick labels are rotated 60 degrees.
sns.countplot(x="Title",data=train_df)
plt.xticks(rotation=60)
plt.show()

In [None]:
# convert to categorical
train_df["Title"] = train_df["Title"].replace(["Lady","the Countess","Capt","Col","Don","Dr","Major","Rev","Sir","Jonkheer","Dona"],"other")
train_df["Title"] = [0 if i == "Master" else 1 if i == "Miss" or i == "Ms" or i == "Mlle" or i == "Mrs" else 2 if i == "Mr" else 3 for i in train_df["Title"]]
train_df["Title"].head(20)

In [None]:
# Count of rows per Title value; tick labels are rotated 60 degrees.
sns.countplot(x="Title",data=train_df)
plt.xticks(rotation=60)
plt.show()

In [None]:
# Bar plot of mean Survived for each Title value.
# catplot is the current name of factorplot; the old name is no longer
# accepted by recent seaborn releases.
g=sns.catplot(x="Title",y="Survived",data=train_df,kind="bar")
g.set_xticklabels(["Master","Mrs","Mr","Other"])
g.set_ylabels("Survival Probability")
plt.show()

In [None]:
# Drop the Name column in place.
train_df.drop(columns=["Name"], inplace=True)

In [None]:
# Preview the first rows after dropping Name.
train_df.head()

In [None]:
# Replace the Title column with one indicator column per Title value.
train_df=pd.get_dummies(train_df,columns=["Title"])
# Preview the first rows of the result.
train_df.head()