# Survival Prediction

Import the required packages and load the Titanic training data.

In [None]:
import matplotlib.pyplot as graph
import numpy as np
import pandas as pd
import seaborn as sns

# Use matplotlib's dark background style for every figure drawn below.
graph.style.use('dark_background')

# Load the training data; the path is relative to the notebook's working directory.
dataset = pd.read_csv('../input/titanic/train.csv')

# Preview the first six rows.
dataset.head(6)

In [None]:
def getAxisConfig():
    """Create a new 10 x 10 inch figure and return its single subplot axis."""
    return graph.figure(figsize=(10, 10)).add_subplot(1, 1, 1)

# Feature Engineering

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
dataset.describe(include='all')

# Handling Missing Values

In [None]:
# Column dtypes and non-null counts, to spot the features with missing values.
dataset.info()

Handling Age Feature with Random Sample Imputation.

In [None]:
ax = getAxisConfig()

# Rows whose Age is missing, and the observed ages we can draw replacements from.
missingAgeMask = dataset['Age'].isna()
missingIndices = dataset.index[missingAgeMask]
observedAges = dataset['Age'].dropna()
missingCount = int(missingAgeMask.sum())

# Density of the observed ages before anything is filled in.
dataset['Age'].plot(kind='kde', ax = ax)

# Draw one observed age per missing row. The fixed seed makes the imputed
# values (and every score computed later) reproducible across reruns; sampling
# falls back to "with replacement" only if there are more gaps than observations.
randomSamples = observedAges.sample(
    missingCount,
    replace=missingCount > len(observedAges),
    random_state=42,
)
# Re-label the drawn values so they align with the rows being filled.
randomSamples.index = missingIndices
dataset.loc[missingIndices, 'Age'] = randomSamples

# Density after imputation, drawn on the same axis for comparison.
dataset['Age'].plot(kind='kde', ax = ax)
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, ['Before Imputation', 'After Imputation'])

In [None]:
# Percentage of missing values per column.
dataset.isna().mean() * 100

Handling the Embarked feature with mode imputation.


In [None]:
# Replace missing embarkation values with the most frequent one.
mostFrequentEmbarked = dataset['Embarked'].mode()[0]
dataset['Embarked'] = dataset['Embarked'].fillna(mostFrequentEmbarked)

In [None]:
# Percentage of missing values per column, re-checked after filling Embarked.
dataset.isna().mean() * 100

Handling the Cabin feature: more than 77 % of its values are missing, so the column is dropped.

In [None]:
# Remove the Cabin column from the working frame.
dataset = dataset.drop(columns=['Cabin'])

In [None]:
# Number of missing values left in each column.
dataset.isna().sum()

In [None]:
# Column dtypes and non-null counts after handling the missing values.
dataset.info()

# Checking Correlation Between Features

In [None]:
def checkCorrelativity(dataset, threshold):
    """Plot the correlation heatmap and list the strongly correlated features.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose numeric and boolean columns are correlated pairwise;
        columns of any other dtype are ignored.
    threshold : float
        Absolute correlation, in percent, that a cell must exceed to be reported.

    Returns
    -------
    list
        One ``[feature, correlation]`` entry for every off-diagonal cell of the
        correlation matrix whose absolute value (in percent) exceeds
        ``threshold``. ``feature`` is the row label of that cell.
    """
    # Text columns cannot be correlated, and pandas >= 2.0 raises on them
    # instead of silently skipping, so restrict the frame to numeric data first.
    # The matrix is computed once and reused for both the scan and the heatmap.
    correlation = dataset.select_dtypes(include=['number', 'bool']).corr()

    arr = []
    for index, row in correlation.iterrows():
        for column, value in row.items():
            # Skip a feature's correlation with itself by label rather than by
            # comparing a float against exactly 100.
            if column != index and abs(value * 100) > threshold:
                arr.append([index, value])

    sns.heatmap(correlation, ax=getAxisConfig(), annot = True)
    return arr

checkCorrelativity(dataset, 85)

# Handling Categorical Features

**Inspecting the categorical columns before label encoding.**

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None".
dataset.info()
print ('\n \n')

# For each text column show its distinct values, except for the
# free-text-like Name and Ticket columns where only the count is shown.
for col in dataset.columns:
    if (dataset[col].dtype == object):
        distinctValues = dataset[col].unique()
        if (col != 'Name' and col != 'Ticket'):
            print (col, ' : ', distinctValues)
        else:
            print (col, ' : ', len(distinctValues))

**Label Encoder.**

In [None]:
from sklearn.preprocessing import LabelEncoder

def labelEncoder (dataset, col):
    """Return the integer label codes for column ``col`` of ``dataset``.

    A fresh ``LabelEncoder`` is fitted on the column and immediately used to
    transform that same column.
    """
    return LabelEncoder().fit_transform(dataset[col])

In [None]:
# Replace each categorical column with its integer label codes.
for categoricalColumn in ('Sex', 'Embarked'):
    dataset[categoricalColumn] = labelEncoder(dataset, categoricalColumn)

# Show the first ten encoded rows.
dataset[['Sex', 'Embarked']].head(10)

In [None]:
# Summary statistics again, now that Sex and Embarked are numeric codes.
dataset.describe(include='all')

**Name and Ticket have too many distinct categories to encode usefully, so both features are dropped.**

In [None]:
# Remove the high-cardinality text columns from the working frame.
dataset = dataset.drop(columns=['Name', 'Ticket'])

In [None]:
# sns.pairplot(dataset, kind='scatter')

In [None]:
# Remove the identifier and the features that are not used by the models below.
unusedColumns = ['PassengerId', 'Pclass', 'SibSp', 'Parch', 'Fare']
dataset = dataset.drop(columns=unusedColumns)

# Train, Test Split And Model Creation.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Preview the columns that remain; the first one is used as the target below.
dataset.head()

In [None]:
# First column is the target, the remaining columns are the features.
# Hold out 20 % of the rows for testing (the split this notebook documents);
# the fixed seed keeps the split, and every score computed from it, identical
# across reruns.
x_train, x_test, y_train, y_test = train_test_split(
    dataset.iloc[:,1:],
    dataset.iloc[:,0],
    test_size=0.20,
    random_state=42,
)

x_train.shape, x_test.shape # 80 % 20 %

In [None]:
# Fit a logistic-regression classifier with default settings on the training split.
regression = LogisticRegression()
regression.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred = regression.predict(x_test)

# Model Evaluation on the Test Split

In [None]:
def test(dataset, y_test, y_pred, start=70, stop=100):
    """Scatter-plot predicted against actual labels for a window of test samples.

    Parameters
    ----------
    dataset : any
        Unused; kept so existing calls keep working.
    y_test : array-like
        Actual labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.
    start, stop : int, optional
        Positional window ``[start:stop]`` of samples to draw. The defaults
        reproduce the original fixed window of samples 70 to 99.
    """
    axis = getAxisConfig()

    # Convert to plain arrays so the window is always positional, whatever
    # index the inputs carry.
    predicted = np.asarray(y_pred)[start:stop]
    actual = np.asarray(y_test)[start:stop]

    # The x positions follow the actual window length, so a test split with
    # fewer than ``stop`` samples no longer causes a size mismatch.
    axis.scatter(range(len(predicted)), predicted, color='blue', label='Predicted')
    axis.scatter(range(len(actual)), actual, color='green', label='Actual')
    axis.legend()

In [None]:
# Compare the logistic-regression predictions with the actual labels.
test(dataset, y_test, y_pred)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Text report of the classification metrics for the predictions in y_pred.
score = classification_report(y_test, y_pred)
print (score)

# Confusion matrix: the diagonal holds the correct predictions,
# the off-diagonal cells hold the mistakes.
confusion = confusion_matrix(y_test, y_pred)
correctCount = confusion[0, 0] + confusion[1, 1]
errorCount = confusion[0, 1] + confusion[1, 0]

print ('\n Total : ', x_test.shape[0], '\n Truth : ', correctCount, '\n Error : ', errorCount)

In [None]:
# Summary statistics of the columns that were used for modelling.
dataset.describe(include='all')

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings.
support_vector_machine = SVC()

# Fit on the same training split used for the logistic regression.
support_vector_machine.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out test split with the support-vector classifier.
y_test_pred = support_vector_machine.predict(x_test)

In [None]:
# Classification metrics for the support-vector predictions (overwrites `score`).
score = classification_report(y_test, y_test_pred)

print (score)

In [None]:
# Compare the support-vector predictions with the actual labels.
test(dataset, y_test, y_test_pred)

In [None]:
from sklearn import tree

# Seed the tree: features are randomly permuted at each split, so without a
# fixed random_state the fitted tree (and its scores) can differ between runs.
decisionTree = tree.DecisionTreeClassifier(random_state=42)

# Fit on the same training split used for the other models.
decisionTree.fit(x_train, y_train)

In [None]:
# Predict with the decision tree. Note: this reuses the name `y_pred`,
# replacing the logistic-regression predictions stored there earlier.
y_pred = decisionTree.predict(x_test)

In [None]:
# Classification metrics for the decision-tree predictions (overwrites `score`).
score = classification_report(y_test, y_pred)

# Compare the decision-tree predictions with the actual labels.
test(dataset, y_test, y_pred)

print (score)