In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree lazily and echo the full path of every file found.
input_paths = (
    os.path.join(folder, leaf)
    for folder, _subdirs, leaves in os.walk('/kaggle/input')
    for leaf in leaves
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Titanic train and test CSV files from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

## Descriptive Analysis

In [None]:
train.head()

In [None]:
train.describe()

Observation:
1. Fare is skewed: its mean and its median (the 50% value) are far apart.

In [None]:
train.info()

Observation:
1. Age, Cabin, and Embarked contain null values.

## EDA

In [None]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Apply the seaborn style first, then layer the notebook-wide matplotlib
# overrides on top of it in a single update.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',  # RGBA hex with zero alpha
})

In [None]:
px.histogram(train, x='Survived')

1. 38% of the total people onboard survived.

In [None]:
sns.histplot(train, x = 'Pclass', hue='Survived')

1. A large number of people traveled in 3rd class.
2. The survival rate of 1st class passengers is higher than that of 3rd class passengers.

In [None]:
px.histogram(train, x = 'Age', color = 'Survived')

1. A large number of the people who travelled were between 20 and 35 years old.
2. Children and people aged 20-40 account for more of the survivors.

In [None]:
px.histogram(train, x = 'Sex', color = 'Survived')

1. Fewer males survived than females.

In [None]:
# Survivors per SibSp value, as a fraction of the total passenger count
# (the denominator sums the per-group counts over every SibSp group).
(
    train.loc[train['Survived'] == 1]
    .groupby('SibSp')['Name']
    .count()
    / train.groupby('SibSp')['Name'].count().sum()
)

In [None]:
# Non-survivors per SibSp value, as a fraction of the total passenger count
# (the denominator sums the per-group counts over every SibSp group).
(
    train.loc[train['Survived'] == 0]
    .groupby('SibSp')['Name']
    .count()
    / train.groupby('SibSp')['Name'].count().sum()
)


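1. 68% of people have no siblings or spouse aboard.
2. 23% of all passengers had no siblings or spouse aboard and survived (the share is of the whole training set, not of the SibSp = 0 group).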

In [None]:
px.histogram(train, x = 'SibSp', color = 'Survived')

1. The lower the SibSp value, the higher the chance of survival.

In [None]:
# Survivors per Parch value, as a fraction of the total passenger count.
# Numerator and denominator both group by Parch so they are built from the
# same column and cover the same rows.
(
    train[train['Survived'] == 1].groupby('Parch')['Name'].count()
    / train.groupby('Parch')['Name'].count().sum()
)

In [None]:
# Non-survivors per Parch value, as a fraction of the total passenger count.
# Numerator and denominator both group by Parch so they are built from the
# same column and cover the same rows.
(
    train[train['Survived'] == 0].groupby('Parch')['Name'].count()
    / train.groupby('Parch')['Name'].count().sum()
)

1. 75% of people have no parents or children aboard, and 26% of all passengers had none and survived.

In [None]:
px.histogram(train, x = 'Parch', color = 'Survived')

1. The lower the value of Parch, the higher the chance of survival.

In [None]:
px.histogram(train, x = 'Fare',color = 'Survived')

1. The Fare column has an approximately exponential distribution.

In [None]:
px.histogram(train, x = 'Embarked',color = 'Survived')

A large number of people embarked at port S.

#### Summary of the observations
1. 38% of the total people onboard survived.
2. The survival rate of 1st class passengers is higher than that of 3rd class passengers.
3. 68% of people have no siblings or spouse aboard.
4. 23% of all passengers had no siblings or spouse aboard and survived.
5. 75% of people have no parents or children aboard, and 26% of all passengers had none and survived.
6. Fare is heavily skewed, and a few people paid as much as $512.

## Addressing Missing Value 

In [None]:
train.isna().sum()

In [None]:
train.describe(include=['O'])

1. Cabins are shared between passengers. This suggests that either a family shares one cabin, or travellers on cheap tickets share a cabin with other travellers.

#### Assumptions
1. We can drop the Ticket, Cabin, and PassengerId columns as they don't directly contribute to the survival rate; the Cabin column also contains a lot of null values. (The code below also drops the Name column.)
2. Let's fill the Age column with its median and the Embarked column with its mode.

In [None]:
def missing_value(df):
    """Drop unused columns and fill missing ``Age`` / ``Embarked`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``Ticket``, ``Cabin``, ``PassengerId``,
        ``Name``, ``Age`` and ``Embarked``. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the four dropped columns, in which null ``Age``
        values are replaced by the column median and null ``Embarked`` values
        by the column's first mode.
    """
    df = df.drop(['Ticket', 'Cabin', 'PassengerId', 'Name'], axis=1)
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[col]: the chained in-place form acts on an intermediate object, is
    # deprecated, and leaves df unchanged when pandas Copy-on-Write is active.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    return df

train = missing_value(train)
# Preview only the first rows rather than echoing the whole frame.
train.head()

Since a large number of people are in the 20-35 age group, we choose to fill the null values of the Age column with the median.

In [None]:
train.isna().sum()

#### Integer Encoding of Categorical Columns

In [None]:
def categorical(df):
    """Encode the ``Sex`` and ``Embarked`` columns as integer codes.

    ``Sex`` is mapped with ``{'female': 1, 'male': 0}`` and cast to ``int``;
    ``Embarked`` is mapped with ``{'S': 0, 'C': 1, 'Q': 2}`` and is not cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string-valued ``Sex`` and ``Embarked`` columns. The frame
        passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the two columns replaced by their codes.
    """
    # Work on a copy: writing into the caller's frame would make a second call
    # on the same frame map already-encoded values to NaN and fail in astype.
    df = df.copy()
    df['Sex'] = df['Sex'].map({'female': 1, 'male': 0}).astype(int)
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    return df

train =  categorical(train)

## Model building

In [None]:
# Separate the target column from the predictor columns.
y_train = train['Survived']
X_train = train.drop(columns=['Survived'])

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a liblinear logistic regression and show its score on the same data
# it was fitted on.
model1 = LogisticRegression(solver='liblinear', random_state=0).fit(X_train, y_train)
model1.score(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest. random_state is pinned so that a fresh
# Restart & Run All rebuilds the same forest and yields the same predictions.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
random_forest.fit(X_train, y_train)
# Score on the training data itself, as a percentage rounded to 2 decimals;
# computed once and kept in a variable.
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
acc_random_forest

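Thus the Random Forest Classifier performs best on the training data. Note that both scores above are computed on the same data the models were fitted on, so they measure fit rather than generalization.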

#### Test set Prediction

In [None]:
test.isna().sum()

In [None]:
# Run the test set through the same cleaning and encoding functions used for
# the training set, then fill any missing Fare with the test-set median.
test = categorical(missing_value(test))
test = test.assign(Fare=test['Fare'].fillna(test['Fare'].median()))
test.isna().sum()

In [None]:
# Predict with the fitted random forest on the preprocessed test frame.
Y_pred = random_forest.predict(test)

In [None]:
# Re-read the raw test file to recover PassengerId (that column is dropped from
# `test` by missing_value), pair each id with its prediction, and write the
# result to submission.csv without the index.
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')
submission = pd.DataFrame({"PassengerId": test_df["PassengerId"]})
submission["Survived"] = Y_pred
submission.to_csv('submission.csv', index=False)