In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Step 1 : Variable Identification

In [None]:
# Load the Titanic CSV into the working DataFrame used by every later cell.
# NOTE(review): the notebook text lists a 'Survived' column; confirm that this
# test.csv actually contains it.
eda = pd.read_csv('../input/titanic-dataset/test.csv')

In [None]:
# Preview the first rows to see the available columns and sample values.
eda.head()

<b>Dependent Variable</b> : Age

<b>Independent Variables</b> : 
1. PassengerId
2. Survived
3. Pclass
4. Name
5. Sex
6. SibSp
7. Parch
8. Ticket
9. Fare
10. Cabin
11. Embarked

<b>Type of Variable:</b>

A. <b>Categorical</b>- Sex, Embarked

B. <b>Numerical</b> - PassengerId, Survived, SibSp, Parch, Fare, Pclass

<b>Data Types</b>:

A. <b>Strings</b>- Name, Sex, Ticket, Cabin, Embarked

B. <b>Numerical</b>- PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare

<b>Variable Category</b>:

A. <b>Categorical</b>- Sex, Embarked

B. <b>Continuous</b>- PassengerId, Age, SibSp, Parch, Fare

C. <b>Discrete</b>- Survived, Pclass

In [None]:
# Column dtypes and non-null counts; shows which columns have missing values.
eda.info()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
eda.describe(include='all')

Here, for Age, the mean is greater than the median. Hence the distribution is right-skewed.

# Step 2 : Univariate Analysis

## Categorical Variables

In [None]:
# Bar chart of the number of passengers per sex.
sns.countplot(data=eda, x='Sex')

<b>Observation - There are more males than females</b>

In [None]:
# Bar chart of the number of passengers per port of embarkation.
sns.countplot(data=eda, x='Embarked')

<b>Observation - Most people embarked from the port of Southampton, followed by Cherbourg and Queenstown</b>

## Continuous Variables

In [None]:
# Box plot of PassengerId to check for outliers.
sns.boxplot(data=eda, y='PassengerId')

<b>No Outliers detected in PassengerId</b>

In [None]:
# Box plot of Age to check for outliers.
sns.boxplot(data=eda, y='Age')

<b>There are some outliers in Age, hence we plot the distplot to view the outliers in data</b>

In [None]:
# Distribution of Age: density-scaled histogram (10 bins) with a KDE overlay.
# `sns.distplot` is deprecated since seaborn 0.11 and scheduled for removal;
# `histplot(..., kde=True, stat='density')` is its documented replacement.
sns.histplot(eda['Age'], bins=10, kde=True, stat='density')

In [None]:
# Box plot of SibSp to check for outliers.
sns.boxplot(data=eda, y='SibSp')

<b>There are outliers in SibSp. Hence we plot distplot to view the outliers</b>

In [None]:
# Distribution of SibSp: density-scaled histogram with a KDE overlay.
# `sns.distplot` is deprecated since seaborn 0.11 and scheduled for removal;
# `histplot(..., kde=True, stat='density')` is its documented replacement.
sns.histplot(eda['SibSp'], kde=True, stat='density')

In [None]:
# Box plot of Fare to check for outliers.
sns.boxplot(data=eda, y='Fare')

<b>Outliers have been detected in Fare, hence a distplot is used to view them</b>

In [None]:
# Distribution of Fare: density-scaled histogram (5 bins) with a KDE overlay.
# `sns.distplot` is deprecated since seaborn 0.11 and scheduled for removal;
# `histplot(..., kde=True, stat='density')` is its documented replacement.
sns.histplot(eda['Fare'], bins=5, kde=True, stat='density')

# Step 3 : Bivariate Analysis

## Continuous and Continuous Variables

Continuous- PassengerId, Age, SibSp, Parch, Fare

In [None]:
# Scatter plot of Age against PassengerId on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(15, 6))
ax.scatter(eda['PassengerId'], eda['Age'])
ax.set_title('PassengerId VS Age')
ax.set_xlabel('PassengerId')
ax.set_ylabel('Age')

In [None]:
# Scatter plot of SibSp against PassengerId on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(15, 6))
ax.scatter(eda['PassengerId'], eda['SibSp'])
ax.set_title('PassengerId VS SibSp')
ax.set_xlabel('PassengerId')
ax.set_ylabel('SibSp')

In [None]:
# Scatter plot of Fare against PassengerId on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(eda['PassengerId'], eda['Fare'])
ax.set_title('PassengerId VS Fare')
ax.set_xlabel('PassengerId')
ax.set_ylabel('Fare')

In [None]:
# Scatter plot of SibSp against Age on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(eda['Age'], eda['SibSp'])
ax.set_title('Age VS SibSp')
ax.set_xlabel('Age')
ax.set_ylabel('SibSp')

In [None]:
# Scatter plot of Fare against Age on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(eda['Age'], eda['Fare'])
ax.set_title('Age VS Fare')
ax.set_xlabel('Age')
ax.set_ylabel('Fare')

In [None]:
# Annotated correlation heatmap of the numeric columns.
# Recent pandas raises in DataFrame.corr() when string columns are present,
# so select the numeric columns explicitly before correlating.
numeric_corr = eda.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, linewidth=0.5)

In [None]:
# Pairwise plot grid over every column except the last three.
# NOTE(review): the positional slice assumes the last three columns are the
# ones to leave out - confirm against the CSV's column order.
eda_cont = eda.iloc[:, :-3]
sns.pairplot(eda_cont)

## Categorical and Categorical Variables

Categorical- Sex, Embarked

In [None]:
# Number of passengers for every (Sex, Embarked) combination.
# The `axis` argument of DataFrame.groupby is deprecated; grouping rows is
# already the default, so it is simply omitted.
A = eda.groupby(['Sex', 'Embarked'])
A.size()

In [None]:
# Stacked bar chart of the Sex x Embarked contingency table.
pd.crosstab(index=eda['Sex'], columns=eda['Embarked']).plot.bar(stacked=True)

# Step 4 : Missing Values Treatment

In [None]:
# Re-check non-null counts per column before treating missing values.
eda.info()

In [None]:
# Summary statistics with describe()'s default column selection.
eda.describe()

In [None]:
# Count of missing values in each column.
eda.isna().sum()

In [None]:
# Summary statistics of Age before the missing values are filled.
eda['Age'].describe()

In [None]:
# Five-bin histogram of Age.
eda.hist(column='Age', bins=5)

In [None]:
# Replace missing Age values with the column median.
age_median = eda['Age'].median()
eda['Age'] = eda['Age'].fillna(age_median)

In [None]:
# Fill missing Cabin values with the most frequent cabin.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is deprecated chained assignment and leaves `eda`
# unchanged under pandas copy-on-write.
mode = eda['Cabin'].mode().values[0]
eda['Cabin'] = eda['Cabin'].fillna(value=mode)

In [None]:
# Fill missing Embarked values with the most frequent port.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is deprecated chained assignment and leaves `eda`
# unchanged under pandas copy-on-write.
mode1 = eda['Embarked'].mode().values[0]
eda['Embarked'] = eda['Embarked'].fillna(value=mode1)

In [None]:
# Display the frame after the missing-value fills.
eda

Here, there are missing values in the columns <b>Age, Cabin and Embarked</b>.
The missing values in <b>Age</b> are filled with the median value.
The missing values in <b>Cabin</b> and <b>Embarked</b> are filled with the mode values.

In [None]:
# Non-null counts per column, to check the result of the fills.
eda.info()

In [None]:
# Display the frame again.
eda

# Step 5 : Outliers

In [None]:
# Box plot of Age using pandas' built-in plotting.
eda.boxplot(column='Age')

In [None]:
# Box plot of Fare using pandas' built-in plotting.
eda.boxplot(column='Fare')

In [None]:
# Scatter plot of Fare (y) against Age (x).
plt.scatter(x=eda['Age'], y=eda['Fare'])

In [None]:
# Summary statistics of Age, including the quartiles used for the IQR below.
eda['Age'].describe()

In [None]:
# Interquartile range of Age: third quartile minus first quartile.
age_q3 = eda['Age'].quantile(0.75)
age_q1 = eda['Age'].quantile(0.25)
IQR = age_q3 - age_q1
print(IQR)

In [None]:
# Upper fence for Age: third quartile plus 1.5 times the IQR.
age_third_quartile = eda['Age'].quantile(0.75)
Upper_Outlier_Limit = age_third_quartile + 1.5*IQR
Upper_Outlier_Limit

In [None]:
# Lower fence for Age: first quartile minus 1.5 times the IQR.
age_first_quartile = eda['Age'].quantile(0.25)
Lower_Outlier_Limit = age_first_quartile - 1.5*IQR
Lower_Outlier_Limit

In [None]:
# Rows whose Age lies on or beyond either fence.
age_too_high = eda['Age'] >= Upper_Outlier_Limit
age_too_low = eda['Age'] <= Lower_Outlier_Limit
Outlier_Values = eda[age_too_high | age_too_low]

In [None]:
# Display the rows flagged as Age outliers.
Outlier_Values

In [None]:
# Upper fence for Fare: third quartile plus 1.5 times the Fare IQR.
# The spread must come from Fare itself; the notebook-level `IQR` variable
# was computed from Age and would give a wrong limit here.
Fare_Q1 = eda['Fare'].quantile(0.25)
Fare_Q3 = eda['Fare'].quantile(0.75)
Upper_Outlier_Limit1 = Fare_Q3 + 1.5*(Fare_Q3 - Fare_Q1)
Upper_Outlier_Limit1

In [None]:
# Lower fence for Fare: first quartile minus 1.5 times the Fare IQR.
# The spread must come from Fare itself; the notebook-level `IQR` variable
# was computed from Age and would give a wrong limit here.
Fare_Q1 = eda['Fare'].quantile(0.25)
Fare_Q3 = eda['Fare'].quantile(0.75)
Lower_Outlier_Limit1 = Fare_Q1 - 1.5*(Fare_Q3 - Fare_Q1)
Lower_Outlier_Limit1

In [None]:
# Rows whose Fare lies on or beyond either fence.
fare_too_high = eda['Fare'] >= Upper_Outlier_Limit1
fare_too_low = eda['Fare'] <= Lower_Outlier_Limit1
Outlier_Values1 = eda[fare_too_high | fare_too_low]

In [None]:
# Display the rows flagged as Fare outliers.
Outlier_Values1

In [None]:
# Remove the Cabin column from the working frame, in place.
eda.drop(columns='Cabin', inplace=True)

In [None]:
# Remove the Ticket column from the working frame, in place.
eda.drop(columns='Ticket', inplace=True)

In [None]:
# Drop the other unused columns in a single call, so the frame is never left
# partially trimmed if one of the names is missing.
# NOTE(review): once Sex and Embarked are gone, the later dummy-variable step
# appears to have no string columns left to encode - confirm this is intended.
eda.drop(
    columns=['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked'],
    inplace=True,
)

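In this case, we find outliers in <b>Fare</b> and <b>Age</b>. However, these outliers can be ignored since they do not add value to the data.
Also, we drop the columns <b>Cabin</b> and <b>Ticket</b> since they do not add value to the data. The columns <b>Pclass</b>, <b>Name</b>, <b>Sex</b>, <b>SibSp</b>, <b>Parch</b>, <b>Fare</b> and <b>Embarked</b> are dropped as well.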

# Step 6 : Feature Engineering-Variable and Dummy Variable Creation

In [None]:
# Boolean mask marking the object-dtype (string) columns.
# `np.object` was removed in NumPy 1.24; the builtin `object` is the same dtype.
obj = eda.dtypes == object
print(obj)

In [None]:
# Boolean mask marking the object-dtype (string) columns.
# `np.object` was removed in NumPy 1.24; the builtin `object` is the same dtype.
obj = eda.dtypes == object
print(obj)

In [None]:
# Names of the columns selected by the `obj` mask.
eda.columns[obj]

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
eda = pd.get_dummies(data=eda, drop_first=True)

In [None]:
# Display the frame after dummy encoding.
eda

# Building Classification Models

## Define X and Y

In [None]:
# Preview the frame before selecting the modelling columns.
eda.head()

In [None]:
# Columns kept for modelling: PassengerId first, Age last.
# (The earlier `cols = eda.columns` was overwritten immediately and had no effect.)
cols = ['PassengerId', 'Age']

In [None]:
# Keep only the columns listed in `cols`, in that order, and display the result.
eda = eda.loc[:, cols]
eda

In [None]:
# X holds every column except the last; Y holds the last column.
# Both are converted to NumPy arrays.
X = eda.iloc[:, :-1].to_numpy()
Y = eda.iloc[:, -1].to_numpy()

In [None]:
# Display the feature array.
X

In [None]:
# Display the target array.
Y

In [None]:
# Dimensions of the feature array.
X.shape

In [None]:
# Dimensions of the target array.
Y.shape

## Splitting the data in Training and Test Set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Dimensions of the training features.
X_train.shape

In [None]:
# Dimensions of the test features.
X_test.shape

In [None]:
# Dimensions of the training target.
Y_train.shape

In [None]:
# Dimensions of the test target.
Y_test.shape