In [8]:
# import necessary libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder

In [9]:
# Load the Titanic passenger data from 'titanic.csv' (the path is relative to
# the notebook's working directory) and preview the first rows.
dataset = pd.read_csv('titanic.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# List the column names of the loaded frame.
dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [11]:
# Drop the columns that are not used further on, leaving Survived, Pclass,
# Sex, Age and Fare.
# errors='ignore' keeps this cell re-runnable: because `dataset` is
# overwritten here, a second run of a plain drop() would raise a KeyError for
# labels that were already removed on the first run.
dataset = dataset.drop(
    columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'],
    errors='ignore',
)
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [12]:
# Show the dtype of every column in the frame.
dataset.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
Fare        float64
dtype: object

In [13]:
# Count how many rows fall into each distinct value of the Sex column.
dataset['Sex'].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [7]:
# Sex is stored as strings (object dtype), so convert it to integer codes by
# applying scikit-learn's LabelEncoder.
# NOTE(review): this cell's execution count (7) is lower than the import
# cell's (8), and the null-age preview further down still shows
# 'male'/'female', so the later outputs appear to have been produced without
# this encoding applied. Restart the kernel and run all cells to confirm.
label_encoder = LabelEncoder()

# In the preview below, male is encoded as 1 and female as 0.
dataset['Sex'] = label_encoder.fit_transform(dataset['Sex'])
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,3,1,35.0,8.05


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# that describe() includes by default.
# NOTE(review): Sex is missing from the output below, which suggests it was
# still object dtype (not label-encoded) when this cell ran.
dataset.describe()

Unnamed: 0,Survived,Pclass,Age,Fare
count,891.0,891.0,714.0,891.0
mean,0.383838,2.308642,29.699118,32.204208
std,0.486592,0.836071,14.526497,49.693429
min,0.0,1.0,0.42,0.0
25%,0.0,2.0,20.125,7.9104
50%,0.0,3.0,28.0,14.4542
75%,1.0,3.0,38.0,31.0
max,1.0,3.0,80.0,512.3292


In [15]:
# Count the missing entries in each column.
dataset.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
dtype: int64

In [16]:
# Age is the only column with missing entries (177 of them).

# Start with the mean of the ages that are present.
mean = dataset['Age'].mean()
mean

29.69911764705882

In [17]:
# Next, the median of the Age column.
median = dataset['Age'].median()
median

28.0

In [18]:
# Most frequent value(s) of the Age column.
# As the output shows, this is a Series rather than a single number.
mode = dataset['Age'].mode()
mode

0    24.0
Name: Age, dtype: float64

In [19]:
# Frequency of each distinct Age value, most common first.
dataset['Age'].value_counts()

Age
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: count, Length: 88, dtype: int64

In [27]:
# Select every row whose Age entry is missing.
null_age = dataset.loc[dataset['Age'].isna()]
null_age

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
5,0,3,male,,8.4583
17,1,2,male,,13.0000
19,1,3,female,,7.2250
26,0,3,male,,7.2250
28,1,3,female,,7.8792
...,...,...,...,...,...
859,0,3,male,,7.2292
863,0,3,female,,69.5500
868,0,3,male,,9.5000
878,0,3,male,,7.8958
