In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd 
import matplotlib as plt

In [2]:
#file locations
#the data directory defaults to the original local path, but it can be overridden with the
#TITANIC_DATA_DIR environment variable so the notebook also runs on other machines
import os

DATA_DIR = os.environ.get("TITANIC_DATA_DIR", "D:/work data/titanic-machine learning for disaster").rstrip("/\\")
train_fil_loc = DATA_DIR + "/train.csv"
test_file_loc = DATA_DIR + "/test.csv"

In [3]:
#loading the train and the test csv files into dataframes
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_fil_loc, test_file_loc))
#keeping both frames in one list so the same transformation can be applied to each of them
full_data = [train_data, test_data]

In [4]:
#general info of the training data: column names, non-null counts and dtypes
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
#let's work with the categorical values first
#checking for patterns in titles
import re

def get_title(name):
    """Return the title found in a passenger name, or "" if there is none.

    The title is the first run of ASCII letters that follows a space and is
    immediately followed by a period, e.g. "Braund, Mr. Owen Harris" -> "Mr".

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str
        The matched title without the trailing period, or an empty string
        when the pattern does not occur in `name`.
    """
    # Raw string: '\.' inside a normal string literal is an invalid escape
    # sequence (DeprecationWarning, SyntaxWarning from Python 3.12 on).
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

#adding a 'Title' column (extracted from 'Name') to both the train and the test set
for dataset in full_data:
    dataset['Title'] = dataset['Name'].map(get_title)

#title counts per gender in the training data
title_by_sex = pd.crosstab(train_data['Title'], train_data['Sex'])
print(title_by_sex)

Sex       female  male
Title                 
Capt           0     1
Col            0     2
Countess       1     0
Don            0     1
Dr             1     6
Jonkheer       0     1
Lady           1     0
Major          0     2
Master         0    40
Miss         182     0
Mlle           2     0
Mme            1     0
Mr             0   517
Mrs          125     0
Ms             1     0
Rev            0     6
Sir            0     1


In [6]:
#the extracted titles are noisy: some occur only a handful of times and
#some are variants of common ones (Mlle/Ms -> Miss, Mme -> Mrs)
#every infrequent title is collapsed into a single 'Rare' group
title_map = {rare: 'Rare' for rare in ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                                       'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']}
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(title_map)

#survival rate per normalized title
print(train_data.groupby('Title', as_index=False)['Survived'].mean())

    Title  Survived
0  Master  0.575000
1    Miss  0.702703
2      Mr  0.156673
3     Mrs  0.793651
4    Rare  0.347826


In [7]:
#survival rate per passenger class
#the mean of 'Survived' within each group is what gets reported as the survival rate
print(train_data.groupby('Pclass', as_index=False)['Survived'].mean())

   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363


In [8]:
#survival rate per gender
print(train_data.groupby('Sex', as_index=False)['Survived'].mean())

      Sex  Survived
0  female  0.742038
1    male  0.188908


In [9]:
#looking at the age
#NaN is counted as one of the unique values here because dropna is switched off
print('Unique Ages are : {}'.format(train_data['Age'].nunique(dropna=False)))
print('Missing values of Age in training data: {}'.format(train_data['Age'].isnull().sum()))

Unique Ages are : 89
Missing values of Age in training data: 177


In [34]:
#replacing missing ages with the mean age of the training data
#using imputer
from sklearn.impute import SimpleImputer
#`verbose` is not passed any more: it was deprecated in scikit-learn 1.1 and removed in 1.3,
#where passing it raises a TypeError; leaving it out keeps the old default behaviour
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
#'Age' is selected by name instead of by position (iloc[:, 5:6]) so the cell keeps working
#if the column order changes; the double brackets keep the 2-D shape the imputer expects
imputer.fit(train_data[['Age']])
#transform gives back one column in 2-D form; np.ravel flattens it to fit a single dataframe column
train_data['imputer_age'] = np.ravel(imputer.transform(train_data[['Age']]))

In [35]:
#there are 89 unique values, so the ages are going to be grouped; checking their range first
#Series.min()/max() skip NaN, while the builtin min()/max() compare element by element
#and give a result that depends on where the NaN values sit in the column
print('min value of age: {}'.format(train_data['Age'].min()))
print('max value of age: {}'.format(train_data['Age'].max()))

min value of age: 0.42
max value of age: 80.0


In [36]:
#let's group the age into 5 categories of (roughly) equal width
#0-16
#16-32
#32-48
#48-64
#64-80

#confirming that the 'Age' column is a pandas Series
type(train_data['Age'])

pandas.core.series.Series

In [82]:
#binning the imputed age into 5 intervals
#pd.cut with an integer splits the value range into equal-width bins,
#so the number of passengers per bin is generally uneven
train_data['Cat_age'] = pd.cut(train_data['imputer_age'], bins=5)

In [83]:
#first five binned ages together with the category definition
print(train_data['Cat_age'].head(5))

0    (16.336, 32.252]
1    (32.252, 48.168]
2    (16.336, 32.252]
3    (32.252, 48.168]
4    (32.252, 48.168]
Name: Cat_age, dtype: category
Categories (5, interval[float64]): [(0.34, 16.336] < (16.336, 32.252] < (32.252, 48.168] < (48.168, 64.084] < (64.084, 80.0]]


In [84]:
#looking at the categorized age and the mean survival rate of each age bin
#all rows whose age was missing received the same imputed mean value, so they all land in one bin
print(train_data[['Cat_age','Survived']].groupby(['Cat_age'],as_index = False).mean())

            Cat_age  Survived
0    (0.34, 16.336]  0.550000
1  (16.336, 32.252]  0.344168
2  (32.252, 48.168]  0.404255
3  (48.168, 64.084]  0.434783
4    (64.084, 80.0]  0.090909


In [40]:
#let's take a look at the fare: number of distinct fare values (a missing value would count as one)
print('Unique prices of fare are: {}'.format(train_data['Fare'].nunique(dropna=False)))

Unique prices of fare are: 248


In [41]:
#let's categorize fare into intervals too; checking its range first
#Series.min()/max() are used instead of the builtin min()/max() because they skip NaN
#and do not depend on the order of the values
print('min fare: {}'.format(train_data['Fare'].min()))
print('max fare: {}'.format(train_data['Fare'].max()))

min fare: 0.0
max fare: 512.3292


In [42]:
#binning the fare into quartiles with pd.qcut
#(the bin edges are quantiles, so each bin holds roughly the same number of passengers)
train_data['Cat_fare'] = pd.qcut(train_data['Fare'], q=4)
#survival rate per fare quartile
print(train_data[['Cat_fare', 'Survived']].groupby('Cat_fare', as_index=False).mean())

          Cat_fare  Survived
0   (-0.001, 7.91]  0.197309
1   (7.91, 14.454]  0.303571
2   (14.454, 31.0]  0.454955
3  (31.0, 512.329]  0.581081


In [53]:
#checking the missing and the distinct values of Embarked
print('Missing values are: {}'.format(train_data['Embarked'].isna().sum()))
#unique() keeps NaN, so the missing value shows up as its own entry in the count and the list
#(the bare type(...) expression that stood here was removed: its result was never displayed or used)
embarked_values = train_data['Embarked'].unique()
print('Unique values are: {}, which are: {}'.format(len(embarked_values), embarked_values))

Missing values are: 2
Unique values are: 4, which are: ['S' 'C' 'Q' nan]


In [55]:
#checking the value counts of Embarked (value_counts leaves out missing values by default)
print(train_data['Embarked'].value_counts())

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [57]:
#most of the data is 'S', so the missing values are replaced with the most frequent value 'S'
#only the training frame is filled here
train_data['Embarked'] = train_data['Embarked'].fillna('S')
#printing the counts again to confirm that the filled rows were added to 'S'
print('Now values counts are: {}'.format(train_data['Embarked'].value_counts()))

Now values counts are: S    646
C    168
Q     77
Name: Embarked, dtype: int64


In [64]:
#survival rate per port of embarkation
print(train_data.groupby('Embarked', as_index=False)['Survived'].mean())

  Embarked  Survived
0        C  0.553571
1        Q  0.389610
2        S  0.339009


In [58]:
#listing the columns currently present in the training frame
#NOTE(review): the saved output lists a 'mean_Age' column that no visible cell creates -
#presumably left over from a deleted cell; restart the kernel and run all cells to confirm
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'mean_Age',
       'imputer_age', 'Cat_age', 'Cat_fare'],
      dtype='object')

In [66]:
#let's take a look at SibSp: distinct values and missing entries
print('Total unique values are: {}'.format(train_data['SibSp'].nunique(dropna=False)))
print('which are: {}'.format(train_data['SibSp'].unique()))
print('Missing values are: {}'.format(train_data['SibSp'].isnull().sum()))

Total unique values are: 7
which are: [1 0 3 4 2 5 8]
Missing values are: 0


In [68]:
#survival rate per SibSp value
print(train_data.groupby('SibSp', as_index=False)['Survived'].mean())

   SibSp  Survived
0      0  0.345395
1      1  0.535885
2      2  0.464286
3      3  0.250000
4      4  0.166667
5      5  0.000000
6      8  0.000000


In [69]:
#let's take a look at Parch: distinct values and missing entries
print('Total unique values are: {}'.format(train_data['Parch'].nunique(dropna=False)))
print('which are: {}'.format(train_data['Parch'].unique()))
print('Missing values are: {}'.format(train_data['Parch'].isnull().sum()))

Total unique values are: 7
which are: [0 1 2 5 3 4 6]
Missing values are: 0


In [70]:
#survival rate per Parch value
print(train_data.groupby('Parch', as_index=False)['Survived'].mean())

   Parch  Survived
0      0  0.343658
1      1  0.550847
2      2  0.500000
3      3  0.600000
4      4  0.000000
5      5  0.200000
6      6  0.000000


In [78]:
#let's take a look at Cabin: number of rows versus number of missing entries
print('Total size: {}'.format(train_data['Cabin'].size))
print('Missing values are: {}'.format(train_data['Cabin'].isnull().sum()))

Total size: 891
Missing values are: 687


In [79]:
#most of the Cabin values are missing, so the column is dropped
#errors='ignore' makes the cell safe to re-run: without it a second run raises KeyError
#because 'Cabin' is already gone
#note: drop returns a new frame, so full_data[0] keeps referring to the frame that still has 'Cabin'
train_data = train_data.drop(columns='Cabin', errors='ignore')

In [98]:
#let's take a look at Ticket: distinct values and missing entries
print('Total unique values are: {}'.format(train_data['Ticket'].nunique(dropna=False)))
print('Missing values are: {}'.format(train_data['Ticket'].isnull().sum()))

Total unique values are: 681
Missing values are: 0


In [101]:
#Ticket is not useful either, as it contains many different kinds of values
#errors='ignore' makes the cell safe to re-run: without it a second run raises KeyError
#because 'Ticket' is already gone
train_data = train_data.drop(columns='Ticket', errors='ignore')

In [102]:
#columns that remain after dropping Cabin and Ticket
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Embarked', 'Title', 'imputer_age', 'Cat_age',
       'Cat_fare'],
      dtype='object')

In [103]:
#PassengerId is of no use either
#the imputed age ('imputer_age') and its bins ('Cat_age') replace the raw Age, so Age can be dropped too
#Fare is not needed any more as it has been converted into the category 'Cat_fare'
#the Title extracted from Name is kept, so Name is not needed any more
rem = ['PassengerId','Age','Fare','Name']
#creating a new dataset
#NOTE(review): X still contains the 'Survived' column - confirm it is separated out before a model is fitted
X = train_data.drop(columns=rem)

In [104]:
#checking the dtypes and the non-null counts of the reduced dataset
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
SibSp          891 non-null int64
Parch          891 non-null int64
Embarked       891 non-null object
Title          891 non-null object
imputer_age    891 non-null float64
Cat_age        891 non-null category
Cat_fare       891 non-null category
dtypes: category(2), float64(1), int64(4), object(3)
memory usage: 58.0+ KB


In [109]:
#now we need to convert all the categorical data into numerical values through encoding
#first checking which type the binned values have
print(type(train_data['Cat_age'][0]))    #values of 'Cat_age' are pandas Interval objects
print(type(train_data['Cat_fare'][0]))   #values of 'Cat_fare' are pandas Interval objects

<class 'pandas._libs.interval.Interval'>
<class 'pandas._libs.interval.Interval'>


Now we need to encode these intervals as numeric values so that we can use them in our ML algorithms.