# Titanic Dataset

## About the columns
1. pclass: A proxy for socio-economic status (SES)
- 1st = Upper
- 2nd = Middle
- 3rd = Lower
2. age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5
3. sibsp: # of siblings / spouses aboard the Titanic. The dataset defines family relations in this way:
-  Sibling = brother, sister, stepbrother, stepsister
-  Spouse = husband, wife (mistresses and fiancés were ignored)
4. parch: # of parents / children aboard the Titanic. The dataset defines family relations in this way:
-  Parent = mother, father
-  Child = daughter, son, stepdaughter, stepson
- Some children travelled only with a nanny, therefore parch=0 for them.
5. sex	Sex	- male/female	
6. ticket	Ticket number	
7. fare	Passenger fare	
8. cabin	Cabin number	
9. embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton
10. Name
11. Passengerid
12. Survived

In [88]:
import pandas as pd
from sklearn import preprocessing
import numpy as np

# Load the training and test splits from CSV files in the working directory.
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Summarise every column of the training frame, numeric and non-numeric alike.
df.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [89]:
# Same summary for the test frame.
test.describe(include='all')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,418.0,418.0,418,418,332.0,418.0,418.0,418,417.0,91,418
unique,,,418,2,,,,363,,76,3
top,,,"Kelly, Mr. James",male,,,,PC 17608,,B57 B59 B63 B66,S
freq,,,1,266,,,,5,,3,270
mean,1100.5,2.26555,,,30.27259,0.447368,0.392344,,35.627188,,
std,120.810458,0.841838,,,14.181209,0.89676,0.981429,,55.907576,,
min,892.0,1.0,,,0.17,0.0,0.0,,0.0,,
25%,996.25,1.0,,,21.0,0.0,0.0,,7.8958,,
50%,1100.5,3.0,,,27.0,0.0,0.0,,14.4542,,
75%,1204.75,3.0,,,39.0,1.0,0.0,,31.5,,


#### Preprocessing 'Name'
We preprocess this column because it contains useful information: the honorific ('Mr', 'Mrs', 'Miss', etc.)

In [90]:
# Inspect the raw names before extracting anything from them.
df['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [91]:
# Working frame for the training set: gender plus the title pulled out of each
# name (the single word between ", " and the following ".").
newdata = pd.DataFrame({
    'gender': df['Sex'],
    'honorifics': df['Name'].str.extract(r',\s(\w+)\.', expand=False),
})
# Index labels of the rows where the pattern did not match any title.
abnormal_honorifics_index_train = newdata.index[newdata['honorifics'].isna()]
newdata['honorifics'].unique()


array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', nan, 'Jonkheer'],
      dtype=object)

In [92]:
# Working frame for the test set: gender plus the title pulled out of each
# name (the single word between ", " and the following ".").
new = pd.DataFrame({
    'gender': test['Sex'],
    'honorifics': test['Name'].str.extract(r',\s(\w+)\.', expand=False),
})
# Index labels of the rows where the pattern did not match any title.
abnormal_honorifics_index_test = new.index[new['honorifics'].isna()]
new['honorifics'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Ms', 'Col', 'Rev', 'Dr', 'Dona'],
      dtype=object)

In [93]:
# Show the training names for which no title could be extracted.
df.loc[abnormal_honorifics_index_train,'Name']

759    Rothes, the Countess. of (Lucy Noel Martha Dye...
Name: Name, dtype: object

In [94]:
# Show the test names for which no title could be extracted.
test.loc[abnormal_honorifics_index_test,'Name']

Series([], Name: Name, dtype: object)

We try to figure out the honorifics by ourselves for these abnormal cases

In [95]:
# Row 759 ("Rothes, the Countess. of ...") is the only training name whose title
# the pattern missed, so its title is assigned by hand.
newdata.at[759,'honorifics']='Countess'

note: The above categories can be converted into the following 5 groups
1. Mr- Don, Sir, Jonkheer
- based on gender- Rev, Dr
2. Mrs- Mme, Ms, Lady, Mlle, Countess
- based on gender- Rev, Dr
3. Miss-
4. Master- 
5. Military_Officer-   Major, Col, Capt

In [96]:
# Titles that are folded into plain 'Mr' / 'Mrs' according to the passenger's gender.
redundant_honorifics = ['Rev', 'Dr', 'Mr','Don','Sir','Jonkheer','Mrs','Mme','Ms','Lady','Mlle','Countess','Dona']

# Vectorised replacement of the former row-by-row loop.
is_redundant = newdata['honorifics'].isin(redundant_honorifics)
is_female = newdata['gender'] == 'female'
newdata.loc[is_redundant & is_female, 'honorifics'] = 'Mrs'
newdata.loc[is_redundant & ~is_female, 'honorifics'] = 'Mr'

# Military ranks share a single category.
is_military = newdata['honorifics'].isin(('Major', 'Col', 'Capt'))
newdata.loc[is_military, 'honorifics'] = 'Military_Officer'

# A title that could not be extracted is stored as NaN, never as the string
# 'nan', so it has to be detected with isna() for the report to ever fire.
for index in newdata.index[newdata['honorifics'].isna()]:
    print(f"Missing: {index}")


In [97]:
# Apply the same title folding to the test set (vectorised instead of a row loop).
# Titles in redundant_honorifics become 'Mr' / 'Mrs' according to gender.
is_redundant = new['honorifics'].isin(redundant_honorifics)
is_female = new['gender'] == 'female'
new.loc[is_redundant & is_female, 'honorifics'] = 'Mrs'
new.loc[is_redundant & ~is_female, 'honorifics'] = 'Mr'

# Military ranks share a single category.
is_military = new['honorifics'].isin(('Major', 'Col', 'Capt'))
new.loc[is_military, 'honorifics'] = 'Military_Officer'

# A title that could not be extracted is stored as NaN, never as the string
# 'nan', so it has to be detected with isna() for the report to ever fire.
for index in new.index[new['honorifics'].isna()]:
    print(f"Missing: {index}")

In [98]:
# Overwrite the full names in 'Name' with the simplified titles, then list the
# distinct titles that remain.
df['Name'] = newdata['honorifics']
df['Name'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Military_Officer'], dtype=object)

In [99]:
# Overwrite the full names in 'Name' with the simplified titles, then list the
# distinct titles that remain.
test['Name'] = new['honorifics']
test['Name'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Military_Officer'], dtype=object)

We have preprocessed the data for 'Name'

In [100]:
# Re-check the training summary now that 'Name' holds the simplified titles.
df.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,5,2,,,,681.0,,147,3
top,,,,Mr,male,,,,347082.0,,B96 B98,S
freq,,,,532,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


### Now we deal with age
We will try to fill missing values with the average of corresponding honorific title

In [101]:
# For every row, the mean age of all training passengers sharing that row's title
# (the 'Name' column holds the title at this point).
avg_by_title_train = df.groupby('Name')['Age'].transform('mean')
avg_by_title_train
# avg_by_title_train has the same length as the original DataFrame df.
# Each element of this series corresponds to the mean age within the 'Name' (title) group of the respective row in df.

0      32.697816
1      35.713043
2      21.773973
3      35.713043
4      32.697816
         ...    
886    32.697816
887    21.773973
888    21.773973
889    32.697816
890    32.697816
Name: Age, Length: 891, dtype: float64

In [102]:
# For every row, the mean age of all test passengers sharing that row's title
# (the 'Name' column holds the title at this point).
avg_by_title_test = test.groupby('Name')['Age'].transform('mean')
avg_by_title_test

0      32.150538
1      38.904762
2      32.150538
3      32.150538
4      38.904762
         ...    
413    32.150538
414    38.904762
415    32.150538
416    32.150538
417     7.406471
Name: Age, Length: 418, dtype: float64

In [103]:
# Replace the null values in 'Age' with the mean age of the row's title group.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['Age'] is chained assignment, which triggers a FutureWarning in pandas 2.x
# and no longer modifies df once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(avg_by_title_train)
df.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,891.0,891.0,891.0,891.0,891.0,204,889
unique,,,,5,2,,,,681.0,,147,3
top,,,,Mr,male,,,,347082.0,,B96 B98,S
freq,,,,532,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.784724,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,13.278781,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,21.773973,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,30.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,35.713043,1.0,0.0,,31.0,,


In [104]:
# Replace the null values in 'Age' with the mean age of the row's title group.
# The result is assigned back to the column: calling fillna(inplace=True) on
# test['Age'] is chained assignment, which triggers a FutureWarning in pandas 2.x
# and no longer modifies test once Copy-on-Write is enabled.
test['Age'] = test['Age'].fillna(avg_by_title_test)
test.describe(include='all')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,418.0,418.0,418,418,418.0,418.0,418.0,418,417.0,91,418
unique,,,5,2,,,,363,,76,3
top,,,Mr,male,,,,PC 17608,,B57 B59 B63 B66,S
freq,,,243,266,,,,5,,3,270
mean,1100.5,2.26555,,,30.252408,0.447368,0.392344,,35.627188,,
std,120.810458,0.841838,,,13.019928,0.89676,0.981429,,55.907576,,
min,892.0,1.0,,,0.17,0.0,0.0,,0.0,,
25%,996.25,1.0,,,22.0,0.0,0.0,,7.8958,,
50%,1100.5,3.0,,,30.0,0.0,0.0,,14.4542,,
75%,1204.75,3.0,,,37.0,1.0,0.0,,31.5,,


Age column solved

### Now we remove Cabin, PassengerId, Fare and Ticket and also deal with pclass

In [105]:
# Remove the columns that are not used as features.
df = df.drop(['Cabin','PassengerId', 'Fare', 'Ticket'],axis=1)

# Relabel Pclass as a category: 1 -> 'First', 2 -> 'Second', anything else -> 'Third'
# (the same mapping as the former if/elif/else loop).
# The column is replaced in one step; writing strings cell by cell into the
# integer column relied on implicit upcasting, which pandas has deprecated.
df['Pclass'] = df['Pclass'].map({1: 'First', 2: 'Second'}).fillna('Third')

In [106]:
# Training summary after dropping columns and relabelling Pclass.
df.describe(include='all')

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Embarked
count,891.0,891,891,891,891.0,891.0,891.0,889
unique,,3,5,2,,,,3
top,,Third,Mr,male,,,,S
freq,,491,532,577,,,,644
mean,0.383838,,,,29.784724,0.523008,0.381594,
std,0.486592,,,,13.278781,1.102743,0.806057,
min,0.0,,,,0.42,0.0,0.0,
25%,0.0,,,,21.773973,0.0,0.0,
50%,0.0,,,,30.0,0.0,0.0,
75%,1.0,,,,35.713043,1.0,0.0,


In [107]:
# Remove the columns that are not used as features (PassengerId is kept here).
test = test.drop(['Cabin', 'Fare', 'Ticket'],axis=1)

# Relabel Pclass as a category: 1 -> 'First', 2 -> 'Second', anything else -> 'Third'
# (the same mapping as the former if/elif/else loop).
# The column is replaced in one step; writing strings cell by cell into the
# integer column relied on implicit upcasting, which pandas has deprecated.
test['Pclass'] = test['Pclass'].map({1: 'First', 2: 'Second'}).fillna('Third')

In [108]:
# Test summary after dropping columns and relabelling Pclass.
test.describe(include='all')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Embarked
count,418.0,418,418,418,418.0,418.0,418.0,418
unique,,3,5,2,,,,3
top,,Third,Mr,male,,,,S
freq,,218,243,266,,,,270
mean,1100.5,,,,30.252408,0.447368,0.392344,
std,120.810458,,,,13.019928,0.89676,0.981429,
min,892.0,,,,0.17,0.0,0.0,
25%,996.25,,,,22.0,0.0,0.0,
50%,1100.5,,,,30.0,0.0,0.0,
75%,1204.75,,,,37.0,1.0,0.0,


### Now we normalize the data

In [109]:
# Standardise the numeric feature columns.
columns_to_normalize=['Age', 'SibSp', 'Parch']
scaler= preprocessing.StandardScaler()
# Learn the mean and standard deviation from the training data only ...
df[columns_to_normalize]= scaler.fit_transform(df[columns_to_normalize])
# ... and reuse those statistics for the test data. Re-fitting the scaler on the
# test set would put the two sets on different scales, so the same raw value
# would reach the model as different inputs.
test[columns_to_normalize]= scaler.transform(test[columns_to_normalize])

### Now we need to balance the data: while the model is learning, it must see an equal number of records for those who survived and those who did not.

In [110]:
# Class sizes: survivors (1) versus non-survivors (0).
count_of_1s = (df['Survived'] == 1).sum()
count_of_0s = (df['Survived'] == 0).sum()
print(count_of_1s, count_of_0s)

342 549


In [111]:
# we need to delete extra records of the once who did not survive
count_of_0s = 0
indices_to_remove = []
Survived = df['Survived']
for i in range(df.shape[0]):
    if Survived[i] == 0:
        count_of_0s +=1
        if count_of_0s > count_of_1s:
            indices_to_remove.append(i)

df = df.drop(indices_to_remove)

### We save 'test_passenger_id' separately along with inputs and targets

In [112]:
# Keep the passenger ids aside; the remaining columns are the test inputs.
test_PassengerId = test['PassengerId']
test_inputs = test.drop(columns=['PassengerId'])

### One hot encoding for the columns containing categories
the columns:- Pclass, Name, Sex, Embarked

In [113]:
# One-hot encode the categorical columns.
# dtype=float keeps every column numeric: recent pandas versions return boolean
# dummies by default, and a frame mixing float and bool columns turns into an
# object array when it is converted to NumPy for saving.
categorical_columns = ['Pclass','Name','Sex', 'Embarked']
df = pd.get_dummies(df, columns=categorical_columns, dtype=float)
test_inputs = pd.get_dummies(test_inputs, columns=categorical_columns, dtype=float)

# Give the test inputs exactly the training feature columns, in the same order.
# The arrays are later saved without column names, so a category present in only
# one of the two sets would otherwise silently shift the features.
test_inputs = test_inputs.reindex(columns=df.columns.drop('Survived'), fill_value=0.0)

### Splitting the data
1. we will have an 80 - 20 split into training and validation
2. no split for test as we have test_data already
3. note that there are no targets in the test data, so we need to get the final accuracy from the kaggle website itself
4. We assume there is no hierarchy in the data gathering; we still shuffle the data (with a fixed seed) before splitting so that both splits have similar class priors

In [114]:
# on the mission to get equal priors we shuffle the data
df= df.sample(frac=1, random_state =40)

count_sample = df.shape[0]
train_count = int(0.8*count_sample)

train_targets = df['Survived'][:train_count]
validation_targets = df['Survived'][train_count:]

train_inputs = df.drop('Survived', axis=1)[:train_count]
validation_inputs = df.drop('Survived', axis=1)[train_count:]

print(np.sum(train_targets),train_count,np.sum(train_targets)/train_count)
print(np.sum(validation_targets),count_sample - train_count,np.sum(validation_targets)/(count_sample - train_count))

274 547 0.5009140767824497
68 137 0.49635036496350365


### Save all the data in tensor friendly manner 

In [115]:
# Save each split as an .npz archive of plain NumPy arrays.
# The inputs are converted to float explicitly: if the frames mix float and
# boolean columns, NumPy would otherwise store them as pickled object arrays,
# which np.load refuses to read unless allow_pickle=True is passed.
np.savez('titanic_train_data',
         inputs=train_inputs.to_numpy(dtype=float),
         targets=train_targets.to_numpy())
np.savez('titanic_validation_data',
         inputs=validation_inputs.to_numpy(dtype=float),
         targets=validation_targets.to_numpy())
np.savez('titanic_test_data', inputs=test_inputs.to_numpy(dtype=float))
np.savez('titanic_test_PassengerId', inputs=test_PassengerId.to_numpy())

In [116]:
### now we move towards model making