In [1]:
import pandas as pd
from pandas import DataFrame
import matplotlib
import os
import numpy as np

# Phase 1 - Data importing

In [5]:
# "df" acts as a back up Data Frame containing original data, as is - it's not used 
# directly in the analysis
# NOTE(review): absolute, machine-specific location - the cell only runs where
# this exact file exists.
titanic_csv_path = '/Users/anujgupta/Documents/pythonds/3250_Foundation of DS/Assignments/Assignment 2 - Titanic/TitanicDataset.csv'
df = pd.read_csv(titanic_csv_path)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [197]:
# "df2" is the working copy of the data frame used for analysis, and contains
# only the attributes & data being used 
# NOTE(review): this path has no '3250_Foundation of DS' folder, unlike the path
# used to load "df" - confirm both point at the same CSV.
working_csv_path = '/Users/anujgupta/Documents/pythonds/Assignments/Assignment 2 - Titanic/TitanicDataset.csv'

# Load only the four analysis columns; the string '--' is read as missing and
# Pclass is forced to an integer dtype.
df2 = pd.read_csv(
    working_csv_path,
    usecols=["Survived", "Pclass", "Sex", "Age"],
    na_values='--',
    dtype={"Pclass": int},
)

df2.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age
0,0,3,male,22.0
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
4,0,3,male,35.0
5,0,3,male,
6,0,1,male,54.0
7,0,3,male,2.0
8,1,3,female,27.0
9,1,2,female,14.0


In [200]:
# Per-column count of missing values in the working frame
df2.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
dtype: int64

# Phase 2 - Data cleansing & engineering

In [201]:
# Replace missing values in every numeric column with that column's mean
# (per the null count above, only Age actually has gaps).
# numeric_only=True keeps the text column "Sex" out of the mean computation:
# older pandas skipped such columns silently, but pandas 2.x raises TypeError
# when asked to average an object column.
df2 = df2.fillna(df2.mean(numeric_only=True))
df2.head(20)

Unnamed: 0,Survived,Pclass,Sex,Age
0,0,3,male,22.0
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
4,0,3,male,35.0
5,0,3,male,29.699118
6,0,1,male,54.0
7,0,3,male,2.0
8,1,3,female,27.0
9,1,2,female,14.0


In [202]:
# New "Passenger" column that will hold each passenger's age group; every row
# starts with a placeholder label until the groups are assigned.
placeholder_label = "default"
df2["Passenger"] = placeholder_label
df2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Passenger
0,0,3,male,22.0,default
1,1,1,female,38.0,default
2,1,3,female,26.0,default
3,1,1,female,35.0,default
4,0,3,male,35.0,default


In [203]:
# Store ages as whole numbers for simplicity. astype(int) drops the fractional
# part (the imputed mean 29.699118 shows up as 29 afterwards).
whole_ages = df2['Age'].astype(int)
df2['Age'] = whole_ages
df2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Passenger
0,0,3,male,22,default
1,1,1,female,38,default
2,1,3,female,26,default
3,1,1,female,35,default
4,0,3,male,35,default


In [206]:
# Label each passenger with an age group so survival can later be compared
# across groups (was one age group favoured during the rescue?).
#   Child: under 16 | Youth: 16-24 | Adult: 25-64 | Senior: 65 and over
passenger_age = df2['Age']

df2.loc[passenger_age.lt(16), 'Passenger'] = "Child"
df2.loc[passenger_age.ge(16) & passenger_age.lt(25), 'Passenger'] = "Youth"
df2.loc[passenger_age.ge(25) & passenger_age.lt(65), 'Passenger'] = "Adult"
df2.loc[passenger_age.ge(65), 'Passenger'] = "Senior"
df2.head(7)

Unnamed: 0,Survived,Pclass,Sex,Age,Passenger
0,0,3,male,22,Youth
1,1,1,female,38,Adult
2,1,3,female,26,Adult
3,1,1,female,35,Adult
4,0,3,male,35,Adult
5,0,3,male,29,Adult
6,0,1,male,54,Adult


In [211]:
# Pull the two family-count columns from the untouched "df" frame into the
# working frame (values are matched on the row index), then add them up to get
# the number of relatives travelling with each passenger.
# NOTE(review): in the usual Titanic schema SibSp also counts spouses and Parch
# also counts children - confirm the "Siblings"/"Parents" labels are intended.
df2["Siblings"], df2["Parents"] = df["SibSp"], df["Parch"]
df2["Companions"] = df2["Siblings"].add(df2["Parents"])
df2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Passenger,Siblings,Parents,Companions
0,0,3,male,22,Youth,1,0,1
1,1,1,female,38,Adult,1,0,1
2,1,3,female,26,Adult,0,0,0
3,1,1,female,35,Adult,1,0,1
4,0,3,male,35,Adult,0,0,0


In [212]:
# Flag each passenger as travelling alone or with family, so the effect of
# accompanying family on survival can be examined.
def family(comps):
    """Return "Alone" when the row's 'Companions' value is 0, else "With family".

    comps: a row-like object whose `.loc['Companions']` gives the number of
    family members on board for that passenger.
    """
    return "Alone" if comps.loc['Companions'] == 0 else "With family"
    
# Run family() once per row (axis=1) and keep its label in a new column
family_inputs = df2[["Age", "Companions"]]
df2['Traveling'] = family_inputs.apply(family, axis=1)
df2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Passenger,Siblings,Parents,Companions,Traveling
0,0,3,male,22,Youth,1,0,1,With family
1,1,1,female,38,Adult,1,0,1,With family
2,1,3,female,26,Adult,0,0,0,Alone
3,1,1,female,35,Adult,1,0,1,With family
4,0,3,male,35,Adult,0,0,0,Alone


# Phase 3 - Analysis

In [173]:
# Mean of the 0/1 Survived flag per passenger class, highest rate first
df2.groupby('Pclass')[['Survived']].mean().sort_values('Survived', ascending=False)

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


Conclusion 1: Above analysis shows that survival rate was directly related to the class
              that a passenger was traveling in (highest in Class 1, lowest in Class 3) -
              hinting towards a systematic rescue approach focusing on Class 1, followed
              by Class 2 and eventually Class 3

In [217]:
# Survival rate for every combination of class, sex and travelling status
survivors = df2.groupby(['Pclass', 'Sex', 'Traveling'])[['Survived']].mean()
survivors

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Survived
Pclass,Sex,Traveling,Unnamed: 3_level_1
1,female,Alone,0.970588
1,female,With family,0.966667
1,male,Alone,0.333333
1,male,With family,0.425532
2,female,Alone,0.90625
2,female,With family,0.931818
2,male,Alone,0.097222
2,male,With family,0.277778
3,female,Alone,0.616667
3,female,With family,0.416667


In [222]:
# Survival rate of passengers travelling alone vs. with family
survivors_fam = df2.groupby('Traveling')[['Survived']].mean()
survivors_fam

Unnamed: 0_level_0,Survived
Traveling,Unnamed: 1_level_1
Alone,0.303538
With family,0.50565


In [221]:
# Survival rate by sex
survivors_sex = df2.groupby('Sex')[['Survived']].mean()
survivors_sex

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [228]:
# Survival rate by sex, broken down further by age group
survivors_age = df2.groupby(["Sex", "Passenger"])[["Survived"]].mean()
survivors_age

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived
Sex,Passenger,Unnamed: 2_level_1
female,Adult,0.751269
female,Child,0.651163
female,Youth,0.77027
male,Adult,0.185185
male,Child,0.525
male,Senior,0.090909
male,Youth,0.099174


# Phase 4 - Results and answers 