### Project 2: Code for the Analysis of Titanic Data

Liang Sun

January 5, 2017

********

#### Data Wrangling

In [None]:
#Import packages and read file
#Read the data file 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%pylab inline  

from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency

# Location of the Titanic CSV file. Setting the TITANIC_DATA_PATH environment
# variable overrides the original author's local path, so the notebook can be
# run on another machine without editing this cell.
import os

filename = os.environ.get(
    'TITANIC_DATA_PATH',
    'C:/Users/Liang Sun/Documents/My NanoDegree/dandp2_project/titanic-data.csv',
)
titanic_df = pd.read_csv(filename)

In [None]:
# Create a copy of the original data where "Sex" is coded as 1=female and 0=male
def convert_sex(sex):
    """Translate a 'Sex' value into its numeric code.

    Returns 1 for 'female' and 0 for 'male'. Any other value is handed
    back unchanged.
    """
    if sex == 'female':
        return 1
    if sex == 'male':
        return 0
    return sex

# Build the working frame: a new DataFrame (the raw frame is left untouched)
# whose 'Sex' column holds the numeric codes produced by convert_sex.
titan_df = titanic_df.assign(
    Sex=lambda frame: frame['Sex'].apply(convert_sex)
)
titan_df.head()

In [None]:
# "Pclass","Embarked" are categorical, so we get their dummies for analysis
class_dummies = pd.get_dummies(titan_df['Pclass'],prefix='class')
embark_dummies = pd.get_dummies(titan_df['Embarked'],prefix='port')
# Leave out dummy columns that an earlier run of this cell already appended,
# so re-running the cell does not add duplicate columns to titan_df.
dummy_cols = set(class_dummies.columns) | set(embark_dummies.columns)
kept_cols = [col for col in titan_df.columns if col not in dummy_cols]
titan_df = pd.concat([titan_df[kept_cols],class_dummies,embark_dummies],axis=1)

****

The following code for the data analysis is arranged in the order of the questions stated in the project.

#### Question 1: Summary statistics

In [None]:
# Summary statistics for the wrangled frame (displayed as the cell output)
titan_df.describe()  #age and embarked have missing values

In [None]:
# The same summary statistics, computed separately for each value of 'Survived'
(
    titan_df
    .groupby('Survived', as_index=False)
    .describe()
)

#### Question 2: Relationship between gender and survival

In [None]:
#Subset used for the gender analysis: survival flag, coded sex and passenger name
gender_survival = titan_df[[
    'Survived',
    'Sex',
    'Name',
]]  # 891 observations

In [None]:
 # Row counts for every (Sex, Survived) combination
 gender_survival.groupby(
     ['Sex', 'Survived'],
     as_index=False,
 ).count()

In [None]:
#chi-squared test of "Sex" and "Survived"
# Build the contingency table (rows: Sex, columns: Survived) directly from the
# data instead of hand-copying the counts out of the groupby output.
obs = pd.crosstab(gender_survival['Sex'], gender_survival['Survived']).values

chi2_contingency(obs)   # these two variables are not independent

In [None]:
#Visualization: survival counts by gender

gender_surv1 = gender_survival[gender_survival['Survived']==0]
gender_surv2 = gender_survival[gender_survival['Survived']==1]
#Count passengers per gender within each survival group for the bar chart
no_surv = gender_surv1.groupby('Sex',as_index=False).count()
surv = gender_surv2.groupby('Sex',as_index=False).count()

n_group=2
objects=['Male','Female']  # groupby sorts the Sex codes ascending: 0=male, 1=female
index=np.arange(n_group)
bar_width = 0.35
plt.figure(figsize=(5,3))
# align='edge' anchors every bar at its left edge, so the tick placed at
# index+bar_width sits between the two bars of a group. Without it, matplotlib
# versions that centre bars by default put the label under the second bar.
plt.bar(index,no_surv['Survived'],bar_width,align='edge',label='Not survived')
plt.xticks(index+bar_width, objects)
plt.bar(index+bar_width,surv['Survived'],bar_width,align='edge',color='g',label='Survived')
plt.legend(fontsize=10)
plt.ylabel('Frequency',fontsize=10)
plt.suptitle('Survival by gender',fontsize=12)
plt.savefig('survival_by_gender.png');

In [None]:
# Or, from a different perspective we can also get gender by survival
#Visualization: gender composition by survival
male = gender_survival[gender_survival['Sex']==0]
female = gender_survival[gender_survival['Sex']==1]
#Count passengers per survival outcome within each gender for the bar chart
m = male.groupby('Survived',as_index=False).count()
f = female.groupby('Survived',as_index=False).count()

n_group=2
objects=['Not survived','Survived']  # groupby sorts Survived ascending: 0, 1
index=np.arange(n_group)
bar_width = 0.35
plt.figure(figsize=(5,3))
# align='edge' anchors every bar at its left edge, so the tick placed at
# index+bar_width sits between the two bars of a group regardless of the
# matplotlib default alignment.
plt.bar(index,m['Sex'],bar_width,align='edge',label='Male')
plt.xticks(index+bar_width, objects)
plt.bar(index+bar_width,f['Sex'],bar_width,align='edge',color='green',label='Female')
plt.legend(fontsize=10)
plt.ylabel('Frequency',fontsize=10)
plt.suptitle('Gender by survival',fontsize=12)
plt.savefig('gender.png');

#### Question 3: Relationship between age and survival

In [None]:
#Mean age for each value of 'Survived'
age_survival = titan_df[[
    'Survived',
    'Age',
]]  # 891 observations
#Rows with a missing value are dropped first (714 observations are kept)
age_surv = age_survival.dropna()
(
    age_surv
    .groupby('Survived', as_index=False)
    .mean()
)

In [None]:
#Perform a t-test comparing the mean age of non-survivors (surv1) and survivors (surv2)
# ttest_ind is imported once in the notebook's first cell, so it is not re-imported here.
surv1 = age_surv[age_surv['Survived']==0]
surv2 = age_surv[age_surv['Survived']==1]
ttest_ind(surv1['Age'], surv2['Age'])

In [None]:
# Visualization: violin plot of the age distribution for non-survivors and survivors
sns.violinplot(
    data=age_surv,
    x='Survived',
    y='Age',
)
# Replace the 0/1 tick labels with readable names
plt.xticks([0,1], ['Not survived','Survived'])
plt.suptitle('Violin plot of age by survival')
plt.savefig('violin_age');

In [None]:
# Split the passengers into children (age 16 and below) and adults (above 16)
age_surv_kid = age_surv.loc[age_surv['Age'] <= 16]
age_surv_adult = age_surv.loc[age_surv['Age'] > 16]

In [None]:
#Create a dataset with an indicator of children or adult
def convert_kid(age):
    """Return 1 when age is 16 or below (child), otherwise 0 (adult)."""
    if age<=16:
        return 1
    else:
        return 0

# Work on a copy so that adding the indicator column does not touch age_surv.
# (The earlier empty-string placeholder for 'kid' was overwritten immediately
# and has been removed.)
kid_surv=age_surv.copy()
kid_surv['kid']=kid_surv['Age'].apply(convert_kid)
kid_surv.head()

In [None]:
# Row counts for every (kid, Survived) combination
(
    kid_surv
    .groupby(['kid', 'Survived'], as_index=False)
    .count()
)

In [None]:
#chi-squared test of "kid" and "survived"
# Build the contingency table (rows: Survived, columns: kid) directly from the
# data instead of hand-copying the counts out of the groupby output.
# chi2_contingency is imported once in the notebook's first cell.
obs = pd.crosstab(kid_surv['Survived'], kid_surv['kid']).values

chi2_contingency(obs)   # these two variables are not independent

In [None]:
# Visualization: bar chart of survival by children vs adult
kid_surv_no=kid_surv[kid_surv['Survived']==0]
kid_surv_yes=kid_surv[kid_surv['Survived']==1]
# Count passengers per age group within each survival outcome
no_surv = kid_surv_no.groupby('kid',as_index=False).count()
surv = kid_surv_yes.groupby('kid',as_index=False).count()

n_group=2
objects=['Adult','Children']  # groupby sorts the kid flag ascending: 0=adult, 1=child
index=np.arange(n_group)
bar_width = 0.35
plt.figure(figsize=(5,3))
# align='edge' anchors every bar at its left edge, so the tick placed at
# index+bar_width sits between the two bars of a group regardless of the
# matplotlib default alignment.
plt.bar(index,no_surv['Survived'],bar_width,align='edge',label='Not survived')
plt.xticks(index+bar_width, objects)
plt.bar(index+bar_width,surv['Survived'],bar_width,align='edge',color='green',label='Survived')
plt.legend(fontsize=10)
plt.ylabel('Frequency',fontsize=10)
plt.suptitle('Survival by Age Group',fontsize=12)
plt.savefig('survival_by_kid.png');

#### Question 4: Relationship between fare and survival


In [None]:
#Subset holding only the survival flag and the fare
fare_survival = titan_df[[
    'Survived',
    'Fare',
]]

In [None]:
#t-test of the mean fare: non-survivors (Survived == 0) against survivors (Survived == 1)
fare_surv1 = fare_survival.loc[fare_survival['Survived'] == 0]
fare_surv2 = fare_survival.loc[fare_survival['Survived'] == 1]
ttest_ind(
    fare_surv1['Fare'],
    fare_surv2['Fare'],
)

In [None]:
#Visualization: histogram of fare by survival
# Both panels use the same bin edges and the same axis limits so the two
# distributions can be compared directly. The edges run in steps of 50 up to
# the x-axis limit of 600 used by the plot.
fare_bins = np.arange(0, 601, 50)

plt.figure(figsize=[20,6])

plt.subplot(1,2,1)
plt.hist(fare_surv1['Fare'],bins=fare_bins)
plt.title('Not survived',fontsize=16)
plt.ylabel('Frequency',fontsize=16)
plt.xlabel('Fare',fontsize=16)
plt.ylim(0,500)
plt.xlim(0,600)

plt.subplot(1,2,2)
plt.hist(fare_surv2['Fare'],bins=fare_bins)
plt.title('Survived',fontsize=16)
plt.ylim(0,500)
plt.xlim(0,600)
plt.xlabel('Fare',fontsize=16)

plt.suptitle('Distribution of Fare by Survival',fontsize=20)
plt.savefig("hist_fare.png");

#### Question 5: Relationship between class and survival

In [None]:
#Subset used for the class analysis, followed by row counts per (Pclass, Survived)
class_survival = titan_df[[
    'Survived',
    'Pclass',
    'Name',
]]  # 891 observations
(
    class_survival
    .groupby(['Pclass', 'Survived'], as_index=False)
    .count()
)

In [None]:
# Row counts per passenger class
(
    class_survival
    .groupby('Pclass', as_index=False)
    .count()
)

In [None]:
#chi-squared test of "Pclass" and "Survived"
# Build the contingency table (rows: Pclass, columns: Survived) directly from
# the data instead of hand-copying the counts out of the groupby output.
obs = pd.crosstab(class_survival['Pclass'], class_survival['Survived']).values
chi2_contingency(obs)   # these two variables are not independent

In [None]:
#Visualization: survival counts by class

class_surv1 = class_survival[class_survival['Survived']==0]
class_surv2 = class_survival[class_survival['Survived']==1]

#Count passengers per class within each survival outcome for the bar chart
no_surv_class = class_surv1.groupby('Pclass',as_index=False).count()
surv_class = class_surv2.groupby('Pclass',as_index=False).count()

n_group=3
objects=['1st Class','2nd Class', '3rd Class']  # groupby sorts Pclass ascending: 1, 2, 3
index=np.arange(n_group)
bar_width = 0.35
plt.figure(figsize=(5,3))
# align='edge' anchors every bar at its left edge, so the tick placed at
# index+bar_width sits between the two bars of a group regardless of the
# matplotlib default alignment.
plt.bar(index,no_surv_class['Survived'],bar_width,align='edge',label='Not survived')
plt.xticks(index+bar_width, objects)
plt.bar(index+bar_width,surv_class['Survived'],bar_width,align='edge',color='green',label='Survived')
plt.legend(loc='best',fontsize=10)
plt.ylabel('Frequency',fontsize=10)
plt.suptitle('Survival by class',fontsize=12)
plt.savefig('survival_by_class.png');

#### Question 6: Port of embarkation and survival

In [None]:
#Subset used for the port-of-embarkation analysis
port_survival = titan_df[[
    'Survived',
    'Embarked',
    'Name',
]]

In [None]:
# Row counts per port of embarkation (there are 2 missing values)
(
    port_survival
    .groupby('Embarked', as_index=False)
    .count()
)

In [None]:
# Discard rows with a missing value, then count rows per (Survived, Embarked)
port_surv = port_survival.dropna()
(
    port_surv
    .groupby(['Survived', 'Embarked'], as_index=False)
    .count()
)

In [None]:
#Perform a chi-squared test of independence between "Survived" and "Embarked"
# (one test over all ports together). The contingency table (rows: Survived,
# columns: Embarked) is built directly from the data, after dropping rows with
# a missing value, instead of hand-copying the counts out of the groupby output.
port_known = port_survival.dropna()
obs = pd.crosstab(port_known['Survived'], port_known['Embarked']).values
chi2_contingency(obs)

In [None]:
#Visualization: survival by embarkation port

port_no_surv = port_survival[port_survival['Survived']==0]
port_surv = port_survival[port_survival['Survived']==1]
#Count passengers per port within each survival outcome for the bar chart
no_surv_port = port_no_surv.groupby('Embarked',as_index=False).count()
surv_port = port_surv.groupby('Embarked',as_index=False).count()

n_group=3
# NOTE(review): labels assume the Embarked codes sort as C, Q, S — confirm against the data
objects=['Cherbourg','Queenstown', 'Southampton']
index=np.arange(n_group)
bar_width = 0.35
plt.figure(figsize=(5,3))
# align='edge' anchors every bar at its left edge, so the tick placed at
# index+bar_width sits between the two bars of a group regardless of the
# matplotlib default alignment.
plt.bar(index,no_surv_port['Survived'],bar_width,align='edge',label='Not survived')
plt.xticks(index+bar_width, objects)
plt.bar(index+bar_width,surv_port['Survived'],bar_width,align='edge',color='g',label='Survived')
plt.legend(loc='best',fontsize=10)
plt.ylabel('Frequency',fontsize=10)
plt.suptitle('Survival by port',fontsize=12)
plt.savefig('survival_by_port.png');

In [None]:
#Why did passengers who embarked from Cherbourg seem to have a higher survival rate?
#Compare the per-port means of survival, gender code, age, class and fare
ind_df = titan_df[[
    'Survived',
    'Sex',
    'Age',
    'Pclass',
    'Embarked',
    'Fare',
]]
(
    ind_df
    .groupby('Embarked', as_index=False)
    .mean()
)

In [None]:
#Class is probably the main reason, that is, passengers from Cherbourg were more likely to buy first-class tickets
class1 = ind_df[ind_df['Pclass']==1]
class2 = ind_df[ind_df['Pclass']==2]
class3 = ind_df[ind_df['Pclass']==3]

#Visualization: class by port
# Count passengers per port within each class
class_port1 = class1.groupby('Embarked',as_index=False).count()
class_port2 = class2.groupby('Embarked',as_index=False).count()
class_port3 = class3.groupby('Embarked',as_index=False).count()

n_group=3
# NOTE(review): labels assume the Embarked codes sort as C, Q, S — confirm against the data
objects=['Cherbourg','Queenstown', 'Southampton']
index=np.arange(n_group)
bar_width = 0.15
plt.figure(figsize=(5,3))
# align='edge' anchors every bar at its left edge, so the three bars of a group
# span index .. index+3*bar_width and the tick at index+1.5*bar_width marks the
# middle of the group regardless of the matplotlib default alignment.
plt.bar(index,class_port1['Pclass'],bar_width,align='edge',color='b',label='1st Class')
plt.xticks(index+1.5*bar_width, objects)
plt.bar(index+bar_width,class_port2['Pclass'],bar_width,align='edge',color='g',label='2nd Class')
plt.bar(index+2*bar_width,class_port3['Pclass'],bar_width,align='edge',color='gray',label='3rd Class')

plt.legend(loc='best',fontsize=10)
plt.ylabel('Frequency',fontsize=10)
plt.suptitle('Class by port',fontsize=12)
plt.savefig('class_by_port.png');

#### Question 7: How did different factors interact in determining survival?

Passengers in first-class cabins had better access to lifeboats and a higher survival rate, while women and children in general were given priority to evacuate. This raises a question: did women and children in lower-class cabins have the same chance of getting into lifeboats as those in first-class cabins?

In [None]:
#Subset with the survival flag, class, coded gender and age
int_df = titan_df[[
    'Survived',
    'Pclass',
    'Sex',
    'Age',
]]

#Rows of that subset where Sex == 1 (the code used for female)
fem_int_df = int_df.loc[int_df['Sex'] == 1]

In [None]:
# Row counts for every (Pclass, Survived) combination among female passengers
(
    fem_int_df
    .groupby(['Pclass', 'Survived'], as_index=False)
    .count()
)

In [None]:
#Perform a chi-squared test comparing survival of female by class
# Build the contingency table (rows: Survived, columns: Pclass) directly from
# the data instead of hand-copying the counts out of the groupby output.
obs = pd.crosstab(fem_int_df['Survived'], fem_int_df['Pclass']).values
chi2_contingency(obs)

In [None]:
#Visualization: female survival counts by class

#Count female passengers per class within each survival outcome for the bar chart
no_surv=fem_int_df[fem_int_df['Survived']==0]
surv=fem_int_df[fem_int_df['Survived']==1]

no_surv_class = no_surv.groupby('Pclass',as_index=False).count()
surv_class = surv.groupby('Pclass',as_index=False).count()

n_group=3
objects=['1st Class','2nd Class', '3rd Class']  # groupby sorts Pclass ascending: 1, 2, 3
index=np.arange(n_group)
bar_width = 0.35
plt.figure(figsize=(5,3))
# align='edge' anchors every bar at its left edge, so the tick placed at
# index+bar_width sits between the two bars of a group regardless of the
# matplotlib default alignment.
plt.bar(index,no_surv_class['Survived'],bar_width,align='edge',color='b',label='Not survived')
plt.xticks(index+bar_width, objects)
plt.bar(index+bar_width,surv_class['Survived'],bar_width,align='edge',color='green',label='Survived')
plt.legend(loc='best',fontsize=10)
plt.ylabel('Frequency',fontsize=10)
plt.suptitle('Female survival by class',fontsize=12)
plt.savefig('female_survival_by_class.png');

In [None]:
#Interaction between age and class
#"Children" are defined here as passengers aged 16 and below
kid_int_df = int_df.loc[int_df['Age'] <= 16]
(
    kid_int_df
    .groupby('Pclass', as_index=False)
    .mean()
)

In [None]:
# Row counts for every (Pclass, Survived) combination among children
(
    kid_int_df
    .groupby(['Pclass', 'Survived'], as_index=False)
    .count()
)

In [None]:
#Perform a chi-squared test comparing the survival rate of children by class
# Build the contingency table (rows: Survived, columns: Pclass) directly from
# the data instead of hand-copying the counts out of the groupby output.
obs = pd.crosstab(kid_int_df['Survived'], kid_int_df['Pclass']).values
chi2_contingency(obs)

In [None]:
#Visualization: children's survival counts by class

#Count children per class within each survival outcome for the bar chart
no_surv=kid_int_df[kid_int_df['Survived']==0]
surv=kid_int_df[kid_int_df['Survived']==1]

no_surv_class = no_surv.groupby('Pclass',as_index=False).count()
surv_class = surv.groupby('Pclass',as_index=False).count()

n_group=3
objects=['1st Class','2nd Class', '3rd Class']  # groupby sorts Pclass ascending: 1, 2, 3
index=np.arange(n_group)
bar_width = 0.35
plt.figure(figsize=(5,3))
# align='edge' anchors every bar at its left edge, so the tick placed at
# index+bar_width sits between the two bars of a group regardless of the
# matplotlib default alignment.
plt.bar(index,no_surv_class['Survived'],bar_width,align='edge',label='Not survived')
plt.xticks(index+bar_width, objects)
plt.bar(index+bar_width,surv_class['Survived'],bar_width,align='edge',color='g',label='Survived')
plt.legend(loc='best',fontsize=10)
plt.ylabel('Frequency',fontsize=10)
plt.suptitle('Children survival by class',fontsize=12)
plt.savefig('kid_survival_by_class.png');

#### Question 8: Family loss of passengers who survived


In [None]:
# Working assumption of this analysis: every passenger has a ticket number and
# family members share one, so passengers with the same ticket number who have
# "SibSp" or "Parch" on board are treated as family.
family_df = titan_df[[
    'Survived',
    'Pclass',
    'SibSp',
    'Parch',
    'Ticket',
]]
# Copy first, so the new columns are not written into a slice of titan_df
fam_df = family_df.copy()
# 'fam': total number of family members on board (siblings/spouses + parents/children)
fam_df['fam'] = fam_df['SibSp'].add(fam_df['Parch'])
# 'loss': column initialised with empty strings
fam_df['loss'] = ''
fam_df.head()

In [None]:
#Order the rows by ticket number so passengers sharing a ticket sit together
fam_df = fam_df.sort_values(by='Ticket')
fam_df.head()

In [None]:
#Get the mean of survival (and of the other numeric columns) by ticket number
# The numeric columns are selected explicitly because fam_df also carries the
# string column 'loss', which cannot be averaged: recent pandas versions raise
# a TypeError instead of silently skipping such columns.
numeric_cols = ['Survived','Pclass','SibSp','Parch','fam']
fam_sur_mean = fam_df.groupby('Ticket',as_index=False)[numeric_cols].mean()
fam_sur_mean.head()

In [None]:
# Keep only the ticket number and its mean survival
temp_df = fam_sur_mean[[
    'Ticket',
    'Survived',
]]

In [None]:
# Rename the per-ticket mean so it does not clash with the per-passenger 'Survived' column
temp_df = temp_df.rename(
    columns={'Survived': 'MeanSurv'},
)

In [None]:
# Attach the per-ticket mean survival to every passenger row.
# If the mean of survival is 1, passengers with the same ticket number all
# survived; otherwise, at least one of them died.
fam_sur = fam_df.merge(temp_df, on='Ticket')
fam_sur.head()

In [None]:
# Number of non-missing values in each column of the merged frame
fam_sur.count()

In [None]:
# Rows of passengers who survived; 'MeanSurv' on each row tells whether everyone
# on the same ticket survived as well
survivor_df = fam_sur.loc[fam_sur['Survived'] == 1]
survivor_df.count()

In [None]:
#A survivor is counted as having lost family when they had family members on
#board (SibSp + Parch > 0) and the mean survival of their ticket is below 1.

fam_loss_df = survivor_df[
    (survivor_df['fam'] > 0)
    & (survivor_df['MeanSurv'] < 1)
]
fam_loss_df.count()

In [None]:
#Visualization: family size on board for the survivors flagged as having lost family
# 'fam' is SibSp + Parch, i.e. the number of family members on board, not the
# number of them who died, so the x-axis is labelled accordingly.
plt.figure(figsize=(6,4))
plt.hist(fam_loss_df['fam'],bins=6)
plt.ylabel('Frequency',fontsize=10)
plt.xlabel('Number of family members on board',fontsize=8)
plt.suptitle('Family loss of survivors',fontsize=12)
plt.savefig('hist_fam_loss');