In [1]:
# Imports

# pandas
import pandas as pd
from pandas import Series,DataFrame
import statsmodels.api as sm

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model

In [2]:
# Read the titanic train and test CSV files into DataFrames, forcing Age to float64.
train_csv = '/Users/bnamatherdhala/Downloads/titanic/train.csv'
test_csv = '/Users/bnamatherdhala/Downloads/titanic/test.csv'
read_options = dict(dtype={"Age": np.float64})
titanic_df = pd.read_csv(train_csv, **read_options)
test_df = pd.read_csv(test_csv, **read_options)

# Preview the training data. head() and describe() are bare expressions in the
# middle of the cell, so only the info() summary is actually printed.
titanic_df.head()

titanic_df.describe()
titanic_df.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


In [None]:
# Number of male and female passengers on board. factorplot() only draws
# counts when kind='count' is given; sns.countplot() is the alternative
# that needs no kind argument.
sns.factorplot(x='Sex', data=titanic_df, kind='count')
# Distribution of passenger classes on board, split by sex.
sns.factorplot(x='Pclass', hue='Sex', data=titanic_df, kind='count')
# NOTE(review): hue repeats the x variable here, so it adds no extra grouping -- confirm this plot is intended.
sns.factorplot(x='Ticket', hue="Ticket", data=titanic_df, kind='count')

In [None]:
# Split passengers into infant / child / male / female for the 'People' column.
def male_female_child(passenger):
    """Classify a passenger as 'infant', 'child', or by sex.

    Parameters
    ----------
    passenger : sequence of (age, sex)
        Two-item sequence, e.g. one row of titanic_df[['Age', 'Sex']].

    Returns
    -------
    'infant' when age < 5, 'child' when 5 <= age < 10, otherwise the
    passenger's sex unchanged. A NaN age fails both comparisons, so the
    sex is returned for passengers with unknown age.
    """
    age, sex = passenger
    # The narrower range must be tested first; with `age < 10` first the
    # infant branch could never be reached.
    if age < 5:
        return 'infant'
    if age < 10:
        return 'child'
    return sex
    


In [None]:
# Derive the 'People' column by classifying every (Age, Sex) row.
# axis=1 makes apply() pass one row at a time to the function.
age_and_sex = titanic_df[['Age', 'Sex']]
titanic_df['People'] = age_and_sex.apply(male_female_child, axis=1)
# Show the first 20 rows, including the new column.
titanic_df.iloc[:20]



In [None]:
# Passenger counts per 'People' category.
sns.factorplot(x='People', data=titanic_df, kind='count')

In [None]:

# Passenger counts per class, with one bar per 'People' category.
sns.factorplot(x='Pclass', hue='People', data=titanic_df, kind='count')


In [None]:
# Replace missing ages with the median age, then plot the age histogram
# (no KDE curve) with a rug of individual observations.
median_age = titanic_df['Age'].median()
titanic_df['Age'] = titanic_df['Age'].fillna(median_age)

sns.distplot(titanic_df['Age'], kde=False, rug=True);

In [None]:
# Age distribution of the passengers, using the Series' own hist() method.
passenger_ages = titanic_df['Age']
passenger_ages.hist(bins=70)


In [None]:
# Encode Sex as an integer 'Gender' column (female -> 0, male -> 1) and plot it.
gender_codes = {'female':0, 'male':1}
titanic_df['Gender'] = titanic_df['Sex'].map(gender_codes).astype(int)

sns.distplot(titanic_df['Gender'], kde=True, rug=True);

In [None]:
# Number of passengers in each 'People' category.
# (Bare expression in the middle of the cell: evaluated but not displayed.)
titanic_df['People'].value_counts()


# The same counts via groupby. groupby is a method, so it must be called
# with parentheses, and the frame is named titanic_df (there is no `titanic`).
group1 = titanic_df.groupby('People')
group1.People.count()

In [None]:
# Cabin column: the first letter of a cabin denotes its level (e.g. A,B,C,D,E,F,G).
# Null cabins are dropped first, then the leading letter of each remaining
# value is collected into its own single-column DataFrame for plotting.
deck = titanic_df['Cabin'].dropna()
levels = [cabin[0] for cabin in deck]
cabin_df = DataFrame(levels, columns=['Cabin'])
sns.factorplot(x='Cabin', data=cabin_df, kind='count', palette='winter_d')


In [None]:
# Column summaries of the training and test frames, separated by a rule.
separator = "----------------------------"
titanic_df.info()
print(separator)
test_df.info()

In [None]:
# How many passengers travelled alone (no siblings, spouse, parents or children)?
# SibSp + Parch is the number of relatives on board; 0 means the passenger is alone.
family_size = titanic_df['SibSp'] + titanic_df['Parch']
# Build the labels in one vectorised step. The previous chained
# `titanic_df['Alone'].loc[mask] = ...` form assigns through an intermediate
# Series, which pandas does not guarantee to write back to the frame, and it
# compared a half-relabelled int/str column against 0 in its second step.
titanic_df['Alone'] = np.where(family_size > 0, "With Family", "Alone")
sns.factorplot(x='Alone', data=titanic_df, kind='count', palette='Blues')


In [None]:
# Summary statistics for passengers older than 70.
columns_of_interest = ['Age','Sex','Pclass','Survived','Fare', 'Ticket']
over_70 = titanic_df[titanic_df['Age'] > 70]
over_70[columns_of_interest].describe()

In [None]:
# Survival rate per passenger class.
sns.factorplot(x='Pclass', y='Survived', data=titanic_df)
# Linear fit of survival against age, overall and then per class.
sns.lmplot(x='Age', y='Survived', data=titanic_df)
sns.lmplot(x='Age', y='Survived', hue='Pclass', data=titanic_df, palette='winter')
# Same fit per sex, with ages grouped into the bins given by `generations`.
generations = [10,20,40,60,80]
sns.lmplot(x='Age', y='Survived', hue='Sex', data=titanic_df,
           palette='winter', x_bins=generations)



In [None]:

# Fill any remaining missing ages with the median age.
median_age = titanic_df.Age.median()
titanic_df['Age'] = titanic_df['Age'].fillna(median_age)

# Summary statistics for passengers older than 55.
over_55 = titanic_df[titanic_df['Age'] > 55]
over_55[['Age','Sex','Pclass','Survived','Fare']].describe()

In [None]:
# Age density per passenger class, all classes drawn on one wide facet.
fig = sns.FacetGrid(titanic_df,hue='Pclass',aspect=4)
# shade takes a boolean; the string 'True' only worked because any
# non-empty string is truthy.
fig.map(sns.kdeplot,'Age',shade=True)
# Limit the x axis to the age of the oldest passenger.
oldest = titanic_df['Age'].max()
fig.set(xlim=(0,oldest))
fig.add_legend()

In [None]:



# Number of survivors in each passenger class (1-3), males then females.
survived = titanic_df['Survived'] == 1
is_male = titanic_df['Sex'] == 'male'
is_female = titanic_df['Sex'] == 'female'
for i in range(1,4):
    in_class = titanic_df['Pclass'] == i
    print (i, ' male ' , len(titanic_df[is_male & survived & in_class]))
    print (i, 'female' , len(titanic_df[is_female & survived & in_class]))

In [None]:
# Besides a histogram, seaborn's kdeplot() gives a smooth density estimate.
# The x axis is capped at the age of the oldest passenger.
oldest = titanic_df['Age'].max()
# FacetGrid lets several kdeplots (one per sex) share a single wide plot.
fig = sns.FacetGrid(titanic_df, hue='Sex', aspect=4)
# FacetGrid.map() draws sns.kdeplot of Age once for every hue level.
fig.map(sns.kdeplot, 'Age', shade=True)
fig.set(xlim=(0, oldest))
fig.add_legend()

In [None]:
# Passenger counts by sex, plus the number of rows where Sex is missing.
# Missing values are NaN, not the string 'NaN', so they have to be counted
# with isnull(); comparing against 'NaN' always yields 0. The former
# single-iteration loop only added a constant "1" to each printed line.
sex = titanic_df['Sex']
print('male {0}'.format(len(titanic_df[sex == 'male'])))
print('female {0}'.format(len(titanic_df[sex == 'female'])))
print('NaN {0}'.format(int(sex.isnull().sum())))

In [None]:
# Number of passengers who did not survive in each class (1-3), males then females.

not_survived = titanic_df['Survived'] == 0
is_male = titanic_df['Sex'] == 'male'
is_female = titanic_df['Sex'] == 'female'
for i in range(1,4):
    in_class = titanic_df['Pclass'] == i
    print (i, ' male ' , len(titanic_df[is_male & not_survived & in_class]))
    print (i, 'female' , len(titanic_df[is_female & not_survived & in_class]))

In [None]:
# Number of passengers for each Embarked code.
embarked = titanic_df['Embarked']
for i in ('S', 'C', 'Q'):
    from_port = titanic_df[embarked == i]
    print (i, len(from_port))
    
    


In [None]:
# Fill missing Embarked values with 'S'.
embarked = titanic_df['Embarked']
titanic_df['Embarked'] = embarked.fillna('S')


In [None]:
# Count of male survivors in first class who embarked at 'C'.
is_male = titanic_df['Sex'] == 'male'
survived = titanic_df['Survived'] == 1
first_class = titanic_df['Pclass'] == 1
from_c = titanic_df['Embarked'] == 'C'
t_male = len(titanic_df[is_male & survived & first_class & from_c])
t_male

In [None]:
# Count of female survivors in second class who embarked at 'C'.
is_female = titanic_df['Sex'] == 'female'
survived = titanic_df['Survived'] == 1
second_class = titanic_df['Pclass'] == 2
from_c = titanic_df['Embarked'] == 'C'
t_female = len(titanic_df[is_female & survived & second_class & from_c])
t_female

In [None]:
# Survival counts and percentages by sex.

is_male = titanic_df['Sex'] == 'male'
is_female = titanic_df['Sex'] == 'female'
# Each comparison is parenthesised (here: named) before combining with &.
# `&` binds tighter than `==`, so `a & b == 1` is evaluated as `(a & b) == 1`
# and only gave the right answer because Survived holds 0/1.
survived = titanic_df['Survived'] == 1

total_male = len(titanic_df[is_male])
total_female = len(titanic_df[is_female])
num_males_survived = len(titanic_df[is_male & survived])
# Uses the same `== 'female'` filter as total_female so that numerator and
# denominator of the female percentage cover the same rows.
num_females_survived = len(titanic_df[is_female & survived])
num_males_survied_Pclass = len(titanic_df[is_male & (titanic_df['Pclass'] == 1) & survived])

# Single-argument print(...) with a formatted string runs on both
# Python 2 and Python 3; the bare `print x, y` statement is Python 2 only.
print('{0}  :Total  males Survived'.format(num_males_survived))
print('{0}  : Total males survived Pclass =1 '.format(num_males_survied_Pclass))
print('{0} % of males survived'.format(num_males_survived / float(total_male) * 100))
print('{0} % of males survived Pclass'.format(num_males_survied_Pclass / float(num_males_survived) * 100))
print('{0} % of females survived'.format(num_females_survived / float(total_female) * 100))


In [None]:
# regplot, swarmplot and barplot are axes-level functions: without ax= they
# draw on the current axes. lmplot creates its own figure, so the two plots
# that follow it used to be painted on top of the lmplot panel. Each
# axes-level plot now gets its own figure.
fig_reg, ax_reg = plt.subplots()
sns.regplot(x="Survived", y="Pclass", data=titanic_df, ax=ax_reg);

sns.lmplot(x="Survived", y="Age", data=titanic_df);

fig_swarm, ax_swarm = plt.subplots()
sns.swarmplot(x="Pclass", y="Age", hue='Survived', data=titanic_df, ax=ax_swarm);

fig_bar, ax_bar = plt.subplots()
sns.barplot(x="Sex", y="Survived", hue="Pclass", data=titanic_df, ax=ax_bar);

In [None]:
# Mean of Survived per sex, one bar per passenger class.
sns.barplot(
    x="Sex",
    y="Survived",
    hue="Pclass",
    data=titanic_df
);

In [None]:
# Horizontal count of passengers per sex, drawn in a single colour.
sns.countplot(
    y="Sex",
    data=titanic_df,
    color="c"
);

In [None]:
# Numeric features: Gender (female -> 0, male -> 1) and Family (Parch + SibSp).
gender_codes = {'female':0,'male':1}
titanic_df['Gender'] = titanic_df['Sex'].map(gender_codes).astype(int)
titanic_df['Family'] = titanic_df['Parch'].add(titanic_df['SibSp'])
# Bare expression in the middle of the cell: evaluated but not displayed.
titanic_df.describe()

# List the columns whose dtype is still 'object'.
column_dtypes = titanic_df.dtypes
column_dtypes[column_dtypes.map(lambda dtype: dtype =='object')]



In [None]:
# drop unnecessary columns, these columns won't be useful in analysis and prediction
titanic_df = titanic_df.drop(['Name','Ticket','Sex','Cabin','SibSp','Parch', 'Embarked'], axis=1)
# 'People' and 'Alone' are string-valued helper columns created by earlier
# exploratory cells. If left in place they end up in the feature matrix built
# from titanic_df and the classifiers cannot be fitted on them. They are
# dropped only when present, since those cells may not have been run.
leftover_helpers = [col for col in ('People', 'Alone') if col in titanic_df.columns]
titanic_df = titanic_df.drop(leftover_helpers, axis=1)
titanic_df.info()

In [None]:
def clean_up_df(df):
    """Return a cleaned copy of a Titanic passenger DataFrame.

    The input frame is left untouched; all changes are made on a copy.

    Steps applied:
      * Age    - missing values filled with the column median.
      * Gender - new int column mapped from Sex (female -> 0, male -> 1).
      * Family - new column, Parch + SibSp.
      * Fare   - missing values filled with the column mean.
      * SibSp, Parch, Sex, Name, Cabin, Embarked and Ticket are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Age, Sex, Parch, SibSp, Fare, Name,
        Cabin, Embarked and Ticket.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    KeyError
        If a required column is missing (for example when the frame has
        already been cleaned once).
    """
    # Work on a copy so the caller's frame does not silently gain the
    # Gender/Family columns or have its Age/Fare values overwritten.
    cleaned = df.copy()
    cleaned['Age'] = cleaned['Age'].fillna(cleaned['Age'].median())
    cleaned['Gender'] = cleaned['Sex'].map({'female': 0, 'male': 1}).astype(int)
    cleaned['Family'] = cleaned['Parch'] + cleaned['SibSp']
    cleaned['Fare'] = cleaned['Fare'].fillna(cleaned['Fare'].mean())
    return cleaned.drop(['SibSp', 'Parch', 'Sex', 'Name', 'Cabin', 'Embarked', 'Ticket'], axis=1)

In [None]:
# Clean the test frame with clean_up_df, then compare both frames' columns.
separator = "----------------------------"
test_df = clean_up_df(test_df)
test_df.info()
print(separator)
titanic_df.info()

In [None]:
# Pairwise relationships between all columns, coloured by survival.
sns.pairplot(
    titanic_df,
    hue='Survived',
    size=2.0
)

In [None]:
# Fare

# Fill missing "Fare" values in test_df with the median fare. The result is
# assigned back to the column instead of calling fillna(inplace=True) on the
# selected column, which is chained assignment and not guaranteed to modify
# the frame.
test_df["Fare"] = test_df["Fare"].fillna(test_df["Fare"].median())

# convert from float to int (truncates the fractional part of each fare)
titanic_df['Fare'] = titanic_df['Fare'].astype(int)
test_df['Fare']    = test_df['Fare'].astype(int)

# fares of passengers who did not survive / who survived
fare_not_survived = titanic_df.loc[titanic_df["Survived"] == 0, "Fare"]
fare_survived     = titanic_df.loc[titanic_df["Survived"] == 1, "Fare"]

# mean and standard deviation of the fare for each group
# (row 0 = not survived, row 1 = survived)
avgerage_fare = DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare      = DataFrame([fare_not_survived.std(), fare_survived.std()])

# histogram of all fares, x axis limited to 0-50
titanic_df['Fare'].plot(kind='hist', figsize=(15,3),bins=100, xlim=(0,50))

# bar chart of the mean fare per group with the std as error bars
avgerage_fare.index.names = std_fare.index.names = ["Survived"]
avgerage_fare.plot(yerr=std_fare,kind='bar',legend=False)

In [None]:
# Age: fill missing ages with random integers drawn around the mean, and
# compare the Age histogram before and after.
# NOTE(review): earlier cells already fill Age with the median, so there may
# be nothing left to impute here -- confirm which imputation is intended.
# NOTE(review): no random seed is set, so imputed values differ between runs.

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(16,4))
axis1.set_title('Original Age values - Titanic')
axis2.set_title('New Age values - Titanic')

# axis3.set_title('Original Age values - Test')
# axis4.set_title('New Age values - Test')

# get average, std, and number of NaN values in titanic_df
average_age_titanic   = titanic_df["Age"].mean()
std_age_titanic       = titanic_df["Age"].std()
count_nan_age_titanic = titanic_df["Age"].isnull().sum()

# get average, std, and number of NaN values in test_df
average_age_test   = test_df["Age"].mean()
std_age_test       = test_df["Age"].std()
count_nan_age_test = test_df["Age"].isnull().sum()

# generate random integers in [mean - std, mean + std).
# randint expects integer bounds, so the float bounds are truncated explicitly.
rand_1 = np.random.randint(int(average_age_titanic - std_age_titanic),
                           int(average_age_titanic + std_age_titanic),
                           size = count_nan_age_titanic)
rand_2 = np.random.randint(int(average_age_test - std_age_test),
                           int(average_age_test + std_age_test),
                           size = count_nan_age_test)

# plot Age values before filling
# NOTE: drop all null values, and convert to int
titanic_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)
# test_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

# fill NaN values in the Age column with the generated values.
# A single .loc[rows, column] assignment writes to the frame itself;
# the chained df["Age"][mask] = ... form may write to a temporary copy.
if count_nan_age_titanic:
    titanic_df.loc[titanic_df["Age"].isnull(), "Age"] = rand_1
if count_nan_age_test:
    test_df.loc[test_df["Age"].isnull(), "Age"] = rand_2

# convert from float to int
titanic_df['Age'] = titanic_df['Age'].astype(int)
test_df['Age']    = test_df['Age'].astype(int)

# plot new Age Values
titanic_df['Age'].hist(bins=70, ax=axis2)
# test_df['Age'].hist(bins=70, ax=axis4)

In [None]:

# NOTE(review): this cell repeats the preceding Age cell (only figsize differs).
# If that cell has run, Age is already filled and cast to int, so nothing is
# imputed here and both histograms show the same data -- consider removing one.
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
axis1.set_title('Original Age values - Titanic')
axis2.set_title('New Age values - Titanic')

# axis3.set_title('Original Age values - Test')
# axis4.set_title('New Age values - Test')

# rows with a missing Age in each frame
missing_age_titanic = titanic_df["Age"].isnull()
missing_age_test    = test_df["Age"].isnull()

# get average, std, and number of NaN values in titanic_df
average_age_titanic   = titanic_df["Age"].mean()
std_age_titanic       = titanic_df["Age"].std()
count_nan_age_titanic = missing_age_titanic.sum()

# get average, std, and number of NaN values in test_df
average_age_test   = test_df["Age"].mean()
std_age_test       = test_df["Age"].std()
count_nan_age_test = missing_age_test.sum()

# generate random integers in [mean - std, mean + std);
# bounds are truncated to int because randint expects integer bounds
low_titanic, high_titanic = int(average_age_titanic - std_age_titanic), int(average_age_titanic + std_age_titanic)
low_test, high_test       = int(average_age_test - std_age_test), int(average_age_test + std_age_test)
rand_1 = np.random.randint(low_titanic, high_titanic, size = count_nan_age_titanic)
rand_2 = np.random.randint(low_test, high_test, size = count_nan_age_test)

# plot Age values before filling
# NOTE: drop all null values, and convert to int
titanic_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)
# test_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

# fill NaN values in the Age column with the generated values, using one
# .loc[rows, column] assignment per frame instead of chained indexing
if count_nan_age_titanic:
    titanic_df.loc[missing_age_titanic, "Age"] = rand_1
if count_nan_age_test:
    test_df.loc[missing_age_test, "Age"] = rand_2

# convert from float to int
titanic_df['Age'] = titanic_df['Age'].astype(int)
test_df['Age']    = test_df['Age'].astype(int)

# plot new Age Values
titanic_df['Age'].hist(bins=70, ax=axis2)
# test_df['Age'].hist(bins=70, ax=axis4)

In [None]:
# Age column, continued.

# Age density for survivors and non-survivors, x axis capped at the oldest age.
max_age = titanic_df['Age'].max()
facet = sns.FacetGrid(titanic_df, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, max_age))
facet.add_legend()

# Mean of Survived for every distinct age, drawn as bars on a new wide figure.
fig, axis1 = plt.subplots(1, 1, figsize=(18, 4))
age_and_survival = titanic_df[["Age", "Survived"]]
average_age = age_and_survival.groupby('Age', as_index=False).mean()
sns.barplot(x='Age', y='Survived', data=average_age, ax=axis1)

In [None]:
# Feature matrix X (everything except the target and the passenger id)
# and target vector Y (the Survived column).

non_feature_columns = ["Survived", 'PassengerId']
X = titanic_df.drop(non_feature_columns, axis=1)
Y = titanic_df.loc[:, "Survived"]


In [None]:
# Logistic Regression: fit on X/Y, then show the score on that same data.

logreg = LogisticRegression().fit(X, Y)

logreg.score(X, Y)



In [None]:
# Fit a statsmodels Logit of Y on X and report the summary, the confidence
# intervals, and the exponentiated coefficients.
# NOTE(review): X is passed without sm.add_constant, so no intercept column
# is added here -- confirm that is intended.
logit = sm.Logit( Y, X)
result = logit.fit()
# print(...) with a single argument runs on both Python 2 and Python 3;
# the bare `print x` statement is a syntax error on Python 3.
print(result.summary())
print(result.conf_int())
print(np.exp(result.params))

In [None]:
# Table of exponentiated coefficients ('OR') with their confidence bounds.
params = result.params
conf = result.conf_int()
conf['OR'] = params
conf.columns = ['2.5%', '97.5%', 'OR']
# Exponentiate bounds and coefficients together. print(...) is used so the
# cell runs on Python 3 as well as Python 2.
print(np.exp(conf))

In [None]:
# Test features: every test_df column except the passenger id.
X_test = test_df.drop('PassengerId', axis=1)
X_test.info()


In [None]:
# Predictions of the logistic regression model on the test features.
# Stored as Y_pred, the name the other model cells and the submission cell
# use; the lowercase y_pred alias is kept for backward compatibility.
Y_pred = y_pred = logreg.predict(X_test)

In [None]:
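# Support Vector Machines (disabled)
# NOTE: X_train / Y_train are not defined anywhere in this notebook; the other
# model cells fit on X and Y. Rename them before re-enabling this cell.

# svc = SVC()

# svc.fit(X_train, Y_train)

# Y_pred = svc.predict(X_test)

# svc.score(X_train, Y_train)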

In [None]:
# Random Forests

# random_state is fixed so the fitted forest, its predictions and its score
# are the same on every run; without it each run builds a different forest.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)

random_forest.fit(X, Y)

# Overwrites Y_pred, which the submission cell reads.
Y_pred = random_forest.predict(X_test)

# Score on the same data the model was fitted on.
random_forest.score(X, Y)

In [None]:
# k-nearest neighbours with k = 3, fitted on X/Y.
knn = KNeighborsClassifier(n_neighbors=3).fit(X, Y)

# Overwrites Y_pred with this model's predictions for the test features.
Y_pred = knn.predict(X_test)

# Score on the same data the model was fitted on.
knn.score(X, Y)

In [None]:
# Gaussian Naive Bayes, fitted on X/Y.

gaussian = GaussianNB().fit(X, Y)

# Overwrites Y_pred with this model's predictions for the test features.
Y_pred = gaussian.predict(X_test)

# Score on the same data the model was fitted on.
gaussian.score(X, Y)

In [None]:
# Logistic regression coefficient for each feature.
# The feature names come from X.columns, the exact columns logreg was fitted
# on. titanic_df.columns.delete(0) only removed the first column and still
# contained 'Survived', so every name was paired with the wrong coefficient.
coeff_df = DataFrame(X.columns)
coeff_df.columns = ['Features']
coeff_df["Coefficient Estimate"] = pd.Series(logreg.coef_[0])

# preview
coeff_df

In [74]:
# Build the submission table (passenger id plus predicted survival) and
# write it to CSV without the index column. Y_pred holds the predictions
# of whichever model cell assigned it most recently.
submission_path = '/Users/bnamatherdhala/Downloads/titanic.csv'
submission_columns = {
    "PassengerId": test_df["PassengerId"],
    "Survived": Y_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv(submission_path, index=False)