In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline

In [None]:
# Load the Titanic training and test CSV files from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('titanic-train.csv', 'titanic-test.csv')
)

In [None]:
# Explore the dataset: print the per-column summary, then preview the first rows.
# Author's observation: Age, Cabin and Embarked have null values.
train.info()
train.head()

In [None]:
# Drop columns that are not used as model features.
# `inplace` must be the boolean True: the string 'True' only worked because it is
# truthy, and pandas versions that validate this argument reject a str.
train.drop(['PassengerId', 'Ticket', 'Name'], axis=1, inplace=True)
test.drop(['PassengerId', 'Ticket', 'Name'], axis=1, inplace=True)

In [None]:
# Check for missing rows / NaN values.
# The single-argument call form print(x) behaves the same on Python 2 and Python 3.
# print(train.isnull().any())
# print(train['Embarked'].isnull().value_counts())
# Value counts for Embarked
print(train['Embarked'].value_counts())
# Number of nulls in each column of the dataframe
print(train.isnull().sum())

In [None]:
# Data cleansing for the Embarked column.
# Fill missing Embarked values in the training set with 'S'.
train['Embarked'] = train['Embarked'].fillna('S')

# Mean of Survived for each Embarked value.
# NOTE(review): seaborn 0.9 renamed factorplot -> catplot and size -> height;
# this call needs updating if the notebook is run on a newer seaborn.
sns.factorplot('Embarked', 'Survived', data=train, size=4, aspect=3)

# Three side-by-side panels: passenger count per port, survived/not-survived
# counts split by port, and mean survival per port.
fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(15, 5))

sns.countplot(x='Embarked', data=train, ax=axis1)
sns.countplot(x='Survived', hue="Embarked", data=train, order=[1,0], ax=axis2)
embark_perc = train[["Embarked", "Survived"]].groupby(['Embarked'], as_index=False).mean()
sns.barplot(x='Embarked', y='Survived', data=embark_perc, order=['S','C','Q'], ax=axis3)


# Create dummies for Embarked and remove one of the dummy columns - Train
embark_dummies_titanic = pd.get_dummies(train['Embarked'])
embark_dummies_titanic.columns = ['Embarked_'+s for s in embark_dummies_titanic.columns]
embark_dummies_titanic.drop(['Embarked_S'], axis=1, inplace=True)
# Create dummies for Embarked and remove one of the dummy columns - Test
embark_dummies_test = pd.get_dummies(test['Embarked'])
embark_dummies_test.columns = ['Embarked_'+s for s in embark_dummies_test.columns]
embark_dummies_test.drop(['Embarked_S'], axis=1, inplace=True)

# Remove the raw Embarked column from both sets.
# `inplace` is the boolean True here; the original passed the string 'True',
# which only worked by truthiness.
train.drop(['Embarked'], axis=1, inplace=True)
test.drop(['Embarked'], axis=1, inplace=True)

# NOTE(review): the dummy frames above are built but never joined (the join
# below is commented out), so Embarked does not reach the models. Confirm this
# is intended; to use Embarked as a feature, enable the two lines below.
#train = train.join(embark_dummies_titanic)
#test  = test.join(embark_dummies_test)

In [None]:
# Preview the first five rows of the training frame (head() defaults to n=5).
train.head()

In [None]:
# Data cleansing for the Fare column.
# Fill missing test fares with the test-set median. The result is assigned back
# to the frame: calling fillna(inplace=True) on a selected column can operate on
# a temporary copy and leave `test` unchanged.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# Truncate fares to integers in both sets.
train['Fare'] = train['Fare'].astype(int)
test['Fare'] = test['Fare'].astype(int)

# Fares split by outcome.
fare_not_survived = train.Fare[train['Survived']==0]
fare_survived = train.Fare[train['Survived']==1]

# Mean and standard deviation of fare per outcome (row 0 = not survived, row 1 = survived).
avgerage_fare = pd.DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare      = pd.DataFrame([fare_not_survived.std(), fare_survived.std()])
# print(x) with a single argument behaves the same on Python 2 and Python 3.
print(avgerage_fare)
print(std_fare)
avgerage_fare.columns = std_fare.columns = ['Survived']

# Histogram of training fares, x-axis limited to 0-50.
train['Fare'].plot(kind='hist', figsize=(15,3), bins=100, xlim=(0,50))

# Bar chart of mean fare per outcome with the standard deviation as error bars.
avgerage_fare.index.names = std_fare.index.names = ["Survived"]
avgerage_fare.plot(yerr=std_fare, kind='bar', legend=False)

In [None]:
# Data cleansing for the Age column: missing ages are replaced by random
# integers drawn from [mean - std, mean + std) of the respective data set.
average_age_titanic   = train["Age"].mean()
std_age_titanic       = train["Age"].std()
count_nan_age_titanic = train["Age"].isnull().sum()

average_age_test   = test["Age"].mean()
std_age_test       = test["Age"].std()
count_nan_age_test = test["Age"].isnull().sum()

# Formatting with %s keeps the space-separated output and works on Python 2 and 3.
print(train['Age'].count())
print('%s %s %s' % (average_age_titanic, std_age_titanic, count_nan_age_titanic))

# Seeded, local generator so that Restart & Run All reproduces the same imputed
# ages without touching NumPy's global random state.
AGE_IMPUTE_SEED = 42
age_rng = np.random.RandomState(AGE_IMPUTE_SEED)

# randint works on integer bounds; the float mean/std bounds are truncated explicitly.
rand_1 = age_rng.randint(int(average_age_titanic - std_age_titanic),
                         int(average_age_titanic + std_age_titanic),
                         size=count_nan_age_titanic)
rand_2 = age_rng.randint(int(average_age_test - std_age_test),
                         int(average_age_test + std_age_test),
                         size=count_nan_age_test)

# Replace NaN ages with the random values. A single .loc assignment is used
# instead of chained indexing (train["Age"][mask] = ...), which can write to a
# temporary copy and raises SettingWithCopyWarning.
train.loc[train["Age"].isnull(), "Age"] = rand_1
test.loc[test["Age"].isnull(), "Age"] = rand_2

train['Age'] = train['Age'].astype(int)
test['Age'] = test['Age'].astype(int)

# Alternative (not used): replace NaNs with the median instead.
#train['Age'] = train['Age'].fillna(train['Age'].median()).astype(int)
#test['Age'] = test['Age'].fillna(test['Age'].median()).astype(int)

In [None]:
# Cabin values
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print only added a stray "None" line to the output.
train.info()
print(train['Cabin'].count())
print(train['Cabin'].isnull().sum())
# Author's rationale: Cabin has many null values, so it is dropped from both sets.
# `inplace` is the boolean True (the original passed the string 'True').
train.drop(['Cabin'], axis=1, inplace=True)
test.drop(['Cabin'], axis=1, inplace=True)

In [None]:
# Collapse SibSp and Parch into a single binary `family` flag:
# 1 when either count is non-zero, 0 when both are zero.
# The original compared with `== 1`, which set family=0 for passengers whose
# SibSp or Parch count is 2 or more, contradicting the intent of the flag.
train['family'] = np.where((train['SibSp'] > 0) | (train['Parch'] > 0), 1, 0)
test['family'] = np.where((test['SibSp'] > 0) | (test['Parch'] > 0), 1, 0)
# The raw counts are no longer needed once the flag exists.
# `inplace` is the boolean True (the original passed the string 'True').
train.drop(['SibSp', 'Parch'], axis=1, inplace=True)
test.drop(['SibSp', 'Parch'], axis=1, inplace=True)

In [None]:
# Create a new sex variable: passengers aged 16 or under are labelled 'Child',
# everyone else keeps their original Sex value.
train['Sex_New'] = np.where((train.Age <= 16), 'Child', train.Sex)
test['Sex_New'] = np.where((test.Age <= 16), 'Child', test.Sex)
# Sex is superseded by Sex_New.
# `inplace` is the boolean True (the original passed the string 'True').
train.drop(['Sex'], axis=1, inplace=True)
test.drop(['Sex'], axis=1, inplace=True)


In [None]:
# Preview the first five rows of the test frame (head() defaults to n=5).
test.head()

In [None]:
# One-hot encode Sex_New (child / female / male) for train and test, dropping
# the 'Male' column so it acts as the baseline category.
# NOTE(review): the hard-coded column names assume get_dummies yields exactly
# three columns in the order Child, female, male. Confirm against the data.
sex_dummies_train = pd.get_dummies(train['Sex_New'])
sex_dummies_train.columns = ['Child', 'Female', 'Male']
sex_dummies_train.drop(['Male'], axis=1, inplace=True)
train.drop(['Sex_New'], axis=1, inplace=True)
# Join the frame built above; the original referenced an undefined name
# `sex_dummies`, which raises NameError on a fresh kernel.
train = train.join(sex_dummies_train)

sex_dummies_test = pd.get_dummies(test['Sex_New'])
sex_dummies_test.columns = ['Child', 'Female', 'Male']
sex_dummies_test.drop(['Male'], axis=1, inplace=True)
test.drop(['Sex_New'], axis=1, inplace=True)
test = test.join(sex_dummies_test)

In [None]:
# Pclass variable: replace Pclass with indicator columns Class_1 and Class_2
# (the Class_3 indicator is removed) in both data sets.

# Training set
pclass_dummies_titanic = pd.get_dummies(train['Pclass'])
pclass_dummies_titanic.columns = ['Class_1', 'Class_2', 'Class_3']
del pclass_dummies_titanic['Class_3']
del train['Pclass']
train = train.join(pclass_dummies_titanic)

# Test set
pclass_dummies_test = pd.get_dummies(test['Pclass'])
pclass_dummies_test.columns = ['Class_1', 'Class_2', 'Class_3']
del pclass_dummies_test['Class_3']
del test['Pclass']
test = test.join(pclass_dummies_test)

In [None]:
# Build the modelling inputs: the target vector, the training features
# (everything except Survived) and the test features.
Y_train = train["Survived"]
X_train = train.drop(["Survived"], axis=1)
X_test = test  # same object as `test`, not a copy

In [None]:
# Print the column summary of the training feature frame that is passed to the models below.
X_train.info()

In [None]:
# Logistic Regression: fit on the training features, predict the test set,
# and display the mean accuracy measured on the training data itself.
logreg = LogisticRegression().fit(X_train, Y_train)  # fit() returns the fitted estimator
Y_pred = logreg.predict(X_test)

logreg.score(X_train, Y_train)

In [None]:
# Random Forests: fit a 100-tree forest, predict the test set (overwriting the
# logistic-regression Y_pred), and display accuracy on the training data.
# random_state is fixed so that re-running the notebook reproduces the same
# forest and predictions; without it every run gives different results.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)

random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

# NOTE: this is the score on the data the model was trained on, not a held-out estimate.
random_forest.score(X_train, Y_train)

In [None]:
# Logistic-regression coefficient for each feature (these are the fitted model
# coefficients, not correlation coefficients).
# Feature names come from X_train, the frame the model was fitted on, so names
# and coefficients stay aligned. The original used train.columns.delete(0),
# which is only correct while Survived happens to be the first column.
coeff_df = pd.DataFrame({'Features': X_train.columns})
print(coeff_df)
coeff_df["Coefficient Estimate"] = pd.Series(logreg.coef_[0])

# preview
coeff_df