# Acessing Data

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn import datasets, svm, cross_validation, tree, preprocessing, metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
import tensorflow as tf

In [2]:
Train = pd.read_csv('trainTitanic.csv')
Test = pd.read_csv('testTitanic.csv')

In [3]:
passengerID = Test["PassengerId"]
Train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data visualization 

In [4]:
Train['Survived'].mean()

0.3838383838383838

Feature datatypes and info

In [5]:
Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [6]:
Train[['Pclass', 'Survived']].groupby("Pclass").mean()

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


# Feature Extraction and Feature Engineering 

In [7]:
data_cleaner = [Train, Test]
for dataset in data_cleaner:    
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    dataset['IsAlone'] = 1 #initialize to yes/1 is alone
    dataset['IsAlone'].loc[dataset['FamilySize'] > 1] = 0 
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [8]:
Train['Title'].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Col               2
Mlle              2
Major             2
Lady              1
Jonkheer          1
the Countess      1
Mme               1
Don               1
Capt              1
Ms                1
Sir               1
Name: Title, dtype: int64

In [9]:
Title_Dictionary = {
        "Capt":       "Officer",
        "Col":        "Officer",
        "Major":      "Officer",
        "Dr":         "Officer",
        "Rev":        "Officer",
        "Jonkheer":   "Royalty",
        "Don":        "Royalty",
        "Sir" :       "Royalty",
        "the Countess":"Royalty",
        "Dona":       "Royalty",
        "Lady" :      "Royalty",
        "Mme":        "Mrs",
        "Ms":         "Mrs",
        "Mrs" :       "Mrs",
        "Mlle":       "Miss",
        "Miss" :      "Miss",
        "Mr" :        "Mr",
        "Master" :    "Master"
                   }
Train['Title'] = Train.Title.map(Title_Dictionary)
Test['Title'] = Test.Title.map(Title_Dictionary)
Train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,0,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,1,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,0,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,1,Mr


In [10]:
Train['Title'].value_counts()

Mr         517
Miss       184
Mrs        127
Master      40
Officer     18
Royalty      5
Name: Title, dtype: int64

In [11]:
print(len(Train))
all_data_na = (Train.isnull().sum() / len(Train)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
print(all_data_na)

print(len(Test))
all_data_na = (Test.isnull().sum() / len(Test)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
print(all_data_na)

891
Cabin       77.104377
Age         19.865320
Embarked     0.224467
dtype: float64
418
Cabin    78.229665
Age      20.574163
Fare      0.239234
dtype: float64


In [12]:
del Train['PassengerId']
del Train['Cabin']
del Train['Name']
del Train['Ticket']

del Test['PassengerId']
del Test['Cabin']
del Test['Name']
del Test['Ticket']


In [13]:
Train['Age'] = Train['Age'].fillna(Train['Age'].median())
Train['Embarked'] = Train['Embarked'].fillna(Train['Embarked'].mode()[0])

Test['Age'] = Test['Age'].fillna(Test['Age'].median())
Test['Fare'] = Test['Fare'].fillna(Test['Fare'].mode()[0])

In [14]:

Train.head()
Test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title
0,3,male,34.5,0,0,7.8292,Q,1,1,Mr
1,3,female,47.0,1,0,7.0,S,2,0,Mrs
2,2,male,62.0,0,0,9.6875,Q,1,1,Mr
3,3,male,27.0,0,0,8.6625,S,1,1,Mr
4,3,female,22.0,1,1,12.2875,S,3,0,Mrs


In [15]:
from sklearn.preprocessing import LabelEncoder
cols = ( 'Sex', 'Embarked', 'Title')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(Train[c].values)) 
    Train[c] = lbl.transform(list(Train[c].values))

for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(Test[c].values)) 
    Test[c] = lbl.transform(list(Test[c].values))

In [17]:
Train.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.361582,0.523008,0.381594,32.204208,1.536476,1.904602,0.602694,1.903479
std,0.486592,0.836071,0.47799,13.019697,1.102743,0.806057,49.693429,0.791503,1.613459,0.489615,0.807061
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,1.0,1.0,0.0,1.0
50%,0.0,3.0,1.0,28.0,0.0,0.0,14.4542,2.0,1.0,1.0,2.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0,2.0,2.0,1.0,2.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0,11.0,1.0,5.0


In [18]:
y =Train['Survived']
X = Train.drop(['Survived'], axis=1)

In [19]:
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
TestX = scaler.fit_transform(Test)


In [24]:
print(TestX)

[[1.         1.         0.4527232  ... 0.         1.         0.4       ]
 [1.         0.         0.61756561 ... 0.1        0.         0.6       ]
 [0.5        1.         0.8153765  ... 0.         1.         0.4       ]
 ...
 [1.         1.         0.50547277 ... 0.         1.         0.4       ]
 [1.         1.         0.35381775 ... 0.         1.         0.4       ]
 [1.         1.         0.35381775 ... 0.2        0.         0.        ]]


In [15]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(rescaledX, y)
y_predKNN = knn.predict(TestX)

In [11]:
#Decision tree model creation
destree = tree.DecisionTreeClassifier()
destree.fit(rescaledX, y)
ydecpredTree = destree.predict(TestX)

In [29]:
GBC = GradientBoostingClassifier(n_estimators=50)
GBC.fit(rescaledX, y)
ydecpredTree = GBC.predict(TestX)

In [25]:
random_forest = RandomForestClassifier(n_estimators=100)

random_forest.fit(rescaledX, y)

Y_predRF = random_forest.predict(TestX)

In [30]:
submission = pd.DataFrame({
        "PassengerId": passengerID,
        "Survived": ydecpredTree
    })
submission.to_csv('titanic.csv', index=False)