In [1]:
%matplotlib inline

# import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import classification models 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# read train and test csv's
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# merge train and test sets
titanic_df = train_df.append(test_df, ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [3]:
titanic_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [4]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [5]:
# fill na's for cabin column and abbreviate values of those with cabins
titanic_df['Cabin'] = titanic_df['Cabin'].fillna('U')
titanic_df['Cabin'] = titanic_df['Cabin'].apply(lambda x: x[0])

In [6]:
titanic_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,U,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,U,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,U,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [7]:
# extract title from names
titanic_df['Title'] = titanic_df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

In [8]:
titanic_df['Title'].value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Major             2
Ms                2
Mlle              2
Mme               1
Dona              1
Capt              1
Don               1
Lady              1
Sir               1
the Countess      1
Jonkheer          1
Name: Title, dtype: int64

In [9]:
# create dictionary to label titles
generic_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"
}

# map the generic titles to the Title column
titanic_df['Title'] = titanic_df['Title'].map(generic_titles)

# print out the value counts for the Title column
print(titanic_df['Title'].value_counts())

Mr         757
Miss       262
Mrs        200
Master      61
Officer     23
Royalty      6
Name: Title, dtype: int64


In [10]:
# groupby sex, pclass, and title
group = titanic_df.groupby(['Sex', 'Pclass', 'Title'])

# find median age of these three categories
group['Age'].median()

Sex     Pclass  Title  
female  1       Miss       30.0
                Mrs        45.0
                Officer    49.0
                Royalty    39.0
        2       Miss       20.0
                Mrs        30.0
        3       Miss       18.0
                Mrs        31.0
male    1       Master      6.0
                Mr         41.5
                Officer    52.0
                Royalty    40.0
        2       Master      2.0
                Mr         30.0
                Officer    41.5
        3       Master      6.0
                Mr         26.0
Name: Age, dtype: float64

In [11]:
# apply lambda function to fill in missing ages from the group df
titanic_df['Age'] = group['Age'].apply(lambda x: x.fillna(x.median()))

In [12]:
# store most embarked in variable
most_embarked = titanic_df['Embarked'].value_counts()[0]

# apply value to the reamining missing values in Embarked column
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(most_embarked)

# fill missing Fare values with the median of all Fares
titanic_df['Fare'] = titanic_df['Fare'].fillna(titanic_df['Fare'].median())

In [13]:
# check to see if all the missing values in each column have been properly filled
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
Age            1309 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
Fare           1309 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
Title          1309 non-null object
dtypes: float64(3), int64(4), object(6)
memory usage: 133.0+ KB


In [14]:
titanic_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title
0,22.0,U,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,Mr
1,38.0,C,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,Mrs
2,26.0,U,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,Miss
3,35.0,C,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,Mrs
4,35.0,U,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,Mr


In [15]:
# create new column that combines Parch and SibSp into one column that exhibits family size (includes passenger)
titanic_df['Family'] = titanic_df['Parch'] + titanic_df['SibSp'] + 1

In [16]:
# change Sex column into 0 and 1's for machine learning model to read
titanic_df['Sex'] = titanic_df['Sex'].map({"male": 0, "female": 1})

# convert other relevant columns' values into dummy variables for machine learning model to read
pclass_dummies = pd.get_dummies(titanic_df['Pclass'], prefix = 'Pclass')
cabin_dummies = pd.get_dummies(titanic_df['Cabin'], prefix = 'Cabin')
embarked_dummies = pd.get_dummies(titanic_df['Embarked'], prefix = 'Embarked')
title_dummies = pd.get_dummies(titanic_df['Title'], prefix = 'Title')

In [17]:
# concatenate the dummie columns with titanic_df
new_titanic = pd.concat([titanic_df, pclass_dummies, cabin_dummies, embarked_dummies, title_dummies], axis = 1)

# drop the old categorical fields
new_titanic.drop(['Pclass', 'Title', 'Embarked', 'Cabin', 'Name', 'Ticket'], axis = 1, inplace = True)

In [18]:
# separate the main df into train and test 
train_idx = len(train_df)
test_idx = len(titanic_df) - len(test_df)

train = new_titanic[:train_idx]
test = new_titanic[test_idx:]

In [19]:
train.tail()

Unnamed: 0,Age,Fare,Parch,PassengerId,Sex,SibSp,Survived,Family,Pclass_1,Pclass_2,...,Embarked_914,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
886,27.0,13.0,0,887,0,0,0.0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
887,19.0,30.0,0,888,1,0,1.0,1,1,0,...,0,0,0,1,0,1,0,0,0,0
888,18.0,23.45,2,889,1,1,0.0,4,0,0,...,0,0,0,1,0,1,0,0,0,0
889,26.0,30.0,0,890,0,0,1.0,1,1,0,...,0,1,0,0,0,0,1,0,0,0
890,32.0,7.75,0,891,0,0,0.0,1,0,0,...,0,0,1,0,0,0,1,0,0,0


In [20]:
test.head()

Unnamed: 0,Age,Fare,Parch,PassengerId,Sex,SibSp,Survived,Family,Pclass_1,Pclass_2,...,Embarked_914,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
891,34.5,7.8292,0,892,0,0,,1,0,0,...,0,0,1,0,0,0,1,0,0,0
892,47.0,7.0,0,893,1,1,,2,0,0,...,0,0,0,1,0,0,0,1,0,0
893,62.0,9.6875,0,894,0,0,,1,0,1,...,0,0,1,0,0,0,1,0,0,0
894,27.0,8.6625,0,895,0,0,,1,0,0,...,0,0,0,1,0,0,1,0,0,0
895,22.0,12.2875,1,896,1,1,,3,0,0,...,0,0,0,1,0,0,0,1,0,0


In [21]:
# create the X and y variables for the machine learning model
X = train.drop('Survived', axis=1)
y = train['Survived']
# X_test = test.drop('Survived', axis=1)

In [22]:
# Split data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.27, random_state=42)

In [32]:
# instantiate RandomForestClassifier, fit the variables, and find the accuracy score
rf = RandomForestClassifier()
rf = rf.fit(X_train, y_train)
score = rf.score(X_test, y_test)
# pred = rf.predict(X_test)




In [33]:
# accuracy of RandomForestClaissifier model
score

0.8257261410788381

In [34]:
# put the list the features names in variable
feature_name = X.columns
feature_list = feature_name.tolist()
feature_list

['Age',
 'Fare',
 'Parch',
 'PassengerId',
 'Sex',
 'SibSp',
 'Family',
 'Pclass_1',
 'Pclass_2',
 'Pclass_3',
 'Cabin_A',
 'Cabin_B',
 'Cabin_C',
 'Cabin_D',
 'Cabin_E',
 'Cabin_F',
 'Cabin_G',
 'Cabin_T',
 'Cabin_U',
 'Embarked_914',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S',
 'Title_Master',
 'Title_Miss',
 'Title_Mr',
 'Title_Mrs',
 'Title_Officer',
 'Title_Royalty']

In [35]:
# put the feature values in variable
feature_importance = rf.feature_importances_
feature_importance

array([0.13402697, 0.1472513 , 0.02328209, 0.160379  , 0.1573426 ,
       0.03059321, 0.02940199, 0.01207551, 0.01081613, 0.03400245,
       0.00269789, 0.01337526, 0.00790979, 0.00426392, 0.00991506,
       0.00251933, 0.0050001 , 0.0002378 , 0.03499635, 0.        ,
       0.01343379, 0.00611144, 0.01438361, 0.00802532, 0.00620201,
       0.11281288, 0.01175018, 0.00719401, 0.        ])

In [36]:
# create new dataframe that contains feature names and values
feature_df = pd.DataFrame({"Feature Names": feature_list, "Importance": feature_importance})
feature_df.sort_values('Importance', ascending=False).set_index('Feature Names')
feature_df

Unnamed: 0,Feature Names,Importance
0,Age,0.134027
1,Fare,0.147251
2,Parch,0.023282
3,PassengerId,0.160379
4,Sex,0.157343
5,SibSp,0.030593
6,Family,0.029402
7,Pclass_1,0.012076
8,Pclass_2,0.010816
9,Pclass_3,0.034002


In [41]:
# put the new test data for the model to predict survivors
X_test_new = test.drop('Survived', axis=1)
final_pred = rf.predict(X_test_new)

In [45]:
# convert results into a separate dataframe
results = pd.DataFrame({
    "PassengerId": passenger_id,
    "Survived": final_pred
})

In [46]:
results.head(25)

Unnamed: 0,PassengerId,Survived
0,892,0.0
1,893,0.0
2,894,0.0
3,895,0.0
4,896,1.0
5,897,0.0
6,898,1.0
7,899,0.0
8,900,1.0
9,901,0.0


In [48]:
results.to_csv('titanic_predictions.csv', index=False)