In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


Lets load the dataset into pandas!

In [2]:
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Explaination of classes:

* Passenger ID: to keep traack of them
* Pclass: passenger class, aka socio economic status
* Name: prefaced by title (Mr. = adult male or young male, Mrs. = married woman, Ms = woman with marital status unknown, Master = young male, Miss = young woman or unmarried woman)
* Sex: male or female
* Age: fractional if less than 1 or if estimated
* SibSp: count of siblings/spouse onboard
* Parch: count of parent/child onboard
* Ticket: number/code, seemingly assigned based on which offices they bought it from and in what order. explained in forum [here](https://www.encyclopedia-titanica.org/community/threads/ticket-numbering-system.20348/post-267061)
* Fare: in 1970's british pounds, for the whole ticket which includes all members in group ( SibSp + Parch + 1 )
* Cabin: which room they were assigned. Letter indicates which deck (floor) they were on. A being the upper deck, G was the lowest. A map of the cabins can be found [here](https://www.encyclopedia-titanica.org/titanic-deckplans/)
* Embarked: S = Southampton (England), C = Cherbourg (France), Q = Queenstown (Ireland) (listed in chronological order

In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
women = train_df.loc[train_df.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

men = train_df.loc[train_df.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)

print("% of men who survived:", rate_men)

% of women who survived: 0.7420382165605095
% of men who survived: 0.18890814558058924


In [7]:
train_df['Fare'].isna().sum()

0

In [8]:
# Convert single column to int dtype.
train_df['Fare'] = train_df['Fare'].astype('int')

In [9]:
test_df['Fare'].isna().sum()

1

uh oh! theres a nan value. lets find out who it is

In [10]:
def display_options():
     
    display = pd.options.display
    display.max_columns = 20
    display.max_rows = 500
    display.max_colwidth = 222
    display.width = None
    return None
 
display_options()
display(test_df)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


It was Mr. Storey. Why? what other info can we use to deduce his ticket price? We can't delete it because its in the testing dataset which we need to match the number of records on kaggle's answers in order to submit and get an accuracy score so lets figure out what to replace it with. Time to do some digging!

Ticket numbers are probably most dependant on where and when they bought it. aka from which office and in what order it was sold. Fare is based on the group on the same ticket number, so for example Master Arthur Rice, 10 years old, has a third class ticket costing about 29, not because his individual ticket cost that much but because he travelled with 4 siblings and 1 mother, accounted for under SibSP and Parch respectively. Profile linked [here](https://www.encyclopedia-titanica.org/titanic-victim/albert-rice.html)

So, in order to use fare as a vaild feature, we must divide by SibSp plus Parch.
The single NaN fare value, we meet a problem because it belongs to Mr. Thomas Storey who did not travel in a group. Finding his profile though, one can see that he travelled with his shipmates [link to profile here](https://www.encyclopedia-titanica.org/titanic-victim/thomas-storey.html). If this were the entire dataset, we could average their fares and use that for Mr. Storey's, however only Carver's record is in this subset, so we copy his (Fare = 7.2500). We could find the larger dataset and average the actual numbers for everyone. I'm not sure what would make best data handling practises here. We could also have just K-nearest neighbour'd based on ticker number or maybe calculated individal fare's and KNN'd that but I think this is okay because even though they are different ages (60's vs 20's) the group were all presumably of similar socioeconomic status (same job, all third class, all forced to board titanic because a strike on their original boat). also i jsut spent a long time digging up info about this guy and am tired.

In [11]:
test_df['Fare'] = test_df['Fare'].fillna(7.25)

In [12]:
test_df['Fare'].isna().sum()

0

In [13]:
test_df.sort_values(by = 'Ticket')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
335,1227,1,"Maguire, Mr. John Edward",male,30.0,0,0,110469,26.0,C106,S
158,1050,1,"Borebank, Mr. John James",male,42.0,0,0,110489,26.55,D22,S
236,1128,1,"Warren, Mr. Frank Manley",male,64.0,1,0,110813,75.25,D37,C
191,1083,1,"Salomon, Mr. Abraham L",male,,0,0,111163,26.0,,S
266,1158,1,"Chisholm, Mr. Roderick Robert Crispin",male,,0,0,112051,0.0,,S
372,1264,1,"Ismay, Mr. Joseph Bruce",male,49.0,0,0,112058,0.0,B52 B54 B56,S
240,1132,1,"Lindstrom, Mrs. Carl Johan (Sigrid Posse)",female,55.0,0,0,112377,27.7208,,C
368,1260,1,"Gibson, Mrs. Leonard (Pauline C Boeson)",female,45.0,0,1,112378,59.4,,C
402,1294,1,"Gibson, Miss. Dorothy Winifred",female,22.0,0,1,112378,59.4,,C
305,1197,1,"Crosby, Mrs. Edward Gifford (Catherine Elizabeth Halstead)",female,64.0,1,1,112901,26.55,B26,S


In [14]:
# Convert single column to int dtype.
test_df['Fare'] = test_df['Fare'].astype('int')

In [15]:
type(train_df['Fare'][0])

numpy.int64

Now we needd to create a new column Number of People on Ticket, or NPeople by adding sibsp, parch and 1. then divide fare / NPeople, and put that into a new column InFare (individual Fare). This is the fare we will use in our algo.

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [17]:
y = train_df["Survived"]

features = ["Pclass", "Fare", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_df[features])
X_test = pd.get_dummies(test_df[features])

model = RandomForestClassifier()

model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})

output.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [18]:
param_grid = { 
    'n_estimators': [100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

cv_model = GridSearchCV(estimator = model, param_grid = param_grid, cv = 5)
cv_model.fit(X, y)

cv_model.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'log2',
 'n_estimators': 500}

In [19]:
best_model = RandomForestClassifier(
    criterion = 'entropy',
    max_depth = 8,
    max_features = 'sqrt',
    n_estimators = 100,
    random_state = 1)

best_model.fit(X, y)

predictions = best_model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})

output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [20]:
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [21]:
print(GridSearchCV(estimator = model, param_grid = param_grid, cv = 5))

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 200, 500]})
