In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Load in data

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import read_csv
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')


# Load datasets
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
whole_data = pd.concat([train_data, test_data], ignore_index=True)

In [3]:
# Look at data
whole_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


# Prepare data

In [4]:
# Check which columns have missing data
whole_data.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [5]:
len(whole_data)

1309

From this, I can see that the cabin has a lot of missing values and there are several missing ages. I will remove the cabin column since it is unlikely to be very informative with the majority of data being missing.

Due to the "women and children first" order, the age may be very informative and so I do not want to remove it completely. I will use averages for each title to fill in the missing ages. (It may be a good idea to reduce the number of titles but that can be investigated later).

The ticket number will be dropped as it is not informative. Cabin is mostly empty values so will be deleted.

For Embarked, I will fill in the missing data using the price of the type of ticket purchased.

And for the missing far, I will fill in the data using the ticket type and place of embarkment.

In [6]:
# Remove ticket number and passenger ID
df = whole_data
df = df.drop(labels=["Ticket"], axis=1)
print(df[(df.Embarked.isnull())])

# Both passenger with no embarkation data travelled on first class tickets costing 80.0
first_class = df.loc[df['Pclass'] == 1]
average_1_ticket = first_class.groupby('Embarked')['Fare'].median().round(0)
print(average_1_ticket)

# The median price for a first class ticket embarking from C is 77.0, therefore it is most likely both passengers embarked from C.
df['Embarked'][(df.Embarked.isnull())] = "C"

     PassengerId  Survived  Pclass                                       Name  \
61            62       1.0       1                        Icard, Miss. Amelie   
829          830       1.0       1  Stone, Mrs. George Nelson (Martha Evelyn)   

        Sex   Age  SibSp  Parch  Fare Cabin Embarked  
61   female  38.0      0      0  80.0   B28      NaN  
829  female  62.0      0      0  80.0   B28      NaN  
Embarked
C    77.0
Q    90.0
S    52.0
Name: Fare, dtype: float64


In [7]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S


In [8]:
# Print the row with no fare
print(df[(df.Fare.isnull())])

# They payed for a 3rd class ticket from Southampton. Let's see what the median fare for that ticket was
third_class = df.loc[df['Pclass'] == 3]
average_3_ticket = third_class.groupby('Embarked')['Fare'].median().round(0)
print(average_3_ticket)

# The median 3rd class ticket price was 8.0 from all ports. Therefore I will fill the fare price with 8.0
df['Fare'][(df.Fare.isnull())] = 8.0

      PassengerId  Survived  Pclass                Name   Sex   Age  SibSp  \
1043         1044       NaN       3  Storey, Mr. Thomas  male  60.5      0   

      Parch  Fare Cabin Embarked  
1043      0   NaN   NaN        S  
Embarked
C    8.0
Q    8.0
S    8.0
Name: Fare, dtype: float64


In [9]:
# I will now try to fill in missing age data using the mean age for each title
df['Title'] = df.Name.str.extract(r'([A-Za-z]+)\.',expand=False)
counts = df.Title.value_counts()
AgeMean_by_titles = df.groupby('Title')['Age'].mean().round(0)
temp = pd.DataFrame([counts, AgeMean_by_titles])
temp.head(20)

Unnamed: 0,Mr,Miss,Mrs,Master,Rev,Dr,Col,Mlle,Major,Ms,Lady,Sir,Mme,Don,Capt,Countess,Jonkheer,Dona
Title,757.0,260.0,197.0,61.0,8.0,8.0,4.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Age,32.0,22.0,37.0,5.0,41.0,44.0,54.0,24.0,48.0,28.0,48.0,49.0,24.0,40.0,70.0,33.0,38.0,39.0


In [10]:
# Use these values to fill in missing ages
for title in AgeMean_by_titles.index:
    df['Age'][(df.Age.isnull())] = AgeMean_by_titles[title]
    
df = df.drop(labels=["Name", "Title"], axis=1)

In [11]:
df = df.drop(labels=["Cabin"], axis=1)
# Convert cabin number into floor (A=1, B=2, C=3)
# df['Cabin'] = df['Cabin'].str[:1]
# df['Cabin'] = df['Cabin'].map({'A':1,'B':2, 'C':3, 'D':4}).astype('int')

In [12]:
# Check that there are now no null values
df.isna().sum()

PassengerId      0
Survived       418
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Embarked         0
dtype: int64

In [13]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,male,22.0,1,0,7.25,S
1,2,1.0,1,female,38.0,1,0,71.2833,C
2,3,1.0,3,female,26.0,0,0,7.925,S
3,4,1.0,1,female,35.0,1,0,53.1,S
4,5,0.0,3,male,35.0,0,0,8.05,S


All data must be in float form (not string) so will now convert them. 

- **Sex** will now be 0 = female and 1 = male
- **Embarked** will now be 1, 2 or 3 corresponding to S, C, Q (the order in which the Titanic arrived at each port)

In [14]:
df['Sex'] = df['Sex'].map({'female':0,'male':1}).astype('int')
df['Embarked'] = df['Embarked'].map({'S':1,'C':2, 'Q':3}).astype('int')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,1,22.0,1,0,7.25,1
1,2,1.0,1,0,38.0,1,0,71.2833,2
2,3,1.0,3,0,26.0,0,0,7.925,1
3,4,1.0,1,0,35.0,1,0,53.1,1
4,5,0.0,3,1,35.0,0,0,8.05,1


In [15]:
# Finally, I will split the data back into test and train
test = df.loc[df.Survived.isnull()]
test = test.drop(labels=["Survived"], axis=1)
train = pd.concat([df, test]).drop_duplicates(keep=False)
print(len(test), len(train))

418 891


# Create models

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

y = train.Survived
X = train.drop(labels=["Survived"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

models = []
# Define multiple models to see which might be best
models.append(RandomForestRegressor(n_estimators=50, random_state=0))
models.append(RandomForestRegressor(n_estimators=100, random_state=0))
models.append(RandomForestRegressor(n_estimators=100, criterion='mse', random_state=0))
models.append(RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0))
models.append(RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0))
models.append(LogisticRegression())
models.append(RandomForestClassifier(n_estimators=100, bootstrap=True, criterion='entropy',
                                     min_samples_leaf=5, min_samples_split=4, random_state=42))

In [17]:
n = 0
for model in models:
    n=n+1
    model.fit(X_train,y_train)
    print("Model", n, "accuracy: ", model.score(X_test,y_test))

Model 1 accuracy:  0.46090577075098826
Model 2 accuracy:  0.4564060079051383
Model 3 accuracy:  0.4564060079051383
Model 4 accuracy:  0.5011701964645754
Model 5 accuracy:  0.4979917915363945
Model 6 accuracy:  0.8100558659217877
Model 7 accuracy:  0.8435754189944135


The best model was 7, which was a RandomForestCalssifier. I will now focus on that model.

In [18]:
model = RandomForestClassifier(n_estimators=100, bootstrap=True, criterion='entropy',
                               min_samples_leaf=5, min_samples_split=4, random_state=42)
model.fit(X_train,y_train)

cross_val_score(model, X_test, y_test, cv=5)

array([0.86111111, 0.72222222, 0.83333333, 0.83333333, 0.88571429])

In [19]:
prediction = model.predict(test).astype("int")
pred = pd.DataFrame(prediction, columns = ['Survived'])
sub = pd.concat([test_data["PassengerId"], pred],axis = 1)

sub.head(50)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [20]:
sub.to_csv('submission.csv',index = False)