In [62]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [63]:
titanic_data = pd.read_csv("train.csv")
titanic_test = pd.read_csv("test.csv")

In [64]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [65]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [66]:
titanic_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [67]:
TotalNa = titanic_data.isnull().sum().sort_values(ascending = False)
PercentNa = titanic_data.isnull().sum() / titanic_data.isnull().count()
PercentNa = round(PercentNa*100,1).sort_values(ascending = False)

In [68]:
missing_data = pd.concat([TotalNa, PercentNa], axis = 1, keys = ["Total Na Values", "Percentage of Na Values"])
missing_data

Unnamed: 0,Total Na Values,Percentage of Na Values
Cabin,687,77.1
Age,177,19.9
Embarked,2,0.2
Fare,0,0.0
Ticket,0,0.0
Parch,0,0.0
SibSp,0,0.0
Sex,0,0.0
Name,0,0.0
Pclass,0,0.0


In [69]:
TotalNa = titanic_test.isnull().sum().sort_values(ascending = False)
PercentNa = titanic_test.isnull().sum() / titanic_test.isnull().count()
PercentNa = round(PercentNa*100,1).sort_values(ascending = False)

In [70]:
missing_data = pd.concat([TotalNa, PercentNa], axis = 1, keys = ["Total Na Values", "Percentage of Na Values"])
missing_data

Unnamed: 0,Total Na Values,Percentage of Na Values
Cabin,327,78.2
Age,86,20.6
Fare,1,0.2
Embarked,0,0.0
Ticket,0,0.0
Parch,0,0.0
SibSp,0,0.0
Sex,0,0.0
Name,0,0.0
Pclass,0,0.0


We get rid of the column "PassangerId", "Ticket", and "Name" as they would not help us in identifying those who survived. 

In [71]:
titanic_data = titanic_data.drop(columns = ["PassengerId", "Ticket", "Name"])
titanic_test = titanic_test.drop(columns = ["Ticket", "Name"])

We fill the missing "Embarked" values with the most common one, "Fare" values with the mean, and "Age" values randomly.

In [72]:
for data in [titanic_data, titanic_test]:
    mean = data["Age"].mean()
    std = data["Age"].std()
    n_nulls = data["Age"].isnull().sum()
    rng = np.random.default_rng(seed = 42)
    rand_age = rng.integers(low = mean-std, high = mean+std+1, size = n_nulls)
    age_slice = data["Age"].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    data["Age"] = age_slice.astype(int)

In [73]:
titanic_data["Embarked"].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [74]:
titanic_test["Fare"].describe()

count    417.000000
mean      35.627188
std       55.907576
min        0.000000
25%        7.895800
50%       14.454200
75%       31.500000
max      512.329200
Name: Fare, dtype: float64

In [75]:
titanic_data["Embarked"].fillna("S", inplace = True)
titanic_test["Fare"].fillna(35.627188, inplace = True)

We turn gender variables into numeric ones.

In [76]:
genders = {"male": 0, "female": 1}
for data in [titanic_data,titanic_test]:
    data["Sex"] = data["Sex"].map(genders)

We do the same thing for Embarked variables.

In [77]:
ports = {"S": 0, "C": 1, "Q": 2}
for data in [titanic_data,titanic_test]:
    data["Embarked"] = data["Embarked"].map(ports)

For Fare an Age columns, we want to turn them into numerical categories so that they are comparable to the rest of the features.

In [78]:
titanic_data["Fare"] = titanic_data["Fare"].astype(int)
titanic_test["Fare"] = titanic_test["Fare"].astype(int)

In [79]:
result_age, bins_age = pd.qcut(titanic_data["Age"], 7, retbins = True, precision = 0)
result_fare, bins_fare = pd.qcut(titanic_data["Fare"], 7, retbins = True, precision = 0)

In [80]:
result_age.value_counts()

(17.0, 22.0]    140
(22.0, 27.0]    132
(-1.0, 17.0]    131
(36.0, 44.0]    129
(27.0, 31.0]    125
(31.0, 36.0]    119
(44.0, 80.0]    115
Name: Age, dtype: int64

In [81]:
result_fare.value_counts()

(-1.0, 7.0]      241
(12.0, 19.0]     129
(56.0, 512.0]    127
(27.0, 56.0]     125
(19.0, 27.0]     124
(8.0, 12.0]       75
(7.0, 8.0]        70
Name: Fare, dtype: int64

In [82]:
for data in [titanic_data,titanic_test]:
    data.loc[ data["Age"] <= 17, "Age"] = 0
    data.loc[(data["Age"] > 17) & (data["Age"] <= 22), "Age"] = 1
    data.loc[(data["Age"] > 22) & (data["Age"] <= 27), "Age"] = 2
    data.loc[(data["Age"] > 27) & (data["Age"] <= 31), "Age"] = 3
    data.loc[(data["Age"] > 31) & (data["Age"] <= 36), "Age"] = 4
    data.loc[(data["Age"] > 36) & (data["Age"] <= 44), "Age"] = 5
    data.loc[(data["Age"] > 44), "Age"] = 6

In [83]:
for data in [titanic_data,titanic_test]:
    data.loc[ data["Fare"] <= 7, "Fare"] = 0
    data.loc[(data["Fare"] > 7) & (data["Fare"] <= 8), "Fare"] = 1
    data.loc[(data["Fare"] > 8) & (data["Fare"] <= 12), "Fare"] = 2
    data.loc[(data["Fare"] > 12) & (data["Fare"] <= 19), "Fare"] = 3
    data.loc[(data["Fare"] > 19) & (data["Fare"] <= 27), "Fare"] = 4
    data.loc[(data["Fare"] > 27) & (data["Fare"] <= 56), "Fare"] = 5
    data.loc[(data["Fare"] > 56), "Fare"] = 6

The Cabin feature has too many missing values and unique values. However by extracting the initial letter of each cell, which is the information that really matters, we will make use of this feature.

In [84]:
cabin_data = titanic_data["Cabin"].copy().astype(str)
cabin_test = titanic_test["Cabin"].copy().astype(str)

In [85]:
Set_data = set()
for word in cabin_data:
        Set_data.add(word[0])
Set_data

{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'n'}

In [86]:
Set_test = set()
for word in cabin_test:
        Set_test.add(word[0])
Set_test

{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'n'}

In [87]:
cabin_map = {"n": 0, "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "T":8}

for i in range(len(cabin_data)):
    cabin_data[i] = cabin_data[i][0]

for i in range(len(cabin_test)):
    cabin_test[i] = cabin_test[i][0]
    
titanic_data["Cabin"] = cabin_data.map(cabin_map)
titanic_test["Cabin"] = cabin_test.map(cabin_map)

We split the data into train and test parts randomly.

In [88]:
train, test = train_test_split(titanic_data, test_size = 0.2, random_state = 10)
X_train = train.drop(columns = ["Survived"])
X_test = test.drop(columns = ["Survived"])
Y_train = train["Survived"]
Y_test = test["Survived"]

We fit logistic regression, random forest, and k-nearest neighbor models.

In [89]:
logreg = LogisticRegression()
logreg.fit(X_train,Y_train)
acc_log = logreg.score(X_test,Y_test)

In [90]:
acc_log

0.8212290502793296

In [91]:
scores = []
for i in range(20,151,10):
    random_forest = RandomForestClassifier(n_estimators = i)
    random_forest.fit(X_train,Y_train)
    acc_random_forest = random_forest.score(X_test,Y_test)
    scores.append(acc_random_forest)

In [92]:
scores

[0.8379888268156425,
 0.8379888268156425,
 0.8603351955307262,
 0.8491620111731844,
 0.8379888268156425,
 0.8435754189944135,
 0.8379888268156425,
 0.8491620111731844,
 0.8268156424581006,
 0.8435754189944135,
 0.8212290502793296,
 0.8547486033519553,
 0.8435754189944135,
 0.8491620111731844]

In [93]:
scores = []
for i in [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]:
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(X_train,Y_train)
    acc_knn = knn.score(X_test,Y_test)
    scores.append(acc_knn)

In [94]:
scores

[0.7877094972067039,
 0.7597765363128491,
 0.8044692737430168,
 0.8156424581005587,
 0.8156424581005587,
 0.8156424581005587,
 0.8212290502793296,
 0.8212290502793296,
 0.8212290502793296,
 0.8324022346368715,
 0.8044692737430168,
 0.8044692737430168,
 0.8156424581005587,
 0.7932960893854749,
 0.8100558659217877,
 0.8156424581005587,
 0.7988826815642458,
 0.8100558659217877,
 0.8044692737430168,
 0.7877094972067039]

It looks like random forest with 40 trees is the best performing model. So we will fit it again using all the available data and then we will use it to obtain our predictions on the test data and submit it to Kaggle.

In [95]:
X_train = titanic_data.drop(columns = ["Survived"])
Y_train = titanic_data["Survived"]
X_test = titanic_test.drop(columns = ["PassengerId"])
random_forest = RandomForestClassifier(n_estimators = 40)
random_forest.fit(X_train,Y_train)
Y_pred = random_forest.predict(X_test)

In [98]:
passenger_id = titanic_test["PassengerId"]
survived = pd.Series(Y_pred)
submit = pd.concat([passenger_id,survived], axis = 1, keys =["PassengerId", "Survived"])

In [102]:
submit.to_csv("titanic_submit.csv", index=False)