# Setup

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# from xgboost import XGBClassifier

# Exploratory Data Analysis

## Dataset overview

In [3]:
df = pd.read_csv(r'.\data\train.csv')
test_data = pd.read_csv(r'.\data\test.csv')
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Gustafsson, Mr. Karl Gideon",male,1601,G6,S
freq,1,577,7,4,644


## Cleaning data
* Since <code>Cabin</code> has 687 (77.1%) missings, we will drop it.
* <code>Name</code> and <code>Ticket</code> won't be used too.

In [6]:
df_dropped = df.drop(['Cabin', 'Name', 'Ticket'], axis=1).copy()
test_data_dropped = test_data.drop(['Cabin', 'Name', 'Ticket'], axis=1).copy()
df_dropped

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


* **Solving the missings**

Traning Data

In [7]:
cols_with_missings = [col for col in df_dropped.columns if df_dropped[col].isnull().any()]
cols_with_missings

['Age', 'Embarked']

Test Data

In [8]:
cols_with_missings_test = [col for col in test_data_dropped.columns if test_data_dropped[col].isnull().any()]
cols_with_missings_test

['Age', 'Fare']

In [9]:
imputer = SimpleImputer(strategy='most_frequent')

In [10]:
df_imputed = pd.DataFrame(imputer.fit_transform(df_dropped))
test_data_imputed = pd.DataFrame(imputer.fit_transform(test_data_dropped))

In [11]:
df_imputed.columns = df_dropped.columns
test_data_imputed.columns = test_data_dropped.columns
df_imputed.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22,1,0,7.25,S
1,2,1,1,female,38,1,0,71.2833,C
2,3,1,3,female,26,0,0,7.925,S
3,4,1,1,female,35,1,0,53.1,S
4,5,0,3,male,35,0,0,8.05,S


In [12]:
df_imputed.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [13]:
test_data_imputed.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

* __Data transforming__

The numeric columns had actually object dtypes, so we'll convert them first.

In [14]:
df_imputed.dtypes

PassengerId    object
Survived       object
Pclass         object
Sex            object
Age            object
SibSp          object
Parch          object
Fare           object
Embarked       object
dtype: object

In [15]:
int_cols = ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch']
float_cols = ['Fare']

test_int_cols = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch']
test_float_cols = ['Fare']

for col in int_cols:
    df_imputed[col] = df_imputed[col].apply(lambda x: int(x))
for col in float_cols:
    df_imputed[col] = df_imputed[col].apply(lambda x: float(x))

for col in test_int_cols:
    test_data_imputed[col] = test_data_imputed[col].apply(lambda x: int(x))
for col in test_float_cols:
    test_data_imputed[col] = test_data_imputed[col].apply(lambda x: float(x))
test_data_imputed.dtypes

PassengerId      int64
Pclass           int64
Sex             object
Age              int64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
dtype: object

In [16]:
df_imputed.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22,1,0,7.25,S
1,2,1,1,female,38,1,0,71.2833,C
2,3,1,3,female,26,0,0,7.925,S
3,4,1,1,female,35,1,0,53.1,S
4,5,0,3,male,35,0,0,8.05,S


In [17]:
test_data_imputed.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34,0,0,7.8292,Q
1,893,3,female,47,1,0,7.0,S
2,894,2,male,62,0,0,9.6875,Q
3,895,3,male,27,0,0,8.6625,S
4,896,3,female,22,1,1,12.2875,S


* __Label Encoding__

In [18]:
encoder = LabelEncoder()

In [19]:
df_labeled = df_imputed.copy()
test_data_labeled = test_data_imputed.copy()
object_list = df_labeled.dtypes == 'object'
test_object_list = test_data_labeled.dtypes == 'object'
object_cols = list(object_list[object_list].index)
test_object_cols = list(test_object_list[test_object_list].index)

In [20]:
for col in object_cols:
    df_labeled[col] = encoder.fit_transform(df_labeled[col])
for col in test_object_cols:
    test_data_labeled[col] = encoder.fit_transform(test_data_labeled[col])

In [21]:
df_labeled

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22,1,0,7.2500,2
1,2,1,1,0,38,1,0,71.2833,0
2,3,1,3,0,26,0,0,7.9250,2
3,4,1,1,0,35,1,0,53.1000,2
4,5,0,3,1,35,0,0,8.0500,2
...,...,...,...,...,...,...,...,...,...
886,887,0,2,1,27,0,0,13.0000,2
887,888,1,1,0,19,0,0,30.0000,2
888,889,0,3,0,24,1,2,23.4500,2
889,890,1,1,1,26,0,0,30.0000,0


In [22]:
test_data_labeled

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34,0,0,7.8292,1
1,893,3,0,47,1,0,7.0000,2
2,894,2,1,62,0,0,9.6875,1
3,895,3,1,27,0,0,8.6625,2
4,896,3,0,22,1,1,12.2875,2
...,...,...,...,...,...,...,...,...
413,1305,3,1,21,0,0,8.0500,2
414,1306,1,0,39,0,0,108.9000,0
415,1307,3,1,38,0,0,7.2500,2
416,1308,3,1,21,0,0,8.0500,2


# Modeling

In [30]:
X = df_labeled.drop(['Survived'], axis=1)
y = df_labeled['Survived']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8)

In [89]:
model = RandomForestClassifier(random_state=0, n_estimators=1000, criterion='entropy', max_depth=30)
model.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=30, n_estimators=1000,
                       random_state=0)

In [90]:
predictions = model.predict(X_valid)
print('Model accuracy:', accuracy_score(predictions, y_valid))

Model accuracy: 0.8324022346368715


* Fiting the model on the whole trainning data

In [91]:
model.fit(X, y)

RandomForestClassifier(criterion='entropy', max_depth=30, n_estimators=1000,
                       random_state=0)

# Testing Data

In [92]:
X_test = test_data_labeled
test_predictions = model.predict(X_test)
test_predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [78]:
submission = pd.DataFrame({'PassengerId': test_data_labeled['PassengerId'],
                         'Survived': test_predictions})
submission.to_csv('submission.csv', index=False)