In [1]:
# Kaggle Titanic predictions for survivors

In [2]:
# imports
import numpy as np
import pandas as pd

# Sklearn to fill in the missing data
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Sklearn for feature scaling
from sklearn.preprocessing import StandardScaler

# Sklearn Classifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Splitting the data
training = pd.read_csv('train.csv')
X_train = training.iloc[:, [2, 4, 5, 6, 7, 9]].values
Y_train = training.iloc[:, 1].values
testing = pd.read_csv('test.csv')
X_test = testing.iloc[:, [1, 3, 4, 5, 6, 8]].values

In [4]:
# Reshaping to a matrix
X_train = X_train.reshape(-1, 6)
X_test = X_test.reshape(-1, 6)

In [5]:
# Filling in the missing data
# Imputer is an estimator to fill in the missing values
train_imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
train_imputer = train_imputer.fit(X_train[:, 2:3])
X_train[:, 2:3] = train_imputer.transform(X_train[:, 2:3])
X_train

array([[3, 'male', 22.0, 1, 0, 7.25],
       [1, 'female', 38.0, 1, 0, 71.2833],
       [3, 'female', 26.0, 0, 0, 7.925],
       ...,
       [3, 'female', 29.69911764705882, 1, 2, 23.45],
       [1, 'male', 26.0, 0, 0, 30.0],
       [3, 'male', 32.0, 0, 0, 7.75]], dtype=object)

In [6]:
test_imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
test_imputer = test_imputer.fit(X_test[:, 2:6])
X_test[:, 2:6] = test_imputer.transform(X_test[:, 2:6])
X_test

array([[3, 'male', 34.5, 0.0, 0.0, 7.8292],
       [3, 'female', 47.0, 1.0, 0.0, 7.0],
       [2, 'male', 62.0, 0.0, 0.0, 9.6875],
       ...,
       [3, 'male', 38.5, 0.0, 0.0, 7.25],
       [3, 'male', 30.272590361445783, 0.0, 0.0, 8.05],
       [3, 'male', 30.272590361445783, 1.0, 1.0, 22.3583]], dtype=object)

In [7]:
# Encoding Categorical Data
labelencoder_X = LabelEncoder()
X_train[:, 1] = labelencoder_X.fit_transform(X_train[:, 1])
X_test[:, 1] = labelencoder_X.transform(X_test[:, 1])

In [8]:
# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [9]:
# Fitting Classifier
model = RandomForestClassifier(n_estimators = 300, criterion = 'entropy')
model.fit(X_train, Y_train)

In [10]:
Y_pred = model.predict(X_test)

In [11]:
Y_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,