## Baseline RF Model

Next, I established a baseline random forest (a `RandomForestRegressor` fitted on the 0/1 `Survived` label) using only the numerical features in the dataset.

In [48]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, roc_auc_score

# Plot styling: seaborn defaults, then a larger default figure size.
sns.set()
plt.rcParams["figure.figsize"] = (10, 8)

# Single seed reused by every stochastic step in the notebook.
SEED = 42

# Raw data as shipped; `train` itself is left unmodified.
train = pd.read_csv("assets/train.csv")
test = pd.read_csv("assets/test.csv")

# Split the training frame into the target (y) and everything else (X).
y = train["Survived"].copy()
X = train.drop(columns=["Survived"])

In [30]:
# Summary statistics of the numeric columns before any cleaning
# (the `count` row reveals which columns have missing values).
X.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [31]:
# Impute missing ages with the column mean.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the selected Series: in-place modification of
# a column selection is deprecated chained assignment in pandas 2.x and does
# not update the frame when copy-on-write is enabled.
# NOTE(review): the mean is computed over all rows, i.e. before the
# train/test split performed later, so the held-out rows influence the
# imputed value.
X["Age"] = X["Age"].fillna(X["Age"].mean())
X.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,22.0,0.0,0.0,7.9104
50%,446.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,3.0,35.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [32]:
# Keep only the non-object columns as model inputs; text columns are dropped.
# PassengerId is numeric, so it stays in the feature set.
numeric_cols = [col for col in X.columns if X[col].dtype != "object"]
features = set(numeric_cols)
X = X[numeric_cols]
X.columns

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [33]:
# Hold out 20% of the rows for evaluation; SEED makes the split repeatable.
splits = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_train, X_test, y_train, y_test = splits
tuple(part.shape for part in splits)

((712, 6), (179, 6), (712,), (179,))

In [54]:
# Baseline forest: 1000 trees, out-of-bag scoring enabled, seeded for
# repeatability. All other hyperparameters are left at their defaults.
rf_params = dict(n_estimators=1000, oob_score=True, random_state=SEED)
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=True, random_state=42, verbose=0, warm_start=False)

In [55]:
# Predictions for the held-out rows. The model is a regressor, so these are
# continuous scores rather than 0/1 class labels.
yhat = rf.predict(X=X_test)
yhat

array([0.262, 0.651, 0.128, 0.9  , 0.456, 0.779, 0.649, 0.453, 0.546,
       0.468, 0.642, 0.081, 0.052, 0.042, 0.757, 0.637, 0.878, 0.057,
       0.548, 0.141, 0.151, 0.518, 0.163, 0.153, 0.093, 0.06 , 0.398,
       0.43 , 0.151, 0.415, 0.119, 0.412, 0.478, 0.059, 0.384, 0.269,
       0.519, 0.047, 0.635, 0.036, 0.238, 0.219, 0.103, 0.225, 0.204,
       0.143, 0.203, 0.073, 0.255, 0.729, 0.814, 0.742, 0.145, 0.87 ,
       0.208, 0.781, 0.677, 0.555, 0.356, 0.195, 0.158, 0.907, 0.732,
       0.266, 0.054, 0.26 , 0.344, 0.475, 0.33 , 0.816, 0.65 , 0.664,
       0.754, 0.907, 0.137, 0.417, 0.077, 0.789, 0.828, 0.416, 0.135,
       0.641, 0.791, 0.59 , 0.135, 0.25 , 0.916, 0.795, 0.021, 0.045,
       0.464, 0.027, 0.501, 0.53 , 0.14 , 0.661, 0.574, 0.098, 0.778,
       0.365, 0.12 , 0.021, 0.604, 0.017, 0.168, 0.32 , 0.213, 0.296,
       0.115, 0.353, 0.204, 0.163, 0.8  , 0.433, 0.386, 0.243, 0.186,
       0.722, 0.722, 0.114, 0.716, 0.86 , 0.632, 0.937, 0.314, 0.543,
       0.592, 0.597,

In [57]:
# Evaluate the baseline on the held-out split.
# roc: ranking quality of the continuous predictions against the 0/1 labels.
roc = roc_auc_score(y_test, yhat)
# mse: mean squared error between the predictions and the 0/1 labels.
mse = mean_squared_error(y_test, yhat)
# RandomForestRegressor.score returns the coefficient of determination (R^2),
# not classification accuracy, so it is stored under a name that says so.
r2 = rf.score(X_test, y_test)
# Backward-compatible alias: this value used to be called `accuracy`.
# It is R^2, not an accuracy.
accuracy = r2
# Out-of-bag estimate computed during fit (oob_score=True); for a regressor
# this is also an R^2-style score, measured on the training split.
oob_score = rf.oob_score_

roc, mse, r2, oob_score

(0.8037323037323038,
 0.17573721787709495,
 0.2753158046332047,
 0.10515402740913815)