In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

## Load data

In [2]:
# Read the training and test splits from CSV files under the relative data/ directory
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

## Explore data

In [3]:
# Peek at three random rows; the fixed seed keeps the displayed sample stable across re-runs
train_df.sample(3, random_state=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
628,629,0,3,"Bostandyeff, Mr. Guentcho",male,26.0,0,0,349224,7.8958,,S
196,197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q
232,233,0,2,"Sjostedt, Mr. Ernst Adolf",male,59.0,0,0,237442,13.5,,S


In [4]:
# Show the column labels of the training frame
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## Preprocess data

In [5]:
# Name of the label column the model learns to predict
target = 'Survived'

# Input columns handed to the model
features = [
    'Pclass',
    'Sex',
    'SibSp',
    'Parch',
]

In [6]:
# One hot encoding categorical variables
X = pd.get_dummies(train_df[features])

# get_dummies only creates columns for the categories present in the frame it
# is given, so encoding train and test independently can yield different
# column sets or orders. Reindex the test matrix onto the training columns:
# categories missing from the test set become all-zero columns, and categories
# never seen in training (which the model cannot use) are dropped.
X_test = pd.get_dummies(test_df[features]).reindex(columns=X.columns, fill_value=0)

# Extract target
y = train_df[target]

## Train model

In [7]:
# Hold out 20% of the labelled rows for validation; the seed fixes the split
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [8]:
# Baseline forest: 100 trees, depth capped at 5, seeded for repeatability
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,
)
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, random_state=1)

## Validate

In [9]:
# Predict labels for the held-out validation rows
y_predicted = model.predict(X_validation)

# Compare the predictions with the true validation labels
mt_accuracy = accuracy_score(y_validation, y_predicted)
mt_f1 = f1_score(y_validation, y_predicted)
mt_recall = recall_score(y_validation, y_predicted)
mt_precision = precision_score(y_validation, y_predicted)

# The estimator's own score() on the same validation split
mt_score = model.score(X_validation, y_validation)

In [10]:
# Report each validation metric on its own line
print(
    f'Model precision: {mt_precision}',
    f'Model recall: {mt_recall}',
    f'Model f1: {mt_f1}',
    f'Model accuracy: {mt_accuracy}',
    sep='\n',
)

Model precision: 0.7419354838709677
Model recall: 0.6301369863013698
Model f1: 0.6814814814814814
Model accuracy: 0.7597765363128491


## Automate validation based on accuracy metric

In [11]:
def get_accuracy_score(X_train, y_train, X_validation, y_validation, n_estimators, max_depth):
    """Fit a seeded random forest and score it on the validation split.

    Parameters
    ----------
    X_train, y_train
        Features and labels used to fit the classifier.
    X_validation, y_validation
        Features and labels passed to the fitted classifier's ``score`` method.
    n_estimators
        Forwarded to ``RandomForestClassifier(n_estimators=...)``.
    max_depth
        Forwarded to ``RandomForestClassifier(max_depth=...)``.

    Returns
    -------
    The value returned by ``score`` on the validation data (mean accuracy
    for scikit-learn classifiers).
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=1,  # fixed seed so repeated calls with the same arguments agree
    )
    return forest.fit(X_train, y_train).score(X_validation, y_validation)

In [12]:
# Exhaustive search over forest size and tree depth, keeping the combination
# with the highest validation accuracy (on ties the earliest combination wins).
n_estimators_sets = [10, 50, 100, 150, 200, 300, 400, 500]
max_depths = range(1, 21)
max_accuracy_score = None
best_n_estimators = None
best_max_depth = None
for n_estimators in n_estimators_sets:
    for max_depth in max_depths:
        # Deliberately NOT named `accuracy_score`: that would rebind the
        # sklearn.metrics function imported at the top of the notebook and
        # break any later (re-)run of the validation cell.
        candidate_score = get_accuracy_score(X_train, y_train, X_validation, y_validation, n_estimators, max_depth)
        if max_accuracy_score is None or candidate_score > max_accuracy_score:
            print('*'*64)
            print(f'New best model with n_estimators: {n_estimators}, max_depth: {max_depth}')
            print('*'*64)
            max_accuracy_score = candidate_score
            best_n_estimators = n_estimators
            best_max_depth = max_depth

****************************************************************
New best model with n_estimators: 10, max_depth: 1
****************************************************************
****************************************************************
New best model with n_estimators: 10, max_depth: 2
****************************************************************
****************************************************************
New best model with n_estimators: 150, max_depth: 3
****************************************************************


In [13]:
# Print best model: the hyperparameters and validation accuracy kept by the search loop
print(f'Best model: {best_n_estimators} estimators, {best_max_depth} depth with accuracy score of {max_accuracy_score}')

Best model: 150 estimators, 3 depth with accuracy score of 0.7932960893854749


## Predictions

In [14]:
# Train model using best parameters and full set of data.
# The hyperparameters are taken from the search results instead of being typed
# in by hand, so this cell stays in sync whenever the search is re-run.
model = RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

In [15]:
# Build the submission table: one row per test passenger with its predicted label
output = pd.DataFrame(
    {
        'PassengerId': test_df['PassengerId'],
        'Survived': predictions,
    }
)
# index=False keeps the row index out of the written CSV
output.to_csv('output/submission-v6.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
