# Training a random forest model on Titanic data

## Import the dependencies

In [None]:
import numpy as np
import pandas as pd

## Import the training data

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows.
train_csv_path = '../data/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head(n=5)

## Import the test data

In [None]:
# Read the test CSV into a DataFrame and preview its first five rows.
test_csv_path = '../data/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head(n=5)

## Survivability by gender

### Percentage of women that survived

In [None]:
# 'Survived' values for the rows whose 'Sex' is 'female'.
women = df_train.loc[df_train['Sex'] == 'female', 'Survived']
# Mean of the column == sum / count, i.e. the surviving fraction (0.0-1.0).
# Unlike sum()/len(), an empty selection yields NaN instead of raising.
rate_women = women.mean()

# The label announces a percentage, so format the fraction as one
# (rate_women itself stays a fraction for any later use).
print(f"% of women who survived: {rate_women:.2%}")

### Percentage of men that survived

In [None]:
# 'Survived' values for the rows whose 'Sex' is 'male'.
men = df_train.loc[df_train['Sex'] == 'male', 'Survived']
# Mean of the column == sum / count, i.e. the surviving fraction (0.0-1.0).
# Unlike sum()/len(), an empty selection yields NaN instead of raising.
rate_men = men.mean()

# The label announces a percentage, so format the fraction as one
# (rate_men itself stays a fraction for any later use).
print(f"% of men who survived: {rate_men:.2%}")

## Random Forest

### Import random forest class

In [None]:
from sklearn.ensemble import RandomForestClassifier

### Train the random forest classifier on the dataset

In [None]:
# Label column the classifier is trained to predict.
y = df_train['Survived']
# Input columns; 'Sex' is a string column, so it must be one-hot encoded.
features = ['Pclass', 'Sex', 'SibSp', 'Parch']
X = pd.get_dummies(df_train[features])
# The test set is encoded independently, so a category missing from (or only
# present in) the test data would produce a different column set. Align it to
# the training columns: same names, same order, absent dummies filled with 0.
X_test_actual = pd.get_dummies(df_test[features]).reindex(
    columns=X.columns, fill_value=0
)

In [None]:
# Random forest configuration. An otherwise identical variant with
# n_estimators=100 is the commented-out alternative for this cell.
# random_state is fixed so repeated runs build the same forest.
rf = RandomForestClassifier(
    n_estimators=10,
    max_depth=5,
    criterion='entropy',
    random_state=1,
)

In [None]:
# Fit the forest on the one-hot encoded training features X and labels y.
rf.fit(X, y)

In [None]:
# Sanity-check the fitted model on a single hand-built passenger.
# Values appear to be taken from training row 640 (Thorneycroft, Mr. Percival):
# Pclass 3, male, SibSp 1, Parch 0.
# The sample is built as a named one-row DataFrame aligned to X.columns rather
# than a bare positional list, so it does not silently depend on the column
# order produced by get_dummies and keeps the feature names the model was
# fitted with.
sample_passenger = pd.DataFrame(
    [{'Pclass': 3, 'SibSp': 1, 'Parch': 0, 'Sex_female': False, 'Sex_male': True}],
    columns=X.columns,
)
random_pred = rf.predict(sample_passenger)
random_pred

In [None]:
# Predict the label for every row of the encoded test features.
pred = rf.predict(X_test_actual)

In [None]:
# Pair each test PassengerId with its predicted label and write the result
# as a two-column CSV (no index column).
submission_path = '../data/rf_predictions.csv'
pred_data = df_test[['PassengerId']].assign(Survived=pred)
pred_data.to_csv(submission_path, index=False)
print('Your submission was successfully saved!')


### Visualize the data

#### Import matplotlib and seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar chart of how many training rows fall under each 'Survived' value,
# drawn on an explicitly created 6x4 inch Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df_train, x='Survived', ax=ax)
ax.set_title('Survival Count')
ax.set_xlabel('Survived')
ax.set_ylabel('Count')
plt.show()