# Random Forests in `sklearn`


In [52]:
import time
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.cross_validation import cross_val_score  # train_test_split, 
import seaborn
from matplotlib import pyplot as plt
%matplotlib inline

<hr>
## Exercise

- Have a look at the Kaggle competition [Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic/)
- Optionally, have a particular look at
  - their [Getting Started With Random Forests](https://www.kaggle.com/c/titanic/details/getting-started-with-random-forests) section
  - their [example in Python](https://www.kaggle.com/c/titanic/download/myfirstforest.py) of how to tackle this problem
- Download the training data [`train.csv`](https://www.kaggle.com/c/titanic/download/train.csv). You might need to create a Kaggle account.
- Create useful features by adding inferred columns to the dataset
- Convert the categorical values to numerical values. For random forests, you could simply replace them by integers (`1, 2, 3, ...`), but you can also use such tools as `patsy`, `CountVectorizer` or `TfidfVectorizer`, as we have used before
- Fit a random forest to your data and compute your cross-validated accuracy
- Improve your model by playing around with the features
- If you feel lucky, you can make a submission – don't worry, your scores will be removed after a few months

In [3]:
# NOTE: machine-specific absolute path -- point this at your own copy of the
# Kaggle Titanic `train.csv` before running the notebook.
TRAIN_CSV = "/Users/ruben/Downloads/train.csv"

data = pd.read_csv(TRAIN_CSV)
# Preview the first two rows to check that the file was parsed as expected.
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C


In [32]:
# Columns of the raw table used as predictors; `Survived` is the target.
features = [
    'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
    'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked',
]

# Target vector and (still un-encoded) feature frame, both views on `data`.
y = data['Survived']
X = data[features]

In [38]:
from patsy import dmatrix

# Right-hand-side patsy formula listing every feature: "Pclass + Name + ...".
formula = " + ".join(features)

# Missing values in every column are replaced by 0 before patsy builds the
# design matrix; the result is returned as a pandas DataFrame and replaces
# the raw feature frame previously stored in `X`.
X = dmatrix(formula, data=data.fillna(0), return_type='dataframe')

In [47]:
# Random forests are stochastic (bootstrap samples and random feature subsets),
# so pin `random_state` to make this score -- and the cross-validation scores
# computed from `model` -- reproducible across kernel restarts.
model = RandomForestClassifier(n_estimators=20, random_state=0)

# Accuracy on the very data the forest was fitted on: an optimistic figure that
# should be compared against the cross-validated scores, not trusted by itself.
model.fit(X, y).score(X, y)

0.99214365881032551

In [48]:
# Cross-validated score using the estimator's default scorer (accuracy for
# classifiers). `cv=3` is pinned explicitly because the library default changed
# from 3 to 5 folds in scikit-learn 0.22; this keeps three scores per run.
cross_val_score(model, X, y, cv=3)

array([ 0.73737374,  0.82491582,  0.81818182])

In [51]:
# Cross-validated area under the ROC curve. `cv=3` is pinned explicitly because
# the library default changed from 3 to 5 folds in scikit-learn 0.22; this
# keeps three scores per run.
cross_val_score(model, X, y, scoring="roc_auc", cv=3)

array([ 0.82010354,  0.86834915,  0.89059055])