# Random Forests Classifier Practice

## Dataset:  Titanic
### https://www.kaggle.com/c/titanic

In [48]:
import pandas as pd
from IPython.display import display, HTML
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

In [49]:
# Load the Kaggle Titanic training set; 'train.csv' is resolved relative to
# the notebook's working directory.

X = pd.read_csv('train.csv')
# pop() removes 'Survived' from X and returns it, so the target ends up in `y`
# and is no longer a column of the feature frame.
y = X.pop('Survived')

In [50]:
# Summary statistics for the numeric columns. A `count` lower than the number
# of rows means that column contains missing values.

X.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [51]:
# 'Age' has fewer non-null values than there are rows, so fill the gaps with
# the column mean.
# The result is assigned back rather than calling fillna(inplace=True) on the
# X['Age'] selection: that chained in-place form is deprecated in pandas 2.x
# and leaves X unchanged once Copy-on-Write is active.

X['Age'] = X['Age'].fillna(X['Age'].mean())
X.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,22.0,0.0,0.0,7.9104
50%,446.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,3.0,35.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [52]:
# Summarise the categorical (object-dtype) columns

def describe_categorical(X):
    """Display count/unique/top/freq for the object-dtype columns of ``X`` as an HTML table."""
    object_columns = X.columns[X.dtypes == 'object']
    summary = X[object_columns].describe()
    display(HTML(summary.to_html()))

describe_categorical(X)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Graham, Mr. George Edward",male,CA. 2343,C23 C25 C27,S
freq,1,577,7,4,644


In [53]:
# Remove 'PassengerId' and 'Name' from the feature frame (in place)

X.drop(['PassengerId', 'Name'], axis='columns', inplace=True)

In [54]:
# One-hot encode the categorical columns (this is dummy/indicator encoding,
# not label encoding).
# NOTE(review): 'Ticket' and 'Cabin' have hundreds of distinct values, so this
# adds hundreds of sparse indicator columns -- confirm that is intended.

categorical_variables = ['Sex', 'Embarked', 'Ticket', 'Cabin']

# Give missing entries their own 'Missing' category. The result is assigned
# back instead of using a chained X[col].fillna(..., inplace=True), which is
# deprecated in pandas 2.x and has no effect on X under Copy-on-Write.
X[categorical_variables] = X[categorical_variables].fillna('Missing')

# A single get_dummies call replaces each listed column with '<column>_<value>'
# indicator columns appended after the remaining features, in the order given
# by `categorical_variables`.
X = pd.get_dummies(X, columns=categorical_variables)

In [55]:
# Hold out 25% of the rows for evaluation.
# random_state is fixed (same seed as the model) so the split -- and therefore
# the reported score -- is the same on every Restart & Run All.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=7)

In [56]:
# Fitting and scoring the model

# 777 trees, trained on all available cores (n_jobs=-1), with a fixed seed
model = RandomForestClassifier(n_estimators=777, n_jobs=-1, random_state=7)
model.fit(X_train, y_train)

# score() is the classifier's mean accuracy on the held-out split.
# Written as a single formatted string so the line runs on both Python 2 and
# Python 3 (a bare `print ...` statement is a SyntaxError on Python 3); the
# printed text is unchanged.
print('model score =  %s' % model.score(X_test, y_test))

model score =  0.847533632287
