In [1]:
import pandas

# Load the training set with pandas.
# read_csv parses the CSV at the given relative path into a DataFrame,
# which is bound to the `titanic` variable used throughout the notebook.
titanic = pandas.read_csv("datasets/train.csv")

# Uncomment for summary statistics of the numeric columns.
# print titanic.describe()

from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Label -> integer code pairs, applied in this order to both dataframes.
SEX_CODES = (("male", 0), ("female", 1))
EMBARKED_CODES = (("S", 0), ("C", 1), ("Q", 2))


def encode_sex_and_embarked(frame):
    """Replace the "Sex" and "Embarked" labels of ``frame`` with integer codes.

    The dataframe is modified in place; nothing is returned.

    * "Sex": "male" becomes 0 and "female" becomes 1.
    * "Embarked": missing values are first filled with "S", then
      "S" becomes 0, "C" becomes 1 and "Q" becomes 2.

    Values that match none of the listed labels are left untouched.
    """
    for label, code in SEX_CODES:
        frame.loc[frame["Sex"] == label, "Sex"] = code

    frame["Embarked"] = frame["Embarked"].fillna("S")
    for label, code in EMBARKED_CODES:
        frame.loc[frame["Embarked"] == label, "Embarked"] = code


# Training set: fill missing ages with the training median, then encode.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
encode_sex_and_embarked(titanic)

# Read the test set into its own dataframe, titanic_test.
titanic_test = pandas.read_csv("datasets/test.csv")

# Fill in all NaN ages with the median age from the train dataset.
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())

# Same label encoding as the training set, so both frames share one scheme.
encode_sex_and_embarked(titanic_test)

# Fill in all NaN fares with the median fare from the test dataset.
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())

# Configure the random forest with explicit (non-default) settings:
#   n_estimators      - how many trees the forest grows
#   min_samples_split - fewest rows a node must hold before it may be split
#   min_samples_leaf  - fewest samples allowed at a leaf
#                       (the bottom points of the tree)
#   random_state      - fixed seed so repeated runs give the same forest
alg = RandomForestClassifier(
    n_estimators=150,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=1,
)

# Score the forest on the training data with cv=3 cross validation.
# The result is stored in `scores` but not displayed in this cell.
scores = cross_validation.cross_val_score(
    alg,
    titanic[predictors],
    titanic["Survived"],
    cv=3,
)

# Generating a familysize column
# titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]

# The .apply method generates a new series
# titanic["NameLength"] = titanic["Name"].apply(lambda x: len(x))

import re

# A function to get the title from a name.
def get_title(name):
    """Return the title (e.g. "Mr", "Miss") found in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period.

    Parameters:
        name: the passenger name as a string.

    Returns:
        The title without the surrounding space and period, or an empty
        string when the name contains no such pattern.
    """
    # Raw string: "\." must reach the regex engine as an escaped period and
    # is not a valid escape sequence in an ordinary string literal.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Get all the titles and print how often each one occurs.
# titles = titanic["Name"].apply(get_title)
# print(pandas.value_counts(titles))

# Map each title to an integer.  Some titles are very rare, and are compressed into the same codes as other titles.
# title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
# for k,v in title_mapping.items():
#     titles[titles == k] = v

# Verify that we converted everything.
#print(pandas.value_counts(titles))

# Add in the title column.
# titanic["Title"] = titles

import operator

# A dictionary mapping family key (last name + family size) to integer id
family_id_mapping = {}

# A function to get the id given a row
def get_family_id(row):
    """Return the integer family id for ``row``, allocating one if needed.

    The family key is the text before the first comma in ``row["Name"]``
    followed by ``row["FamilySize"]``. Keys seen for the first time receive
    an id one greater than the largest id already stored in the module-level
    ``family_id_mapping`` (1 when the mapping is empty); known keys return
    their stored id.

    Parameters:
        row: a mapping-like object supporting ``row["Name"]`` and
            ``row["FamilySize"]`` (e.g. a dataframe row passed by ``apply``).

    Returns:
        The integer id associated with the row's family key.
    """
    # Find the last name by splitting on a comma
    last_name = row["Name"].split(",")[0]
    # Create the family key
    family_id = "{0}{1}".format(last_name, row["FamilySize"])
    # Allocate an id the first time this key is seen
    if family_id not in family_id_mapping:
        if family_id_mapping:
            # Only the stored ids matter here, so take the max over values().
            next_id = max(family_id_mapping.values()) + 1
        else:
            next_id = 1
        family_id_mapping[family_id] = next_id
    return family_id_mapping[family_id]

# Get the family ids with the apply method
# family_ids = titanic.apply(get_family_id, axis=1)

# There are a lot of family ids, so we'll compress all of the families under 3 members into one code.
# family_ids[titanic["FamilySize"] < 3] = -1

# Print the count of each unique id.
# print(pandas.value_counts(family_ids))

# titanic["FamilyId"] = family_ids

import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif

predictors = [
    "Pclass", "Sex", "Age", "Fare", "Embarked", "SibSp", "Parch",
    # Engineered features, currently disabled:
    # "FamilySize", "Title", "FamilyId",
]

# Perform feature selection
#selector = SelectKBest(f_classif, k=5)
#selector.fit(titanic[predictors], titanic["Survived"])

# Get the raw p-values for each feature, and transform from p-values into scores
#scores = -np.log10(selector.pvalues_)

# Plot the scores.  See how "Pclass", "Sex", "Title", and "Fare" are the best?
#plt.bar(range(len(predictors)), scores)
#plt.xticks(range(len(predictors)), predictors, rotation='vertical')
#plt.show()

# Pick only the four best features.
#predictors = ["Pclass", "Sex", "Fare", "Title"]

In [8]:
# Random forest with larger split/leaf minimums than the first attempt.
alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=8, min_samples_leaf=4)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)

# Initialize the cross validation folds
# NOTE(review): random_state presumably has no effect here because shuffle is
# not enabled - confirm against the installed sklearn version.
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    train_target = titanic["Survived"].iloc[train]
    # Fit the algorithm on the training rows of this fold.
    alg.fit(titanic[predictors].iloc[train,:], train_target)
    # Predict the probability of class 1 on the held-out rows.
    # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.
    fold_probabilities = alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1]
    # Any value over .5 is a 1 prediction; .5 and below is a 0 prediction.
    predictions.append((fold_probabilities > .5).astype(float))

# Put all the predictions together into one array.
# This assumes the folds come back in row order, so that position i of the
# concatenated array corresponds to row i of titanic - TODO confirm if the
# fold generator is ever changed to shuffle.
predictions = np.concatenate(predictions, axis=0)
# print predictions
# Accuracy is the fraction of rows whose prediction equals the true label.
# Compare against the raw values so the comparison is purely positional and
# every match counts once, whether the predicted class is 0 or 1.
accuracy = float(np.mean(predictions == titanic["Survived"].values))
print("Sklearn accuracy: " + str(accuracy))

# for i in range(len(predictions)):
#    if predictions[i] < .5:
#        predictions[i] = 0
#    else:
#        predictions[i] = 1
# predictions = predictions.astype(int)
# submission = pandas.DataFrame({
#        "PassengerId": titanic_test["PassengerId"],
#        "Survived": predictions
#    })

# Output to a csv for submission.
# submission.to_csv("kaggle.csv", index=False)

Sklearn accuracy: 0.818181818182




In [21]:
# Hand-written random forest, imported from the local RandomForestRegressor module.
from RandomForestRegressor import RandomForestRegressor

# Build a 25-tree forest, each tree limited to depth 5, that predicts the
# "Survived" column of the training frame from the selected predictors.
# Construction prints "starting tree N" / "finished tree N" progress lines.
rfr = RandomForestRegressor(
    titanic,
    "Survived",
    predictors=predictors,
    n_trees=25,
    max_depth=5,
)

starting tree 1
0.306818181818
0.477064220183
0.389830508475
0.68085106383
0.862068965517
0.666666666667
0.327044025157
0.51724137931
0.5
0.0
finished tree 1
starting tree 2
0.329545454545
0.979591836735
0.282377919321
0.466431095406
finished tree 2
starting tree 3
0.345394736842
0.409756097561
0.694915254237
0.5
0.375
0.0
finished tree 3
starting tree 4
0.198130841121
0.0
0.8
0.774193548387
0.591549295775
0.38961038961
finished tree 4
starting tree 5
0.175742574257
0.805555555556
0.0
0.406593406593
0.408695652174
0.694915254237
0.4
0.0
0.0
finished tree 5
starting tree 6
0.188908145581
0.742038216561
finished tree 6
starting tree 7
0.188908145581
0.968085106383
0.655813953488
0.0
0.5
0.0
finished tree 7
starting tree 8
0.345394736842
0.496453900709
0.307692307692
0.0
0.866666666667
0.508771929825
finished tree 8
starting tree 9
0.383838383838
finished tree 9
starting tree 10
0.371900826446
0.135359116022
0.22641509434
0.0731707317073
0.968085106383
0.712
0.576086956522
0.0
finished tr

In [22]:
# Run the custom forest on the test set. The recorded output below shows
# 418 float64 values between roughly 0.26 and 0.58, i.e. scores, not 0/1 labels.
predictions = rfr.predict(titanic_test)
# Last expression of the cell: display summary statistics of the scores.
predictions.describe()

count    418.000000
mean       0.388870
std        0.097339
min        0.264251
25%        0.297231
50%        0.376369
75%        0.481155
max        0.578086
dtype: float64

In [23]:
# Turn the forest's scores into class labels in one vectorized step:
# scores below .5 become 0, everything else becomes 1.
# Written as "not (score < .5)" so that any value failing the "< .5" test
# maps to 1, exactly as the else-branch of an element-wise loop would.
predictions = (~(predictions < .5)).astype(int)

# Pair each test passenger id with its predicted label.
submission = pandas.DataFrame({
       "PassengerId": titanic_test["PassengerId"],
       "Survived": predictions
   })

# Output to a csv for submission.
submission.to_csv("kaggle25.csv", index=False)

### It works!!!
We tried our algorithm with 5 trees in the forest, and we were able to get approximately 71.77% accuracy. While this is good, it's not quite as good as the sklearn random forest. When we upped it to 20 trees, we got approximately 74.641% accuracy. At 25 trees, we got approximately 75.598% accuracy.