In [2]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [3]:

# Load the training set, fill its gaps and replace the categorical text
# columns with integer codes.
titanic = pd.read_csv('train.csv')

# Unknown ages take the median of the known ages.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

# Sex codes: male -> 0, female -> 1 (written in place, one label at a time).
for sex_label, sex_code in (("male", 0), ("female", 1)):
    titanic.loc[titanic["Sex"] == sex_label, "Sex"] = sex_code

# Unknown ports are treated as "S"; port codes: S -> 0, C -> 1, Q -> 2.
titanic["Embarked"] = titanic["Embarked"].fillna("S")
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic.loc[titanic["Embarked"] == port_label, "Embarked"] = port_code

# Columns handed to the classifier.
predictors = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

# Load the test set and give it the same treatment as the training frame.
titanic_test = pd.read_csv("test.csv")

# Age gaps are filled with the *training* median; Fare gaps with the test
# set's own median.
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())

# Sex codes: male -> 0, female -> 1.
for sex_label, sex_code in (("male", 0), ("female", 1)):
    titanic_test.loc[titanic_test["Sex"] == sex_label, "Sex"] = sex_code

# Unknown ports are treated as "S"; port codes: S -> 0, C -> 1, Q -> 2.
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic_test.loc[titanic_test["Embarked"] == port_label, "Embarked"] = port_code

# Baseline random forest: 10 trees, scored with 3-fold cross-validation.
# NOTE(review): the `sklearn.cross_validation` module was removed in
# scikit-learn 0.20; `sklearn.model_selection.cross_val_score` is its
# replacement. Migrating also requires changing the import cell.
alg = RandomForestClassifier(
    n_estimators=10,
    min_samples_split=12,
    min_samples_leaf=1,
    random_state=1,
)
scores = cross_validation.cross_val_score(
    alg,
    titanic[predictors],
    titanic["Survived"],
    cv=3,
)
# Mean score across the folds (displayed as the cell result).
scores.mean()

0.81481481481481488

In [4]:
# Larger forest (150 trees) with looser split / tighter leaf limits, scored
# with the same 3-fold cross-validation as the baseline.
# NOTE(review): the `sklearn.cross_validation` module was removed in
# scikit-learn 0.20; `sklearn.model_selection.cross_val_score` is its
# replacement. Migrating also requires changing the import cell.
alg = RandomForestClassifier(
    n_estimators=150,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=1,
)
scores = cross_validation.cross_val_score(
    alg,
    titanic[predictors],
    titanic["Survived"],
    cv=3,
)
# Mean score across the folds (displayed as the cell result).
scores.mean()

0.8204264870931538

In [5]:
# FamilySize: siblings/spouses plus parents/children travelling with the passenger.
titanic["FamilySize"] = titanic["SibSp"].add(titanic["Parch"])

# NameLength: length of the full Name string.
titanic["NameLength"] = titanic["Name"].apply(len)


In [6]:
# Summary statistics for the frame, including the new FamilySize and
# NameLength columns. Per the output below, Sex and Embarked are not among
# the summarised columns.
# NOTE(review): presumably because they are still object dtype after the
# in-place encoding - confirm.
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,FamilySize,NameLength
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208,0.904602,26.965208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429,1.613459,9.281607
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0,12.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104,0.0,20.0
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,0.0,25.0
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0,1.0,30.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,10.0,82.0


In [7]:
import re

def get_title(name):
    """Return the honorific contained in a passenger name.

    The title is the first run of ASCII letters that is immediately followed
    by a period, e.g. "Mr" in "Braund, Mr. Owen Harris".

    Args:
        name: Full passenger name as a string.

    Returns:
        The title without its trailing period, or "" when the name contains
        no such token.
    """
    # Raw string: '\.' inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about. The pattern itself is unchanged.
    title_search = re.search(r'([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

# Pull the honorific out of every passenger name.
titles = titanic["Name"].apply(get_title)

# Integer code per title. Several titles deliberately share a code
# (e.g. Major, Col and Capt are all 7; Miss and Ms are both 2).
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, 
                 "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}

# Encode in a single vectorised lookup instead of rewriting the Series once
# per dictionary entry. The result is a numeric Series rather than an object
# Series of mixed ints and strings; a title missing from the mapping becomes
# NaN instead of silently staying a string.
titles = titles.map(title_mapping)

# Frequency of each title code (displayed as the cell result).
titles.value_counts()

1     517
2     183
3     125
4      40
5       7
6       6
7       5
10      3
8       3
9       2
Name: Name, dtype: int64

In [8]:
# Store the encoded titles on the training frame as the "Title" column.
titanic["Title"] = titles

In [10]:
import operator

# Maps "<last name><FamilySize>" keys to the integer id assigned to that family.
family_id_mapping = {}

def get_family_id(row):
    """Return the integer family id for one passenger row.

    A family is keyed by the passenger's last name (the text before the first
    comma in "Name") concatenated with the row's "FamilySize". The first time
    a key is seen it is registered in the module-level ``family_id_mapping``
    with the next free id; later rows with the same key reuse that id.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with "Name" and
            "FamilySize" entries.

    Returns:
        The id stored in ``family_id_mapping`` for the row's family key.
    """
    last_name = row["Name"].split(",")[0]
    family_id = "{0}{1}".format(last_name, row["FamilySize"])
    if family_id not in family_id_mapping:
        # Ids are handed out consecutively starting at 1, so the next free id
        # is always one more than the number of families already recorded.
        # This avoids scanning the whole mapping for its maximum on every
        # new family.
        family_id_mapping[family_id] = len(family_id_mapping) + 1

    return family_id_mapping[family_id]

# Look up (or create) a family id for every passenger, row by row.
family_ids = titanic.apply(get_family_id, axis=1)

# Rows whose FamilySize is below 3 are all pooled under the id -1.
small_family = titanic["FamilySize"] < 3
family_ids.loc[small_family] = -1

titanic["FamilyId"] = family_ids
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,NameLength,Title,FamilyId
0,1,0,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.25,,0,1,23,1,-1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,1,1,51,3,-1
2,3,1,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.925,,0,0,22,2,-1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1,C123,0,1,44,3,-1
4,5,0,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.05,,0,0,24,1,-1


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif

# Original inputs plus the engineered FamilySize, Title and FamilyId columns.
predictors = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
    "FamilySize",
    "Title",
    "FamilyId",
]

# Univariate feature selection: fit a 5-best selector using f_classif.
selector = SelectKBest(score_func=f_classif, k=5).fit(
    titanic[predictors],
    titanic["Survived"],
)

# Turn each feature's p-value into a score via -log10(p).
scores = np.negative(np.log10(selector.pvalues_))

scores