Import required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import cross_validation
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

Import the dataset

In [2]:
train = pd.read_csv("../datasets/Titanictrain.csv")
test = pd.read_csv("../datasets/Titanictest.csv")
#Display head
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
print "Training Data"
print train.info()
print "---------------------------"
print "Test Data"
print test.info()

Training Data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
---------------------------
Test Data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket     

Seperate features and labels

In [4]:
train_features = train.drop(['Survived'], axis=1)
train_labels = train['Survived']

Define a function to extract useful features

In [5]:
def selectFeatures(data):
    #Add new feautres
    data['Family'] = data['SibSp'] + data['Parch'] #Total family members on board
    data.loc[data['Family'] > 0, 'Family'] = 1
    data.loc[data['Family'] == 0, 'Family'] = 0

    def get_person(passenger):
        age,sex = passenger
        if age < 15:
            return 'child'
        else:
            return sex
    
    data['Person'] = data[['Age','Sex']].apply(get_person,axis=1)
    PersonDummies  = pd.get_dummies(data['Person'])
    PersonDummies.drop(['male'], axis=1, inplace=True)
    data = data.join(PersonDummies)

    #Create dummy variable for 'Pclass' and remove Class 3 (since it's the most common value for Pclass)

    pclassDummies  = pd.get_dummies(data['Pclass'])
    pclassDummies.columns = ['Class_1','Class_2','Class_3']
    pclassDummies.drop(['Class_3'], axis=1, inplace=True)
    data = data.join(pclassDummies)

    #Create dummy variable for 'Embarked' and remove S (since it's the most common value for Embarked)

    embarkDummies = pd.get_dummies(data['Embarked'])
    embarkDummies.drop(['S'], axis=1, inplace=True)
    data = data.join(embarkDummies)

    #Remove useless features
    data.drop(['PassengerId','Name','Ticket','Cabin','Sex','SibSp','Person','Embarked','Parch','Fare'],axis=1,inplace=True)

    #Fill age data
    meanAge = data['Age'].mean()
    stdAge = data['Age'].std()
    nanAgeCount = data['Age'].isnull().sum()
    randomAges = np.random.randint(meanAge-stdAge, meanAge+stdAge, nanAgeCount)
    data["Age"][np.isnan(data["Age"])] = randomAges
    
    return data

train_features = selectFeatures(train_features)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fit and predict

In [6]:
model = RandomForestClassifier(min_samples_leaf=4)
model.fit(train_features, train_labels)
score = cross_validation.cross_val_score(model,train_features,train_labels,cv=6).mean()
print ("Mean score : %.6f" % score)

Mean score : 0.797962


Submission

In [7]:
test_features = selectFeatures(test)
PIDs = test['PassengerId']
predictions = model.predict(test_features)
submission = pd.DataFrame({ "PassengerID" : PIDs, "Survived" : predictions })
submission.to_csv("Results.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
