# Setup

In [1]:
import pandas as pd
import numpy as np

# For .read_csv, always use header=0 when you know row 0 is the header row
df = pd.read_csv('train.csv', header=0)

In [2]:
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
df.describe()



Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,,0.0,0.0,7.9104
50%,446.0,0.0,3.0,,0.0,0.0,14.4542
75%,668.5,1.0,3.0,,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
df['Gender'] = df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

In [6]:
df.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [7]:
df['EmbarkedCode'] = df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2, np.nan: 3} ).astype(int)

In [8]:
median_ages = np.zeros((2,3));
for i in range(0, 2):
    for j in range(0, 3):
        median_ages[i,j] = df[(df['Gender'] == i) & \
                              (df['Pclass'] == j+1)]['Age'].dropna().median();
 
median_ages

array([[ 35. ,  28. ,  21.5],
       [ 40. ,  30. ,  25. ]])

In [9]:
df['AgeFill'] = df['Age']

In [10]:
for i in range(0, 2):
    for j in range(0, 3):
        df.loc[ (df.Age.isnull()) & (df.Gender == i) & (df.Pclass == j+1),\
                'AgeFill'] = median_ages[i,j];

# Feature Engineering

In [11]:
df['FamilySize'] = df['SibSp'] + df['Parch'];
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender,EmbarkedCode,AgeFill,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,0,22.0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,1,38.0,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,26.0,0


In [12]:
df.Cabin.unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6', 'C23 C25 C27',
       'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33', 'F G73', 'E31',
       'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101', 'F E69', 'D47',
       'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4', 'A32', 'B4',
       'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35', 'C87', 'B77',
       'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19', 'B49', 'D',
       'C22 C26', 'C106', 'C65', 'E36', 'C54', 'B57 B59 B63 B66', 'C7',
       'E34', 'C32', 'B18', 'C124', 'C91', 'E40', 'T', 'C128', 'D37',
       'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44', 'A34', 'C104', 'C111',
       'C92', 'E38', 'D21', 'E12', 'E63', 'A14', 'B37', 'C30', 'D20',
       'B79', 'E25', 'D46', 'B73', 'C95', 'B38', 'B39', 'B22', 'C86',
       'C70', 'A16', 'C101', 'C68', 'A10', 'E68', 'B41', 'A20', 'D19',
       'D50', 'D9', 'A23', 'B50', 'A26', 'D48', 'E58', 'C126', 'B71',
       'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63', 'C62 C64', 'E24',

In [13]:
df['CabinCode'] = df['Cabin'].map( lambda x: 'Z' if pd.isnull(x) else x[0].upper() );
df.CabinCode

0      Z
1      C
2      Z
3      C
4      Z
5      Z
6      E
7      Z
8      Z
9      Z
10     G
11     C
12     Z
13     Z
14     Z
15     Z
16     Z
17     Z
18     Z
19     Z
20     Z
21     D
22     Z
23     A
24     Z
25     Z
26     Z
27     C
28     Z
29     Z
      ..
861    Z
862    D
863    Z
864    Z
865    Z
866    Z
867    A
868    Z
869    Z
870    Z
871    D
872    B
873    Z
874    Z
875    Z
876    Z
877    Z
878    Z
879    C
880    Z
881    Z
882    Z
883    Z
884    Z
885    Z
886    Z
887    B
888    Z
889    C
890    Z
Name: CabinCode, dtype: object

In [14]:
df['CabinCode'] = df['CabinCode'].map( {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'T': 8, 'Z': 9} ).astype(int)

In [15]:
df['Title'] = df.Name.map( lambda x : x.split()[1] );
df.Title.unique()

array(['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Planke,', 'Don.', 'Rev.',
       'Billiard,', 'der', 'Walle,', 'Dr.', 'Pelsmaeker,', 'Mulder,', 'y',
       'Steen,', 'Carlo,', 'Mme.', 'Impe,', 'Ms.', 'Major.', 'Gordon,',
       'Messemaeker,', 'Mlle.', 'Col.', 'Capt.', 'Velde,', 'the',
       'Shawah,', 'Jonkheer.', 'Melkebeke,', 'Cruyssen,'], dtype=object)

In [16]:
df = df.drop(['Name', 'Title', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1) 
df = df.drop(['Age'], axis=1)
df = df.drop(['PassengerId'], axis=1)
df = df.dropna()

In [17]:
df.describe()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,Gender,EmbarkedCode,AgeFill,FamilySize,CabinCode
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.523008,0.381594,32.204208,0.647587,0.368126,29.112424,0.904602,7.488215
std,0.486592,0.836071,1.102743,0.806057,49.693429,0.477989,0.647602,13.304424,1.613459,2.866048
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.42,0.0,0.0
25%,0.0,2.0,0.0,0.0,7.9104,0.0,0.0,21.5,0.0,9.0
50%,0.0,3.0,0.0,0.0,14.4542,1.0,0.0,26.0,0.0,9.0
75%,1.0,3.0,1.0,0.0,31.0,1.0,1.0,36.0,1.0,9.0
max,1.0,3.0,8.0,6.0,512.3292,1.0,3.0,80.0,10.0,9.0


In [23]:
df.dtypes
train_data = df.values

In [24]:
test_df = pd.read_csv('test.csv', header=0)
test_df['Gender'] = test_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
for i in range(0, 2):
    for j in range(0, 3):
        test_df.loc[ (test_df.Age.isnull()) & (test_df.Gender == i) & (test_df.Pclass == j+1),\
                'AgeFill'] = median_ages[i,j];
test_df['EmbarkedCode'] = test_df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2, np.nan: 3} ).astype(int)
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch']
test_df['CabinCode'] = test_df['Cabin'].map( lambda x: 'Z' if pd.isnull(x) else x[0].upper() );
test_df['CabinCode'] = test_df['CabinCode'].map( {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'T': 8, 'Z': 9} ).astype(int)
test_df = test_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age', 'PassengerId'], axis=1) 
test_df = test_df.dropna()
test_df

Unnamed: 0,Pclass,SibSp,Parch,Fare,Gender,AgeFill,EmbarkedCode,FamilySize,CabinCode
10,3,0,0,7.8958,1,25.0,0,0,9
22,1,0,0,31.6833,0,35.0,0,0,9
29,3,2,0,21.6792,1,25.0,1,2,9
33,3,1,2,23.4500,0,21.5,0,3,9
36,3,0,0,8.0500,0,21.5,0,0,9
39,3,0,0,56.4958,1,25.0,0,0,9
41,1,0,0,26.5500,1,40.0,0,0,3
47,3,0,0,7.7500,1,25.0,2,0,9
54,2,0,0,15.5792,1,30.0,1,0,9
58,3,1,0,16.1000,1,25.0,0,1,9


In [25]:
test_data = test_df.values

In [26]:
# Import the random forest package
from sklearn.ensemble import RandomForestClassifier 

# Create the random forest object which will include all the parameters
# for the fit
forest = RandomForestClassifier(n_estimators = 100)

# Fit the training data to the Survived labels and create the decision trees
forest = forest.fit(train_data[0::,1::],train_data[0::,0])

# Take the same decision trees and run it on the test data
output = forest.predict(test_data)

In [27]:
output

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,
        0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,
        0.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,  1.,
        0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

In [None]:
//TODO: score
//TODO: work on cleaning titles