# Setup

In [1]:
import pandas as pd
import numpy as np

# For .read_csv, always use header=0 when you know row 0 is the header row
df = pd.read_csv('train.csv', header=0)

In [2]:
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
df.describe()



Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,,0.0,0.0,7.9104
50%,446.0,0.0,3.0,,0.0,0.0,14.4542
75%,668.5,1.0,3.0,,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Feature Engineering & Cleaning for Training Data

In [5]:
df['Gender'] = df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

In [6]:
df.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [7]:
df['EmbarkedCode'] = df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2, np.nan: 3} ).astype(int)

In [8]:
def calculate_median(dataframe, incomplete_field):
    median = np.zeros((2,3));
    for i in range(0,2):
        for j in range(0,3):
            median[i,j] = dataframe[(dataframe['Gender'] == i) & \
                                  (dataframe['Pclass'] == j+1)][incomplete_field].dropna().median();
    return median;

In [9]:
median_ages = calculate_median(df, 'Age');
median_ages

array([[ 35. ,  28. ,  21.5],
       [ 40. ,  30. ,  25. ]])

In [10]:
median_fares = calculate_median(df, 'Fare');
median_fares

array([[ 82.66455,  22.     ,  12.475  ],
       [ 41.2625 ,  13.     ,   7.925  ]])

In [11]:
def fill(dataframe, incomplete_field, filled_field, medians):
    dataframe[filled_field] = dataframe[incomplete_field];
    for i in range(0, 2):
        for j in range(0, 3):
            dataframe.loc[ (dataframe[incomplete_field].isnull()) & (dataframe.Gender == i) & (dataframe.Pclass == j+1),\
                    filled_field] = medians[i,j];
    return;

In [12]:
df.Age
median_ages

array([[ 35. ,  28. ,  21.5],
       [ 40. ,  30. ,  25. ]])

In [13]:
fill(df, 'Age', 'AgeFill', median_ages);
fill(df, 'Fare', 'FareFill', median_fares);

In [14]:
df['FamilySize'] = df['SibSp'] + df['Parch'];

In [15]:
df.Cabin.unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6', 'C23 C25 C27',
       'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33', 'F G73', 'E31',
       'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101', 'F E69', 'D47',
       'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4', 'A32', 'B4',
       'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35', 'C87', 'B77',
       'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19', 'B49', 'D',
       'C22 C26', 'C106', 'C65', 'E36', 'C54', 'B57 B59 B63 B66', 'C7',
       'E34', 'C32', 'B18', 'C124', 'C91', 'E40', 'T', 'C128', 'D37',
       'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44', 'A34', 'C104', 'C111',
       'C92', 'E38', 'D21', 'E12', 'E63', 'A14', 'B37', 'C30', 'D20',
       'B79', 'E25', 'D46', 'B73', 'C95', 'B38', 'B39', 'B22', 'C86',
       'C70', 'A16', 'C101', 'C68', 'A10', 'E68', 'B41', 'A20', 'D19',
       'D50', 'D9', 'A23', 'B50', 'A26', 'D48', 'E58', 'C126', 'B71',
       'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63', 'C62 C64', 'E24',

In [16]:
df['CabinCode'] = df['Cabin'].map( lambda x: 'Z' if pd.isnull(x) else x[0].upper() );
#df.CabinCode

In [17]:
df['CabinCode'] = df['CabinCode'].map( {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'T': 8, 'Z': 9} ).astype(int)

In [18]:
df['Title'] = df.Name.map( lambda x : x.split(",")[1].split()[0] );
df.Title.unique()

array(['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Don.', 'Rev.', 'Dr.', 'Mme.',
       'Ms.', 'Major.', 'Lady.', 'Sir.', 'Mlle.', 'Col.', 'Capt.', 'the',
       'Jonkheer.'], dtype=object)

Unique titles for test data: array(['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Ms.', 'Col.', 'Rev.', 'Dr.',
       'Dona.'], dtype=object)

In [19]:
def mapTitle(x):
    if "Mr" in x:
        return 0;
    if "Mrs" in x:
        return 1;
    if "Miss" in x or "Ms" in x:
        return 2;
    if "Don" in x or "Dona" in x:
        return 3;
    if "Master" in x:
        return 4;
    if "Col" in x:
        return 5;
    if "Rev" in x:
        return 6;
    if "Dr" in x:
        return 7;
    else:
        return 8;

In [20]:
df.NameCode = df.Name.map(mapTitle);

In [21]:
df = df.drop(['Name', 'Title', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title'], axis=1) 
df = df.drop(['Age', 'Fare'], axis=1)
df = df.drop(['PassengerId'], axis=1)
df = df.dropna()

In [22]:
df.describe()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Gender,EmbarkedCode,AgeFill,FareFill,FamilySize,CabinCode
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.523008,0.381594,0.647587,0.368126,29.112424,32.204208,0.904602,7.488215
std,0.486592,0.836071,1.102743,0.806057,0.477989,0.647602,13.304424,49.693429,1.613459,2.866048
min,0.0,1.0,0.0,0.0,0.0,0.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,0.0,0.0,0.0,0.0,21.5,7.9104,0.0,9.0
50%,0.0,3.0,0.0,0.0,1.0,0.0,26.0,14.4542,0.0,9.0
75%,1.0,3.0,1.0,0.0,1.0,1.0,36.0,31.0,1.0,9.0
max,1.0,3.0,8.0,6.0,1.0,3.0,80.0,512.3292,10.0,9.0


In [23]:
df.dtypes
train_data = df.values

In [24]:
test_df = pd.read_csv('test.csv', header=0)
test_df['Gender'] = test_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
fill(test_df, 'Age', 'AgeFill', median_ages);
fill(test_df, 'Fare', 'FareFill', median_ages);
test_df['EmbarkedCode'] = test_df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2, np.nan: 3} ).astype(int)
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch']
test_df['CabinCode'] = test_df['Cabin'].map( lambda x: 'Z' if pd.isnull(x) else x[0].upper() );
test_df['CabinCode'] = test_df['CabinCode'].map( {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'T': 8, 'Z': 9} ).astype(int)

In [25]:
test_df['Title'] = test_df.Name.map( lambda x : x.split(",")[1].split()[0] );
test_df.Title.unique()

array(['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Ms.', 'Col.', 'Rev.', 'Dr.',
       'Dona.'], dtype=object)

In [26]:
test_df.NameCode = test_df.Name.map(mapTitle);

In [27]:
ids = test_df['PassengerId'].values
test_df = test_df.drop(['Name', 'Title', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age', 'PassengerId', 'Fare'], axis=1) 
test_df.describe()

Unnamed: 0,Pclass,SibSp,Parch,Gender,AgeFill,FareFill,EmbarkedCode,FamilySize,CabinCode
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,0.447368,0.392344,0.636364,29.357656,35.601765,0.464115,0.839713,7.54067
std,0.841838,0.89676,0.981429,0.481623,12.957412,55.84292,0.685516,1.519072,2.844409
min,1.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,21.5,7.8958,0.0,0.0,9.0
50%,3.0,0.0,0.0,1.0,25.0,14.4542,0.0,0.0,9.0
75%,3.0,1.0,0.0,1.0,36.0,31.471875,1.0,1.0,9.0
max,3.0,8.0,9.0,1.0,76.0,512.3292,2.0,10.0,9.0


In [28]:
test_df = test_df.dropna()
test_df

Unnamed: 0,Pclass,SibSp,Parch,Gender,AgeFill,FareFill,EmbarkedCode,FamilySize,CabinCode
0,3,0,0,1,34.5,7.8292,2,0,9
1,3,1,0,0,47.0,7.0000,0,1,9
2,2,0,0,1,62.0,9.6875,2,0,9
3,3,0,0,1,27.0,8.6625,0,0,9
4,3,1,1,0,22.0,12.2875,0,2,9
5,3,0,0,1,14.0,9.2250,0,0,9
6,3,0,0,0,30.0,7.6292,2,0,9
7,2,1,1,1,26.0,29.0000,0,2,9
8,3,0,0,0,18.0,7.2292,1,0,9
9,3,2,0,1,21.0,24.1500,0,2,9


In [29]:
test_data = test_df.values
test_data

array([[ 3.,  0.,  0., ...,  2.,  0.,  9.],
       [ 3.,  1.,  0., ...,  0.,  1.,  9.],
       [ 2.,  0.,  0., ...,  2.,  0.,  9.],
       ..., 
       [ 3.,  0.,  0., ...,  0.,  0.,  9.],
       [ 3.,  0.,  0., ...,  0.,  0.,  9.],
       [ 3.,  1.,  1., ...,  1.,  2.,  9.]])

In [30]:
# Import the random forest package
from sklearn.ensemble import RandomForestClassifier 

# Create the random forest object which will include all the parameters
# for the fit
forest = RandomForestClassifier(n_estimators = 100)

# Fit the training data to the Survived labels and create the decision trees
forest = forest.fit(train_data[0::,1::],train_data[0::,0])

# Take the same decision trees and run it on the test data
output = forest.predict(test_data).astype(int)

In [31]:
output

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [41]:
#Score on training data

# Create the random forest object which will include all the parameters
# for the fit
scoreforest = RandomForestClassifier(n_estimators = 100)

# Fit the training data to the Survived labels and create the decision trees
scoreforest = scoreforest.fit(train_data[175::,1::],train_data[175::,0])

# Take the same decision trees and run it on the test data
scoreforest.score(train_data[::174,1::],train_data[::174,0])

1.0

In [33]:
type(output)

numpy.ndarray

In [34]:
import csv as csv

predictions_file = open("random_forest_8_27_16.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(ids, output))
predictions_file.close()