# Machine Learning Approach

First we need to download the data from Kaggle using the kaggle-cli:

$ kg download -c titanic
$ kg submit mypredictions.csv -c titanic -m "My submission msg"   #when we want to submit

PassengerId -- A numerical id assigned to each passenger.
Survived -- Whether the passenger survived (1), or didn't (0). We'll be making predictions for this column.
Pclass -- The class the passenger was in -- first class (1), second class (2), or third class (3).
Name -- the name of the passenger.
Sex -- The gender of the passenger -- male or female.
Age -- The age of the passenger. Fractional.
SibSp -- The number of siblings and spouses the passenger had on board.
Parch -- The number of parents and children the passenger had on board.
Ticket -- The ticket number of the passenger.
Fare -- How much the passenger paid for the ticket.
Cabin -- Which cabin the passenger was in.
Embarked -- Where the passenger boarded the Titanic.

In [316]:
import pandas as pd
import numpy as np

# Relative folder that holds the Kaggle CSV files.
DATA_DIR = "data/"

# Load the raw training set; header=0 takes the column names from the first row.
train_csv_path = DATA_DIR + 'train.csv'
orig_data_frame = pd.read_csv(train_csv_path, header=0)

### Initial Exploring

In [317]:
# Work on a deep copy so the frame loaded from disk stays untouched.
df = orig_data_frame.copy(deep=True)
df.head()  # not the last expression of the cell, so its value is discarded
df.info()  # prints the dtype and non-null count of every column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


### Add interesting columns

In [318]:
# Number of relatives travelling with the passenger
# (siblings/spouses + parents/children).
df['FamilySize'] = df['SibSp'] + df['Parch']

# Flag passengers whose name carries one of a few uncommon titles.
# str.contains() interprets its pattern as a regular expression, so the dot
# must be escaped: an unescaped "Dr." also matches "Dr" followed by ANY
# character (e.g. the surname "Drew"). \b stops a match from starting in the
# middle of a word, and the non-capturing group avoids pandas' warning about
# match groups. na=False treats a missing name as "no title".
FANCY_TITLE_PATTERN = r"\b(?:Master|Rev|Dr|Sir)\."
df['FancyName'] = df.Name.str.contains(FANCY_TITLE_PATTERN, na=False).astype(int)

df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,FancyName
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0


### Map Strings To Numbers

In [319]:
# Peek at the distinct embarkation codes (value is discarded mid-cell).
df['Embarked'].unique()

# Encode the two categorical columns as numbers. Entries that are not a key
# of the mapping (including missing values) become NaN.
port_codes = {'S': 0, 'C': 1, 'Q': 2}
sex_codes = {'male': 0, 'female': 1}
df['EmbarkedCode'] = df['Embarked'].map(port_codes)
df['Gender'] = df['Sex'].map(sex_codes)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,FancyName,EmbarkedCode,Gender
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,0,0.0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,1.0,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,0.0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0,0.0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,0.0,0


### Handle Missing Values

In [320]:
# Looks like Age and Embarked have Nulls (Cabin does too, but it is dropped later).

# Set missing Embarked to the most common location. The code is looked up
# rather than hard-coded so the fill stays correct if the data changes.
most_common_port = df['EmbarkedCode'].mode()[0]
df = df.fillna(value={"EmbarkedCode": most_common_port})

# Median age of the passengers whose age is known, per (Sex, Pclass).
# Kept as separate Series for inspection.
female_stats = df[(df.Sex=="female") & (df.Age.notnull())].groupby(['Sex','Pclass']).Age.median()
male_stats = df[(df.Sex=="male") & (df.Age.notnull())].groupby(['Sex','Pclass']).Age.median()

# Fill every missing Age with the median of the passenger's OWN sex and class.
# transform('median') returns one value per row, aligned on df's index, so no
# Python-level loop or positional lookup into the stats Series is needed
# (the previous loop used xrange and filled males from the female medians).
group_median_age = df.groupby(['Sex', 'Pclass'])['Age'].transform('median')
df['Age'] = df['Age'].fillna(group_median_age)

df.isnull().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin           687
Embarked          2
FamilySize        0
FancyName         0
EmbarkedCode      0
Gender            0
dtype: int64

### Remove columns we don't need

In [321]:
# Keep a snapshot that still has every column before trimming df.
final = df.copy()

# Free-text, mostly-empty and identifier columns, plus the two string columns
# that already have numeric counterparts (Embarked -> EmbarkedCode,
# Sex -> Gender).
UNUSED_COLUMNS = ['Ticket', 'Cabin', 'Name', 'Embarked', 'Sex', 'PassengerId']

# Name the labels by keyword: the positional-axis form df.drop('Ticket', 1)
# is rejected by pandas 2.x.
df = df.drop(columns=UNUSED_COLUMNS)

df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,FamilySize,FancyName,EmbarkedCode,Gender
0,0,3,22.0,1,0,7.25,1,0,0.0,0
1,1,1,38.0,1,0,71.2833,1,0,1.0,1
2,1,3,26.0,0,0,7.925,0,0,0.0,1
3,1,1,35.0,1,0,53.1,1,0,0.0,1
4,0,3,35.0,0,0,8.05,0,0,0.0,0


### Convert to Floats

In [322]:
# df.shape is (rows, columns); the second entry is the number of columns.
column_count = df.shape[1]
df.dtypes

Survived          int64
Pclass            int64
Age             float64
SibSp             int64
Parch             int64
Fare            float64
FamilySize        int64
FancyName         int64
EmbarkedCode    float64
Gender            int64
dtype: object

In [323]:
# Cast every column to 64-bit float in a single call. astype() returns a new
# frame, so the previous frame is not modified. The old `np.float` alias
# (plain Python float, i.e. float64) was removed in NumPy 1.24, hence the
# explicit np.float64.
df = df.astype(np.float64)

# Most frequent embarkation code, as a sanity check on the fill value.
df.EmbarkedCode.mode()

0    0.0
dtype: float64

### Fare Type

In [324]:
#Bin fare types or age ranges? 

### Putting it all together

In [369]:
import pandas as pd
import numpy as np

DATA_DIR = "data/"

def prepare_data(filename, data_dir=None):
    """Load a Titanic CSV and convert it into an all-numeric feature matrix.

    Steps, in order:
      1. Encode ``Embarked`` and ``Sex`` as the numeric columns
         ``EmbarkedCode`` (S=0, C=1, Q=2) and ``Gender`` (male=0, female=1).
      2. Fill missing ``EmbarkedCode`` with the column's most frequent value
         and missing ``Fare`` with the column mean.
      3. Fill missing ``Age`` with the median age of passengers of the same
         ``Sex`` and ``Pclass``.
      4. Drop ``Ticket``, ``Cabin``, ``Name``, ``Embarked``, ``Sex`` and
         ``PassengerId`` and cast what is left to float32.

    All statistics are computed from the file being loaded.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``data_dir``.
    data_dir : str, optional
        Directory containing the file. Defaults to the module-level
        ``DATA_DIR``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The float32 matrix of the remaining columns, in the column order of
        the CSV followed by ``EmbarkedCode`` and ``Gender``, and the
        ``PassengerId`` values of the rows.
    """
    import os

    if data_dir is None:
        # Resolved at call time so a later change to DATA_DIR is honoured.
        data_dir = DATA_DIR
    df = pd.read_csv(os.path.join(data_dir, filename), header=0)

    # The FamilySize and FancyName features were tried here and did not help,
    # so they are intentionally left out.

    # Map strings to numbers
    df['EmbarkedCode'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    df['Gender'] = df['Sex'].map({'male': 0, 'female': 1})

    # Set missing Embarked + Fare
    df = df.fillna(value={
        "EmbarkedCode": df.EmbarkedCode.mode()[0],
        "Fare": df.Fare.mean(),
    })

    # Set missing Age based on Sex and Pclass. transform('median') yields,
    # for every row, the median Age of that row's own (Sex, Pclass) group,
    # so males are filled from male medians and females from female medians.
    group_median_age = df.groupby(['Sex', 'Pclass'])['Age'].transform('median')
    df['Age'] = df['Age'].fillna(group_median_age)

    # Drop unnecessary columns (the ids are returned separately)
    passenger_ids = df['PassengerId'].values
    df = df.drop(['Ticket', 'Cabin', 'Name', 'Embarked', 'Sex', 'PassengerId'], axis=1)

    # Convert everything to floats
    return df.astype(np.float32).values, passenger_ids
    
# prepare_data returns a (matrix, passenger_ids) pair, so despite its name
# this variable holds that whole tuple.
trained_data_matrix = prepare_data(filename="train.csv")
trained_data_matrix

(array([[  0.        ,   3.        ,  22.        , ...,   7.25      ,
           0.        ,   0.        ],
        [  1.        ,   1.        ,  38.        , ...,  71.28330231,
           1.        ,   1.        ],
        [  1.        ,   3.        ,  26.        , ...,   7.92500019,
           0.        ,   1.        ],
        ..., 
        [  0.        ,   3.        ,  21.5       , ...,  23.45000076,
           0.        ,   1.        ],
        [  1.        ,   1.        ,  26.        , ...,  30.        ,
           1.        ,   0.        ],
        [  0.        ,   3.        ,  32.        , ...,   7.75      ,
           2.        ,   0.        ]], dtype=float32),
 array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  

In [370]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so that re-running the notebook rebuilds the same forest and
# therefore the same predictions.
SEED = 42

train_data, train_pids = prepare_data("train.csv")
test_data, test_pids = prepare_data("test.csv")

# Survived must be the 1st column of the training matrix: column 0 is used as
# the label and every remaining column as a feature.
y_train = train_data[:, 0]
X_train = train_data[:, 1:]

# The test matrix is passed to predict() as-is, so it must have exactly the
# feature columns the forest was trained on.
assert X_train.shape[1] == test_data.shape[1], "train/test feature columns differ"

# Create the random forest object which will include all the parameters
forest = RandomForestClassifier(n_estimators=100, random_state=SEED)

# Fit the training data to the Survived labels and create the decision trees
forest = forest.fit(X_train, y_train)

# Take the same decision trees and run them on the test data
predictions = forest.predict(test_data).astype(int)

### Create predictions file

In [371]:
# One row per test passenger: the id and the predicted 0/1 Survived label.
submission_columns = {"PassengerId": test_pids, "Survived": predictions}
predictions_df = pd.DataFrame(submission_columns)
predictions_df.head()

# index=False keeps the DataFrame's row index out of the written file.
submission_path = DATA_DIR + "forestpredictions.csv"
predictions_df.to_csv(submission_path, index=False)