In [1]:
import pandas as pd
import numpy as np
from keras.layers import *
from keras.models import Model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import accuracy_score

Using TensorFlow backend.
  from numpy.core.umath_tests import inner1d


In [2]:
#Load Data
train_data = pd.read_csv('data/train.csv')
eval_data = pd.read_csv('data/test.csv')

print(train_data.info())
print("-"*10)
print(eval_data.info())
print(train_data.head())

#save pId's for eval data
pid = eval_data['PassengerId']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           41

In [3]:
# Check number of null and non null in data
print('Train columns with null values:\n', train_data.isnull().sum())
print("-"*10)

print('Test/Validation columns with null values:\n', train_data.isnull().sum())
print("-"*10)

Train columns with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
----------
Test/Validation columns with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
----------


In [6]:
def preprocess_data(dataset):
    #replace null value of age by median of all non-null age in dataset
    dataset['Age'].fillna(dataset['Age'].median(), inplace = True)
    
    #replace null value of stations by mode of the data (since it is categorical data)
    dataset['Embarked'].fillna(dataset['Embarked'].mode()[0], inplace = True)
    
    dataset['Fare'].fillna(dataset['Fare'].median(), inplace = True)
    
    
    
    #Add additional features to data
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['isAlone'] = 1
    dataset['isAlone'].loc[dataset['FamilySize'] > 1] = 0
    
    dataset['FareBin'] = pd.qcut(dataset['Fare'].astype(int), 4)
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)
    
    
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
    stat_min = 10
    title_names = (dataset['Title'].value_counts() < stat_min)
    dataset['Title'] = dataset['Title'].apply(lambda x: 'Misc' if title_names.loc[x] == True else x)
    print(dataset['Title'].value_counts())
    print("-"*10)
    
    
    #Convert categorical data into int
    dataset['Sex'] = pd.Categorical(dataset.Sex).codes
    dataset['Embarked'] = pd.Categorical(dataset.Embarked).codes
    dataset['Title'] = pd.Categorical(dataset.Title).codes
    
    
    
    #Convert range into numerical
    label = preprocessing.LabelEncoder()
    
    dataset['AgeBin_Code'] = label.fit_transform(dataset['AgeBin'])
    dataset['FareBin_Code'] = label.fit_transform(dataset['FareBin'])
    
    #Drop cols Name, PassengerID, and Cabin since they are most likely not responsible in survival
    drop_cols = ['Name', 'PassengerId', 'Cabin', 'Ticket', 'FareBin', 'AgeBin']
    dataset.drop(drop_cols, axis=1, inplace=True)
    
    return dataset

In [7]:
train_data = preprocess_data(train_data)
eval_data = preprocess_data(eval_data)

# Again check number of null values in data
print('Train columns with null values:\n', train_data.isnull().sum())
print("-"*10)

print('Test/Validation columns with null values:\n', train_data.isnull().sum())
print("-"*10)

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64
----------
Mr        240
Miss       78
Mrs        72
Master     21
Misc        7
Name: Title, dtype: int64
----------
Train columns with null values:
 Survived        0
Pclass          0
Sex             0
Age             0
SibSp           0
Parch           0
Fare            0
Embarked        0
FamilySize      0
isAlone         0
Title           0
AgeBin_Code     0
FareBin_Code    0
dtype: int64
----------
Test/Validation columns with null values:
 Survived        0
Pclass          0
Sex             0
Age             0
SibSp           0
Parch           0
Fare            0
Embarked        0
FamilySize      0
isAlone         0
Title           0
AgeBin_Code     0
FareBin_Code    0
dtype: int64
----------


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [9]:
print(train_data.shape)
train_data.head()

(891, 13)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,isAlone,Title,AgeBin_Code,FareBin_Code
0,0,3,1,22.0,1,0,7.25,2,2,0,3,1,0
1,1,1,0,38.0,1,0,71.2833,0,2,0,4,2,3
2,1,3,0,26.0,0,0,7.925,2,1,1,2,1,0
3,1,1,0,35.0,1,0,53.1,2,2,0,4,2,3
4,0,3,1,35.0,0,0,8.05,2,1,1,3,2,1


In [10]:
X_train = train_data.loc[:, 'Pclass':]
Y_train = train_data['Survived']
X_train.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,isAlone,Title,AgeBin_Code,FareBin_Code
0,3,1,22.0,1,0,7.25,2,2,0,3,1,0
1,1,0,38.0,1,0,71.2833,0,2,0,4,2,3
2,3,0,26.0,0,0,7.925,2,1,1,2,1,0


In [11]:
Y_train.head()
#type(Y_train)

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=0)

In [13]:
print(type(X_train))
print(type(Y_train))
Y_train.head()

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


140    0
439    0
817    0
378    0
491    0
Name: Survived, dtype: int64

In [14]:
parameters = {'bootstrap': True,
              'min_samples_leaf': 3,
              'n_estimators': 200, 
              'min_samples_split': 10,
              'max_features': 'sqrt',
              'max_depth': 6,
              'max_leaf_nodes': None}

In [15]:
RF_model = RandomForestClassifier(**parameters)

In [16]:
RF_model.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
RF_predictions = RF_model.predict(X_test)

In [18]:
score = accuracy_score(Y_test ,RF_predictions)
print(score)

0.8491620111731844


In [19]:
eval_preds = RF_model.predict(eval_data)

In [20]:
eval_preds

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [21]:
df = pd.DataFrame(eval_preds)

In [22]:
df.head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [23]:
df['PassengerId'] = pid

In [24]:
df.columns = ['Survived', 'PassengerId']

In [25]:
df.head()

Unnamed: 0,Survived,PassengerId
0,0,892
1,0,893
2,0,894
3,0,895
4,0,896


In [27]:
df.to_csv('data/random_forest_v4.csv', index=False)