# Making predictions

### Import necessary packages

In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

### Read data into pandas Dataframe

In [5]:
X = pd.read_csv('datasets/test.csv', index_col='PassengerId')
X.shape

(418, 10)

### Preprocessing data

In [6]:
# Convert Pclass dtype to categorical
X.Pclass = X.Pclass.astype('category', ordered=False)

#Fill Embarked column with 'None' where NA
X.Cabin.fillna('None', inplace=True)

#Fill null values in numeric columns
numeric_columns = X.select_dtypes(include='number').columns.values
X[numeric_columns] = X[numeric_columns].fillna(X[numeric_columns].median())

#Fill null values in string columns
categorical_columns = X.select_dtypes(exclude='number').columns.values
X[categorical_columns] = X[categorical_columns].apply(lambda x:x.fillna(x.value_counts().index[0]))

X.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Feature engineering

In [7]:
# Combine SibSp and ParCh into single column FamilySize
X['FamilySize'] = X.SibSp + X.Parch + 1

# Define IsAlome from FamilySize
X['IsAlone'] = (X.FamilySize == 1).astype('category', ordered=False)

# Convert Age to a categorical variable
X.Age.fillna(X.Age.median(), inplace=True)
X['AgeCategory'] = pd.cut(X.Age, bins=5, labels=np.arange(5)).astype('category', ordered=False)

# Convert Fare to a categorical variable
X.Fare.fillna(X.Age.median(), inplace=True)
X['FareCategory'] = pd.cut(X.Age, bins=4, labels=np.arange(4)).astype('category', ordered=False)

# Create column HasCabin from Cabin
X['HasCabin'] = (X.Cabin != 'None').astype('category', ordered=False)

X.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone,AgeCategory,FareCategory,HasCabin
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1,True,2,1,False
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,2,False,3,2,False
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1,True,4,3,False
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1,True,1,1,False
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,3,False,1,1,False


### Dropping unwanted features

In [8]:
X.drop(['Name', 'Age', 'SibSp', 'Parch', 'Cabin', 'Ticket', 'Fare'], axis='columns', inplace=True)
X.head()

Unnamed: 0_level_0,Pclass,Sex,Embarked,FamilySize,IsAlone,AgeCategory,FareCategory,HasCabin
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,male,Q,1,True,2,1,False
893,3,female,S,2,False,3,2,False
894,2,male,Q,1,True,4,3,False
895,3,male,S,1,True,1,1,False
896,3,female,S,3,False,1,1,False


#### Encoding

In [9]:
X.Sex = LabelEncoder().fit_transform(X.Sex)
X.Embarked = LabelEncoder().fit_transform(X.Embarked)
X.Embarked = X.Embarked.astype('category', ordered=False)
X.IsAlone = LabelEncoder().fit_transform(X.IsAlone)
X.HasCabin = LabelEncoder().fit_transform(X.HasCabin)
X_encoded = pd.concat((X[['Sex', 'FamilySize', 'IsAlone', 'HasCabin']], 
           pd.get_dummies(X[['Pclass', 'Embarked', 'AgeCategory', 'FareCategory']])), axis=1)
X_encoded.head()

Unnamed: 0_level_0,Sex,FamilySize,IsAlone,HasCabin,Pclass_1,Pclass_2,Pclass_3,Embarked_0,Embarked_1,Embarked_2,AgeCategory_0,AgeCategory_1,AgeCategory_2,AgeCategory_3,AgeCategory_4,FareCategory_0,FareCategory_1,FareCategory_2,FareCategory_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
892,1,1,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0
893,0,2,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0
894,1,1,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1
895,1,1,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0
896,0,3,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0


In [13]:
X_final = X_encoded.drop('AgeCategory_4', axis='columns')
X_final.shape

(418, 18)

### Load the model and make predictions

In [20]:
model = joblib.load('model.pkl')
predictions = model.predict(X_final)
output_df = pd.DataFrame({
    'PassengerId': X.index.values, 
    'Survived': predictions})

  if diff:


In [22]:
output_df.to_csv('submit.csv',index=False)