# Building Predictive Models

In [8]:
import pandas as pd
import os
import numpy as np

## Import Data

In [9]:
# locate the processed train/test CSV files, relative to the notebook's parent directory
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_file_path, test_file_path = (
    os.path.join(processed_data_path, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [10]:
# read both processed splits, using the 'PassengerId' column as the row index
train_df, test_df = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (train_file_path, test_file_path)
)

In [11]:
train_df.info()
# the train data frame has 891 rows and 33 columns; one of these columns, 'Survived', is the output label,
# while the remaining 32 columns are the features used to build the model;

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Survived            891 non-null    int64  
 1   Age                 891 non-null    float64
 2   Fare                891 non-null    float64
 3   FamilySize          891 non-null    int64  
 4   IsMother            891 non-null    int64  
 5   IsMale              891 non-null    int64  
 6   Deck_A              891 non-null    int64  
 7   Deck_B              891 non-null    int64  
 8   Deck_C              891 non-null    int64  
 9   Deck_D              891 non-null    int64  
 10  Deck_E              891 non-null    int64  
 11  Deck_F              891 non-null    int64  
 12  Deck_G              891 non-null    int64  
 13  Deck_Z              891 non-null    int64  
 14  Pclass_1            891 non-null    int64  
 15  Pclass_2            891 non-null    int64  
 16  Pclass_3

In [12]:
test_df.info()
# in test data frame, we have 418 rows and 32 features; we need to predict the survival for these passengers;

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 418 non-null    float64
 1   Fare                418 non-null    float64
 2   FamilySize          418 non-null    int64  
 3   IsMother            418 non-null    int64  
 4   IsMale              418 non-null    int64  
 5   Deck_A              418 non-null    int64  
 6   Deck_B              418 non-null    int64  
 7   Deck_C              418 non-null    int64  
 8   Deck_D              418 non-null    int64  
 9   Deck_E              418 non-null    int64  
 10  Deck_F              418 non-null    int64  
 11  Deck_G              418 non-null    int64  
 12  Deck_Z              418 non-null    int64  
 13  Pclass_1            418 non-null    int64  
 14  Pclass_2            418 non-null    int64  
 15  Pclass_3            418 non-null    int64  
 16  Title

## Data Preparation

In [13]:
# create the input variable X and the output variable y;
# X: every column from 'Age' onwards (i.e. everything except 'Survived'), converted from a dataframe to a
# NumPy matrix with .to_numpy() (used instead of the older .as_matrix()) and cast element-wise to float;
# y: the 'Survived' column as a flat one-dimensional NumPy array; .to_numpy() on a single column is already
# one-dimensional, so the deprecated Series.ravel() call is not needed;
X = train_df.loc[:, 'Age':].to_numpy().astype('float')
y = train_df['Survived'].to_numpy()

In [14]:
# use the .shape attribute to see the shape of X (891 rows and 32 columns) and y (891 rows);
# it is a common convention to use an uppercase name for a matrix or multi-dimensional array and a lowercase name
# for a one-dimensional array, called a vector;

print (X.shape, y.shape)

(891, 32) (891,)


In [15]:
# train/test split -> divide X and y into two parts:
# 1. train data -> used to fit the model
# 2. test data -> held back to evaluate the fitted model's performance
# train_test_split comes from scikit-learn; test_size=0.2 keeps 20% of the labelled data for model evaluation
# and leaves the other 80% for model training; a fixed random_state=0 means the same split is produced every
# time this cell is executed;
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(712, 32) (712,)
(179, 32) (179,)


In [16]:
# average survival (share of the positive class) in the train and test splits
# roughly 38% of the outcomes are positive in both splits; ideally the positive cases are spread evenly
# between train and test, which is the case here.
# the remaining ~62% are negative cases, so the two classes are somewhat imbalanced; in some cases such an
# imbalance can make it harder to evaluate the model properly.
print('mean survival in train : {0:.3f}'.format(y_train.mean()))
print('mean survival in test : {0:.3f}'.format(y_test.mean()))

mean survival in train : 0.383
mean survival in test : 0.385


#### Check Scikit-Learn Version

In [17]:
import sklearn

In [18]:
sklearn.__version__

'0.23.2'

In [19]:
# the baseline model below uses DummyClassifier from the sklearn.dummy module; this notebook assumes
# scikit-learn 0.19 or newer, so run the update command below only if the version printed above is older;
# !conda update -y scikit-learn

## Baseline Model

In [20]:
# import function DummyClassifier in order to build baseline classification model
from sklearn.dummy import DummyClassifier

In [21]:
# create the model object with the 'most_frequent' strategy: it always predicts the most frequent class seen in training, in our case 0 (not survived);
model_dummy = DummyClassifier (strategy='most_frequent', random_state=0)

In [22]:
# train model by using .fit function on the model object;
# X_train is the INPUT data;
# y_train is the OUTPUT data;
# we use X_train and y_train only; X_test and y_test are held back for evaluating the model;
model_dummy.fit(X_train, y_train)

DummyClassifier(random_state=0, strategy='most_frequent')

In [23]:
# use the .score method to evaluate the model performance on the test data;
# we pass the test data to evaluate the performance; the model will first predict the output for X_test,
# then it will compare the predicted output with the actual output y_test;
# for a classification model, the default score is the model accuracy;
# this is our baseline accuracy of 61%;
# we can also calculate the accuracy explicitly with accuracy_score, along with other metrics such as the confusion matrix...
print ('score for baseline model:{0:.2f}'.format(model_dummy.score(X_test, y_test)))

score for baseline model:0.61


In [24]:
# performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
# y_test is the actual output;
# model_dummy.predict(X_test) is the predicted output;
# these are the same for all metrics below;

In [25]:
# accuracy score
print ('accuracy for baseline model:{0:.2f}'.format(accuracy_score(y_test, model_dummy.predict(X_test))))

accuracy for baseline model:0.61


In [26]:
# confusion matrix
print ('confusion matrix for baseline model: \n {0}'.format(confusion_matrix(y_test, model_dummy.predict(X_test))))

confusion matrix for baseline model: 
 [[110   0]
 [ 69   0]]


In [27]:
# precision and recall scores
# the baseline never predicts the positive class, so precision is 0/0 (undefined); zero_division=0 reports it
# explicitly as 0.0 instead of emitting an UndefinedMetricWarning; recall stays well-defined because y_test
# does contain positive cases
print ('precision for baseline model: {0:.2f}'.format(precision_score(y_test, model_dummy.predict(X_test), zero_division=0)))
print ('recall for baseline model: {0:.2f}'.format(recall_score(y_test, model_dummy.predict(X_test))))

precision for baseline model: 0.00
recall for baseline model: 0.00


  _warn_prf(average, modifier, msg_start, len(result))


## First Kaggle Submission

In [28]:
# preparing the data, converting to the matrix with .to_numpy()
test_X = test_df.to_numpy().astype('float')

In [29]:
# get predictions by using the .predict method and save it as 'predictions' variable
predictions = model_dummy.predict(test_X)

In [30]:
# we need to attach the predictions to the 'PassengerId'
# we create pandas dataframe and extract the 'PassengerId' from the .index value of the test data 
# and create the column of 'Survived' where we use predictions;
df_submission = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions})

In [31]:
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [32]:
# save the file in the file path and according to the file name
submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
submission_file_path = os.path.join(submission_data_path, '01_dummy.csv')

In [33]:
# create .csv file and set the index to False (no added indexed column)
df_submission.to_csv(submission_file_path, index=False)

In [34]:
# put all of the steps of creating the submission file into one single function 'get_submission_file'
def get_submission_file (model, filename, data=None, output_dir=None):
    """Predict survival with `model` and write a Kaggle submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``predict(X)`` method.
    filename : str
        Name of the CSV file to create inside the output directory.
    data : pandas.DataFrame, optional
        Feature frame to predict on; its index supplies the 'PassengerId'
        column. Defaults to the notebook-level ``test_df``.
    output_dir : str, optional
        Directory the file is written to. Defaults to ``../data/external``.
        It is created if it does not exist yet.

    Returns
    -------
    None
        The function is called for its side effect of writing the file.
    """
    # fall back to the notebook-level test frame only when no frame is passed explicitly
    if data is None:
        data = test_df
    if output_dir is None:
        output_dir = os.path.join(os.path.pardir, 'data', 'external')
    # converting to the matrix
    test_X = data.to_numpy().astype('float')
    # make predictions
    predictions = model.predict(test_X)
    # submission dataframe
    df_submission = pd.DataFrame({'PassengerId': data.index, 'Survived': predictions})
    # submission file; make sure the target directory exists so to_csv does not fail on a fresh checkout
    os.makedirs(output_dir, exist_ok=True)
    submission_file_path = os.path.join(output_dir, filename)
    # write to the file, without the extra index column
    df_submission.to_csv(submission_file_path, index=False)

In [35]:
# get submission file
get_submission_file(model_dummy, '01_dummy.csv')

## Logistic Regression Model

In [46]:
# import library sci-kit learn and the function for LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [47]:
# create model
# NOTE(review): `clf` is not used by any later cell visible here - confirm whether it can be removed
clf = LogisticRegression(solver='liblinear')
# create the logistic regression model object that is trained and evaluated in the cells below:
model_lr_1 = LogisticRegression(random_state=0)

In [None]:
# train model with .fit function on the model object:
model_lr_1.fit(X_train, y_train)

In [50]:
# evaluate model -> as the model get trained, we evaluate the model performance by using the .score function and 
# by passing the test data to it;
# our predicted model is better than the baseline model;
print ('score for logistic regression - version 1 : {0:.2f}'.format(model_lr_1.score(X_test, y_test)))

score for logistic regression - version 1 : 0.83


In [53]:
# performance metrics for logistic regression - version 1
# predict once on the held-out split and reuse the predictions for every metric below
y_pred_lr_1 = model_lr_1.predict(X_test)
# accuracy
print('accuracy for logistic regression - version 1: {0:.2f}'.format(accuracy_score(y_test, y_pred_lr_1)))
# confusion matrix
print('confusion matrix for logistic regression - version 1: \n {0}'.format(confusion_matrix(y_test, y_pred_lr_1)))
# precision
print('precision for logistic regression - version 1: {0:.2f}'.format(precision_score(y_test, y_pred_lr_1)))
# recall
print('recall for logistic regression - version 1: {0:.2f}'.format(recall_score(y_test, y_pred_lr_1)))

accuracy for logistic regression - version 1: 0.83
confusion matrix for logistic regression - version 1: 
 [[95 15]
 [15 54]]
precision for logistic regression - version 1: 0.78
recall for logistic regression - version 1: 0.78


In [54]:
# extract the model coefficients (or model weights or parameters)
model_lr_1.coef_

array([[-0.02367032,  0.00459391, -0.45856325,  0.42774923, -0.74536247,
         0.07698167, -0.04810058, -0.32375099,  0.45165998,  0.95470524,
         0.23766785, -0.03128023, -0.36468204,  0.81144645,  0.46934547,
        -0.32759101,  0.09309015,  1.11451687,  0.52592334, -1.56220423,
         1.07954989, -0.1103208 , -0.18735432,  0.12498004,  0.21616361,
         0.2329452 ,  0.37911205,  0.39268022,  0.47166964,  0.08885105,
         0.36215285,  0.59104806]])

## Second Kaggle Submission

In [55]:
# get submission file
get_submission_file(model_lr_1, '02_lr.csv')