# Building Predictive Models

In [1]:
import pandas as pd
import os
import numpy as np

## Import Data

In [2]:
# locate the processed data directory: one level above the notebook, under data/processed
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
# build the full paths of the train and test CSV files inside that directory
train_file_path, test_file_path = (
    os.path.join(processed_data_path, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# read the processed train and test CSV files (train first, then test), indexing both frames by 'PassengerId'
train_df, test_df = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (train_file_path, test_file_path)
)

In [4]:
train_df.info()
# the train data frame has 891 rows and 33 columns; of these 33 columns, 'Survived' is the output label,
# while the remaining 32 columns are the features that will be used to build the model;

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Survived            891 non-null    int64  
 1   Age                 891 non-null    float64
 2   Fare                891 non-null    float64
 3   FamilySize          891 non-null    int64  
 4   IsMother            891 non-null    int64  
 5   IsMale              891 non-null    int64  
 6   Deck_A              891 non-null    int64  
 7   Deck_B              891 non-null    int64  
 8   Deck_C              891 non-null    int64  
 9   Deck_D              891 non-null    int64  
 10  Deck_E              891 non-null    int64  
 11  Deck_F              891 non-null    int64  
 12  Deck_G              891 non-null    int64  
 13  Deck_Z              891 non-null    int64  
 14  Pclass_1            891 non-null    int64  
 15  Pclass_2            891 non-null    int64  
 16  Pclass_3

In [5]:
test_df.info()
# the test data frame has 418 rows and 32 features; these are the passengers whose survival we need to predict;

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 418 non-null    float64
 1   Fare                418 non-null    float64
 2   FamilySize          418 non-null    int64  
 3   IsMother            418 non-null    int64  
 4   IsMale              418 non-null    int64  
 5   Deck_A              418 non-null    int64  
 6   Deck_B              418 non-null    int64  
 7   Deck_C              418 non-null    int64  
 8   Deck_D              418 non-null    int64  
 9   Deck_E              418 non-null    int64  
 10  Deck_F              418 non-null    int64  
 11  Deck_G              418 non-null    int64  
 12  Deck_Z              418 non-null    int64  
 13  Pclass_1            418 non-null    int64  
 14  Pclass_2            418 non-null    int64  
 15  Pclass_3            418 non-null    int64  
 16  Title

## Data Preparation

In [8]:
# create the input variable X and the output variable y;
# X: take every column from 'Age' onwards, which excludes 'Survived' (the first column of train_df); convert the data frame
# to a NumPy matrix with .to_numpy() (the replacement for .as_matrix()) and cast every element to float;
# y: take the 'Survived' column as a one-dimensional NumPy array; Series.to_numpy() already returns a flat 1-D array, so
# the deprecated Series.ravel() call (deprecated since pandas 2.2) is not needed;
X = train_df.loc[:, 'Age':].to_numpy().astype('float')
y = train_df['Survived'].to_numpy()

In [10]:
# use the .shape attribute to check the shape of X (891 rows and 32 columns) and of y (891 entries);
# it is a common convention to use an uppercase name for a matrix (multi-dimensional array) and a lowercase name for a
# one-dimensional array, called a vector;

print (X.shape, y.shape)

(891, 32) (891,)


In [11]:
# train/test split -> divide the labelled data (X, y) into two parts:
# 1. train part (X_train, y_train) -> used to fit the model
# 2. test part (X_test, y_test)    -> used to evaluate the performance of the trained model
# first, import the 'train_test_split' function from scikit-learn; then call it with the arrays X and y
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,   # hold out 20% of the labelled data for evaluation; the other 80% is used for training
    random_state=0,  # fixed seed, so re-running this cell always produces the same split
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(712, 32) (712,)
(179, 32) (179,)


In [12]:
# average survival rate in the train and test splits
# about 38% of the outcomes are positive, and the rate is nearly the same in both splits; ideally, positive cases
# should be evenly distributed between the train and test data.
# also, only about 38% of the passengers belong to the positive class, while the remaining 62% belong to the negative
# class, so the classes are somewhat imbalanced. In some cases this can make it harder to evaluate the model properly.
print('mean survival in train : {0:.3f}'.format(y_train.mean()))
print('mean survival in test : {0:.3f}'.format(y_test.mean()))

mean survival in train : 0.383
mean survival in test : 0.385


#### Check Scikit-Learn Version

In [13]:
import sklearn

In [15]:
# display the installed scikit-learn version
sklearn.__version__

'0.23.2'

In [None]:
# we will need DummyClassifier from the sklearn.dummy module, which is available in scikit-learn 0.19 onwards;
# if the installed version is older, update scikit-learn with the command below:
# !conda update -y scikit-learn