In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [12]:
import os
%matplotlib inline

## Import Data

In [13]:
processed_data_path = os.path.join(os.pardir, 'data', 'processed')

train_data = os.path.join(processed_data_path, 'train.csv',)
test_data = os.path.join(processed_data_path, 'test.csv')

## Make Respecetive Data Frames

In [14]:
train_df = pd.read_csv(train_data, index_col="PassengerId")
test_df = pd.read_csv(test_data, index_col="PassengerId")

In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 24 columns):
Unnamed: 0                 891 non-null int64
Age                        891 non-null float64
Fare                       891 non-null float64
isMother                   891 non-null int64
FamilySize                 891 non-null int64
IsMale                     891 non-null int64
Pclass_1                   891 non-null int64
Pclass_2                   891 non-null int64
Pclass_3                   891 non-null int64
Title_Master               891 non-null int64
Title_Miss                 891 non-null int64
Title_Mr                   891 non-null int64
Title_Mrs                  891 non-null int64
Title_Sir                  891 non-null int64
Fare_Bin_very low          891 non-null int64
Fare_Bin_low               891 non-null int64
Fare_Bin_high              891 non-null int64
Fare_Bin_extremely high    891 non-null int64
Embarked_C                 891 non-null int64
Embarked_Q       

In [16]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 23 columns):
Unnamed: 0                 418 non-null int64
Age                        418 non-null float64
Fare                       418 non-null float64
isMother                   418 non-null int64
FamilySize                 418 non-null int64
IsMale                     418 non-null int64
Pclass_1                   418 non-null int64
Pclass_2                   418 non-null int64
Pclass_3                   418 non-null int64
Title_Master               418 non-null int64
Title_Miss                 418 non-null int64
Title_Mr                   418 non-null int64
Title_Mrs                  418 non-null int64
Title_Sir                  418 non-null int64
Fare_Bin_very low          418 non-null int64
Fare_Bin_low               418 non-null int64
Fare_Bin_high              418 non-null int64
Fare_Bin_extremely high    418 non-null int64
Embarked_C                 418 non-null int64
Embarked_Q    

# Data Preparation

In [17]:
# prepare two arrays, one input array X and one output array Y
# unlike python SLICES, PANDAS SLICES include BOTH [start:End]
# USE UPPERCASE FOR MATRIX & LOWERCASE FOR VECTOR
X = train_df.loc[:, :"AgeState_Child"].as_matrix().astype('float')

In [18]:
# ravel is numpy function which converts it into 1-dimensional array
y = train_df.Survived.ravel()

In [19]:
print("Shape of X:{}    Shape of y:{}".format(X.shape, y.shape))

Shape of X:(891, 23)    Shape of y:(891,)


In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [22]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(712, 23) (712,) (179, 23) (179,)


In [23]:
# our train data must be balanced i.e 50% +ve value, 50% -ve value
# even some time this is not possible, but it must be looked for
print("Y test mean {}".format(np.mean(y_test)))
print("Y train mean {}".format(np.mean(y_train)))

Y test mean 0.3854748603351955
Y train mean 0.38342696629213485


# Baseline Model

In [24]:
from sklearn.dummy import DummyClassifier

In [25]:
model_dummy = DummyClassifier(strategy="most_frequent", random_state=0)

In [26]:
model_dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [27]:
# Model ACCURACY
model_dummy.score(X_test, y_test)

0.6145251396648045

## Performance Metrics

In [28]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [29]:
print("Accuracy of Baseline model is : {}".format(accuracy_score(y_test, model_dummy.predict(X_test))))

Accuracy of Baseline model is : 0.6145251396648045


In [30]:
print("Precision of Baseline model is : {}".format(precision_score(y_test, model_dummy.predict(X_test))))

Precision of Baseline model is : 0.0


  'precision', 'predicted', average, warn_for)


In [31]:
print("Recall of Baseline model is : {}".format(recall_score(y_test, model_dummy.predict(X_test))))

Recall of Baseline model is : 0.0


In [32]:
print("Confusion Matrix of Baseline model is : {}".format(confusion_matrix(y_test, model_dummy.predict(X_test))))

Confusion Matrix of Baseline model is : [[110   0]
 [ 69   0]]


# First Kaggle Submission

In [33]:
test_X = test_df.as_matrix().astype("float")

In [34]:
# get predictions
predictions = model_dummy.predict(test_X)

In [35]:
df_submit = pd.DataFrame({'PassengerId':test_df.index, 'Survived': predictions})

In [36]:
df_submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [37]:
submit_path = os.path.join(os.pardir, "data", "external")
submit_file_path = os.path.join(submit_path, '01_submit.csv')

In [38]:
# index=False --> so that no extra column is added
df_submit.to_csv(submit_file_path, index=False)

In [39]:
def get_submission_file(model, filename):
    # create a test matrix with float values
    test_X = test_df.as_matrix().astype("float")
    print(test_X.shape)
    # make prediction on the model
    predictions = model.predict(test_X)
    # create a dataframe to submit
    df_submit = pd.DataFrame({'PassengerId':test_df.index, 'Survived': predictions})
    # define submit path and save the data frame in csv format
    submit_path = os.path.join(os.pardir, "data", "external")
    submit_file_path = os.path.join(submit_path, filename)
    # index=False --> so that no extra column is added
    df_submit.to_csv(submit_file_path, index=False)

In [40]:
get_submission_file(model_dummy, '01_submit.csv')

(418, 23)


# Logistic Regression Model

In [41]:
from sklearn.linear_model import LogisticRegression

In [42]:
model_LR = LogisticRegression(random_state=0)

In [43]:
model_LR.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [44]:
print("Accuracy Score of Logistic model is : {}".format(accuracy_score(y_test, model_LR.predict(X_test))))
print("Confusion Matrix of Logistic model is : {}".format(confusion_matrix(y_test, model_LR.predict(X_test))))
print("Recall of Logistic model is : {}".format(recall_score(y_test, model_LR.predict(X_test))))
print("Precision of Logistic model is : {}".format(precision_score(y_test, model_LR.predict(X_test))))

Accuracy Score of Logistic model is : 0.8268156424581006
Confusion Matrix of Logistic model is : [[95 15]
 [16 53]]
Recall of Logistic model is : 0.7681159420289855
Precision of Logistic model is : 0.7794117647058824


In [45]:
#model coefficients
model_LR.coef_

array([[ 7.81650586e-06, -2.49307794e-02,  2.96493024e-03,
         6.87694671e-01, -5.14981444e-01, -1.01230613e+00,
         1.17542227e+00,  3.78734185e-01, -4.84481717e-01,
         1.41459002e+00,  5.24768134e-01, -1.33632325e+00,
         9.07403321e-01, -4.40763490e-01,  8.45987132e-02,
         2.33569140e-01,  2.72189689e-01,  4.79317191e-01,
         4.57001852e-01,  4.67327840e-01,  1.45345042e-01,
         3.41361619e-01,  7.28313115e-01]])

In [46]:
# you can see that there are 22 coefficients and we had passed 22 parameters
print(len(model_LR.coef_[0]))

23


# 2nd Kaggle Submission

In [49]:
get_submission_file(model_LR, '02_LR.csv')

(418, 23)
