# Logistic Regression

##### Andrew Cachia, Nov 2018

In [135]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

## Data Exploration and Normalisation

In [151]:
titanic_data_csv = pd.read_csv('Titanic Dataset/train.csv')
titanic_data_csv.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [149]:
def transform_data(dataset):
    transformed_dataset = dataset[['Survived','Pclass','Sex','SibSp','Parch','Cabin']]
    
    ## Replace female and male with binary values
    transformed_dataset['Sex'].replace(['female','male'],[0,1],inplace=True)

    ## Rather than cabin numbers, determine instead whether the person had a cabin or not
    transformed_dataset.loc[~transformed_dataset['Cabin'].isnull(),'Cabin'] = 1 
    transformed_dataset.loc[transformed_dataset['Cabin'].isnull(),'Cabin'] = 0

    ## Apply mean normalization to age and fair between 1 and 0
    transformed_dataset[['Age', 'Fare']] = dataset[['Age', 'Fare']].apply(lambda x: (x - x.min()) / x.std())

    ## Split embarking into 3 binary columns
    transformed_dataset[['Embarked_C', 'Embarked_Q', 'Embarked_S']] = pd.get_dummies(dataset['Embarked'])

    transformed_dataset=transformed_dataset.dropna()
    
    return transformed_dataset


titanic_data = transform_data(titanic_data_csv)
titanic_data.head(10)

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Cabin,Age,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,1,0,0,1.485561,0.145895,0,0,1
1,1,1,0,1,0,1,2.586997,1.434461,1,0,0
2,1,3,0,0,0,0,1.76092,0.159478,0,0,1
3,1,1,0,1,0,1,2.380477,1.068552,0,0,1
4,0,3,1,0,0,0,2.380477,0.161993,0,0,1
6,0,1,1,0,0,1,3.688432,1.043649,0,0,1
7,0,3,1,3,1,0,0.108767,0.4241,0,0,1
8,1,3,0,0,2,0,1.82976,0.22404,0,0,1
9,1,2,0,1,0,0,0.934843,0.605126,1,0,0
10,1,3,0,1,1,1,0.246446,0.336061,0,0,1


## Gradient Descent Algorithm

In [138]:
Y = pd.to_numeric(titanic_data['Survived'])
X = titanic_data.loc[:, titanic_data.columns != 'Survived']
Q = np.zeros(X.shape[1])

alpha = 0.1
error_threshold = 0.5

In [139]:
def sigmoid(X, Q):
    Z = np.dot(X, Q)
    return 1 / (1 + np.exp(-Z))

$$ -ylog(\hat y) - ((1-y)log(1-\hat y)) $$

In [140]:
def cost_function(actual_y, predicted_y):
    return -np.dot(actual_y,np.log(predicted_y)) - (np.dot((1-actual_y),np.log(1-predicted_y))).mean()

$$ (\hat y - y)x $$

In [141]:
def gradient(predicted_y, actual_y, x):
    return np.dot((predicted_y - actual_y),x) / actual_y.shape[0]

In [142]:
count=0

while count < 30000:
    count += 1
    Predicted = sigmoid(X,Q)
    cost = cost_function(Y,Predicted)
    error = gradient(Predicted, Y, X)
    Q -= alpha * error

In [143]:
## Resulting parameter weights after training
Q

array([-0.78433948, -2.62045702, -0.34068938, -0.08215087,  0.78802917,
       -0.52375614,  0.10166118,  4.22821864,  3.3544208 ,  3.86370876])

### Comparing with sklearn toolkit implementation

In [144]:
model = LogisticRegression(C=1e20)

model.fit(X, Y)

model.coef_



array([[-1.01353688, -2.64606728, -0.36291884, -0.06678579,  0.5444916 ,
        -0.63375472,  0.04116372, -4.1201468 , -4.94374704, -4.49740869]])

## Testing

In [150]:
titanic_test_data_csv = pd.read_csv('Titanic Dataset/train.csv')
titanic_test_data = transform_data(titanic_test_data_csv)

test_parameters = titanic_test_data.loc[:, titanic_data.columns != 'Survived']
expected = titanic_test_data['Survived']

actual = sigmoid(test_parameters,Q)

result = pd.DataFrame(np.vstack((expected.values, actual))).T

result.head(20)

Unnamed: 0,0,1
0,0.0,0.098527
1,1.0,0.935962
2,1.0,0.64671
3,1.0,0.91595
4,0.0,0.087857
5,0.0,0.359209
6,0.0,0.097278
7,1.0,0.601284
8,1.0,0.86884
9,1.0,0.855816


### Statistics

In [146]:
actual[actual >= 0.5] = 1
actual[actual < 0.5] = 0

accuracy = np.sum(expected == actual) / actual.shape[0] * 100
conf_matrix = confusion_matrix(expected, actual)
precision = conf_matrix[0,0] / (conf_matrix[0,0] + conf_matrix[0,1]) * 100

print("Accuracy: {0:.2f}%".format(accuracy))
print("Precision: {0:.2f}%".format(precision))

Accuracy: 79.69%
Precision: 85.38%


In [147]:
scikit_predicted = model.predict(test_parameters)

scikit_predicted[scikit_predicted >= 0.5] = 1
scikit_predicted[scikit_predicted < 0.5] = 0

accuracy = np.sum(expected == scikit_predicted) / scikit_predicted.shape[0] * 100
conf_matrix = confusion_matrix(expected, scikit_predicted)
precision = conf_matrix[0,0] / (conf_matrix[0,0] + conf_matrix[0,1]) * 100

print("Scikit Accuracy: {0:.2f}%".format(accuracy))
print("Scikit Precision: {0:.2f}%".format(precision))

Scikit Accuracy: 80.25%
Scikit Precision: 85.61%
