In [1]:
import data_analysis_tools as da

def _encode_features(row):
    """Turn one CSV row into a (is_male, col 2, col 3) tuple of floats.

    Column 1 is mapped to 1.0 when it equals 'Male' and 0.0 otherwise;
    columns 2 and 3 are converted with float().
    """
    is_male = 1.0 if row[1] == 'Male' else 0.0
    return (is_male, float(row[2]), float(row[3]))


def _encode_label(row):
    """Return column 4 of the row converted to float."""
    return float(row[4])


data = da.file_ops.read_csv('./files/Social_Network_Ads.csv', as_array=True)

# The first row carries the column names. Its leading entry is dropped so the
# printed header list only names the columns that are used below.
headers = data.pop(0)
headers.pop(0)

# Hold out 20% of the rows for evaluation.
train, test = da.random.split_data(data, test_proportion=0.2)

train_features = [_encode_features(row) for row in train]
train_labels = [_encode_label(row) for row in train]

test_features = [_encode_features(row) for row in test]
test_labels = [_encode_label(row) for row in test]

print('Headers:', headers)
print(train_features[:2], train_labels[:2])

Headers: ['Gender', 'Age', 'EstimatedSalary', 'Purchased']
[(0.0, 38.0, 80000.0), (0.0, 41.0, 51000.0)] [0.0, 0.0]


In [2]:
import numpy as np
# Convert each split's feature tuples to a NumPy array and pass it through da.rescale.
# NOTE(review): the test split is rescaled by its own da.rescale call. If da.rescale
# derives its scaling statistics from the array it is given, train and test end up on
# slightly different scales — confirm, and consider reusing the training statistics.
rescaled_train_features = da.rescale(np.array(train_features))
rescaled_test_features = da.rescale(np.array(test_features))

In [3]:
# Fit the library's multi-linear-regression classifier on the rescaled training
# features, then display the fitted coefficient vector.
# NOTE(review): alpha is presumably the learning rate — confirm against da.ml.
linear_regression_classifier = da.ml.MultiLinearRegressionClassifier(alpha=0.001)
linear_regression_classifier.train(rescaled_train_features, train_labels)
linear_regression_classifier.beta

[-0.0015992382379250768, 0.27131125397294176, 0.14078417118638878]

In [4]:
# Spot-check a single held-out sample: print its true label next to the raw
# (un-thresholded) output of the linear model.
test_index = 4

print(test_labels[test_index], linear_regression_classifier.predict(rescaled_test_features[test_index]))

1.0 0.08210378182544134


In [5]:
# Cut-off applied to the raw regression output: strictly above it counts as class 1.
treshold = 0
predicted = [
    1 if linear_regression_classifier.predict(test_i) > treshold else 0
    for test_i in rescaled_test_features
]

# Number of test samples whose thresholded prediction matches the true label.
counter = sum(
    1
    for predicted_i, actual_i in zip(predicted, test_labels)
    if predicted_i == actual_i
)

accuracy = counter / len(predicted)
print('With Multi Linear Regression | Accuracy for treshold', treshold, 'is:', accuracy)

With Multi Linear Regression | Accuracy for treshold 0 is: 0.8125


In [6]:
# logistic regression
import math
# the logistic function
def logistic(x):
    """Return the logistic (sigmoid) function 1 / (1 + e^-x) evaluated at x.

    For very negative x the intermediate e^-x no longer fits in a float and
    math.exp raises OverflowError; the mathematical limit in that case is 0,
    so 0.0 is returned instead of letting the error propagate. All inputs
    that did not overflow produce exactly the same value as before.
    """
    try:
        return 1.0 / (1 + math.exp(-x))
    except OverflowError:
        # exp(-x) only overflows when x is hugely negative, where the
        # sigmoid is indistinguishable from 0.
        return 0.0
        

def logistic_prime(x):
    """Derivative of the logistic function at x, i.e. s * (1 - s) with s = logistic(x)."""
    s = logistic(x)
    return s * (1 - s)

def logistic_log_likelihood_i(xi, yi, beta):
    """Log-likelihood contribution of one observation.

    With p = logistic(xi . beta), returns log(p) when yi == 1 and
    log(1 - p) for any other label. math.log raises ValueError if the
    selected probability is exactly 0.
    """
    p = logistic(da.dot_product(xi, beta))
    return math.log(p if yi == 1 else 1.0 - p)
        

def logistic_log_likelihood(x, y, beta):
    """Total log-likelihood of the data set: the per-observation terms summed over zip(x, y)."""
    total = 0
    for xi, yi in zip(x, y):
        total += logistic_log_likelihood_i(xi, yi, beta)
    return total

def logistic_log_partial_ij(xi, yi, beta, j):
    """Partial derivative of one observation's log-likelihood w.r.t. beta[j].

    Computed as (yi - logistic(xi . beta)) * xi[j].
    """
    residual = yi - logistic(da.dot_product(xi, beta))
    return residual * xi[j]

def logistic_log_gradient_i(xi, yi, beta):
    """Gradient of one observation's log-likelihood w.r.t. beta.

    Returns a list with one entry per coefficient; entry j equals
    (yi - logistic(xi . beta)) * xi[j].
    """
    # The residual does not depend on j, so it is evaluated once up front.
    residual = yi - logistic(da.dot_product(xi, beta))
    return [residual * xi[j] for j in range(len(beta))]

def logistic_log_gradient(x, y, beta):
    """Gradient of the total log-likelihood w.r.t. beta.

    The total log-likelihood is the sum of the per-observation terms, so its
    gradient is the component-wise sum of the per-observation gradients.
    The previous version returned the un-summed list of per-observation
    gradients (one list per sample), which is not a vector of len(beta)
    and cannot be used as a gradient step.

    Returns a list of len(beta) floats; all zeros when there is no data.
    """
    total = [0.0] * len(beta)
    for xi, yi in zip(x, y):
        for j, partial_j in enumerate(logistic_log_gradient_i(xi, yi, beta)):
            total[j] += partial_j
    return total


In [7]:
# applying the model
from functools import partial

# Seed before drawing the random starting point so the fit is repeatable.
# NOTE(review): da.r is presumably the generator da.random.random() draws from — confirm.
da.r.seed(0)

# Objective and gradient for batch maximisation. Both must be bound to the SAME
# (rescaled) training data, and the gradient must be the gradient function —
# previously gradient_fn wrapped logistic_log_likelihood over the un-rescaled features.
fn = partial(logistic_log_likelihood, rescaled_train_features, train_labels)
gradient_fn = partial(logistic_log_gradient, rescaled_train_features, train_labels)

# Random starting coefficients, one per feature column.
beta_0 = [da.random.random() for _ in range(len(train_features[0]))]

# Batch alternative, kept for reference:
# beta_hat = da.maximize_batch(target_fn=fn, gradient_fn=gradient_fn, theta_0=beta_0, tolerance=0.01)
beta_hat = da.maximize_stochastic(logistic_log_likelihood_i, logistic_log_gradient_i, rescaled_train_features, train_labels, beta_0)
beta_hat

[0.035363317434165854, 1.996122825446562, 1.158748732539555]

In [8]:
def predict(xi, beta_hat):
    """Return logistic(beta_hat . xi) for the feature vector xi."""
    score = da.dot_product(beta_hat, xi)
    return logistic(score)

# Probability cut-off: strictly above it counts as class 1.
treshold = 0.67
predicted = [
    1 if predict(test_i, beta_hat) > treshold else 0
    for test_i in rescaled_test_features
]

# Number of test samples whose thresholded prediction matches the true label.
counter = sum(
    1
    for predicted_i, actual_i in zip(predicted, test_labels)
    if predicted_i == actual_i
)

accuracy = counter / len(predicted)
print('With Logistic Regression | Accuracy for treshold', treshold, 'is:', accuracy)

With Logistic Regression | Accuracy for treshold 0.67 is: 0.8


In [9]:
# goodness of fit

def test(x, y, treshold=0.5):
    """returns (tp, fp, fn, tn)

    Scores every sample in x with predict() using the module-level beta_hat
    and compares it with the label in y. A sample is called positive when
    its score is >= treshold; a label is positive when it equals 1.
    """
    tp = fp = fn = tn = 0
    for xi, yi in zip(x, y):
        called_positive = predict(xi, beta_hat) >= treshold
        is_positive = yi == 1

        if is_positive and called_positive:
            tp += 1
        elif is_positive:
            fn += 1
        elif called_positive:
            fp += 1
        else:
            tn += 1

    return tp, fp, fn, tn

# Confusion counts on the held-out split at the 0.67 cut-off.
# Note: this rebinds the module-level name `fn`, which an earlier cell bound
# to the log-likelihood partial.
tp, fp, fn, tn = test(rescaled_test_features, test_labels, 0.67)

# Report each summary metric computed from the four confusion counts.
for label, metric in (
    ('Accuracy:', da.ml.accuracy),
    ('Precision:', da.ml.precision),
    ('Recall:', da.ml.recall),
    ('F1-Score:', da.ml.f1_score),
):
    print(label, metric(tp, fp, fn, tn))

Accuracy: 0.8
Precision: 0.7307692307692307
Recall: 0.6785714285714286
F1-Score: 0.7037037037037038


In [10]:
# Same model, this time via the library's packaged classifier with default settings.
classifier = da.ml.LogisticRegressionClassifier()

# Fit on the rescaled training split.
classifier.train(rescaled_train_features, train_labels)

# Display the fitted coefficients for comparison with the hand-rolled beta_hat.
classifier.beta_hat

[0.03632142578079057, 2.0005713203441586, 1.160171728602713]

In [11]:
# Evaluate the packaged classifier on the held-out split at the 0.67 cut-off.
# NOTE(review): the result is presumably ordered (tp, fp, fn, tn) like the
# hand-written test() — confirm against da.ml.
classifier.test(rescaled_test_features, test_labels, 0.67)

(19, 7, 9, 45)

In [12]:
# Support Vector Machines
