In [None]:
import pandas as pd
import numpy as np
# Load the raw data. In Colab the CSV must be uploaded once per session; the
# upload prompt is skipped when the file is already present so this cell can be
# re-run (Restart & Run All) without asking for the file again.
import os
from google.colab import files
DATA_FILE = 'Buying_decisions_data.csv'
uploaded = {} if os.path.exists(DATA_FILE) else files.upload()
df = pd.read_csv(DATA_FILE)

# Take an explicit copy of the feature columns: the frame is mutated below
# (insert / new column / del), and mutating a slice of `df` triggers pandas'
# SettingWithCopyWarning and has ambiguous view-vs-copy semantics.
X = df[['Gender', 'Age', 'EstimatedSalary']].copy()
y = df['Purchased']

# Bias/intercept column of ones as the first feature.
m = X.shape[0]
a = np.ones((m,1))
X.insert(loc = 0, column = 'Ones', value = a.ravel())  # insert a 1-D column

# Encode gender as a single numeric indicator: 1 if male, 0 if female.
# Any other value maps to NaN (same outcome as leaving the row unassigned).
X['Gender_Male'] = X['Gender'].map({'Male': 1.0, 'Female': 0.0})
del X['Gender']

#Comment out the lines below if you want to test non-feature scaling runtime and accuracy
# Standardise Age and EstimatedSalary to zero mean and unit (sample) standard
# deviation; the statistics are kept in module-level names for later reuse.
age_std = X['Age'].std()
age_ave = X['Age'].mean()
sala_std = X['EstimatedSalary'].std()
sala_ave = X['EstimatedSalary'].mean()
X['Age'] = (X['Age'] - age_ave) / age_std
X['EstimatedSalary'] = (X['EstimatedSalary'] - sala_ave) / sala_std
print(X)



Saving Buying_decisions_data.csv to Buying_decisions_data.csv
     Ones       Age  EstimatedSalary  Gender_Male
0     1.0 -1.779569        -1.488183          1.0
1     1.0 -0.253270        -1.458854          1.0
2     1.0 -1.111813        -0.784307          0.0
3     1.0 -1.016419        -0.373714          0.0
4     1.0 -1.779569         0.183521          1.0
..    ...       ...              ...          ...
395   1.0  0.796060        -0.842964          0.0
396   1.0  1.273028        -1.370870          1.0
397   1.0  1.177635        -1.458854          0.0
398   1.0 -0.157877        -1.077589          1.0
399   1.0  1.082241        -0.989604          0.0

[400 rows x 4 columns]


In [None]:
from sklearn.model_selection import train_test_split
# Hold out a test split (fixed seed for reproducibility) and convert each of
# the four resulting pieces to a plain NumPy array in a single step.
X_train, X_test, y_train, y_test = (
    part.to_numpy() for part in train_test_split(X, y, random_state=42)
)

def sigmoid(x):
  """Logistic function 1 / (1 + e^(-x)), applied element-wise.

  Args:
    x: A real scalar or NumPy array.

  Returns:
    The logistic of ``x``; for an array input the result has the same shape.
  """
  decay = np.exp(-x)
  return 1 / (1 + decay)

def hPredict(theta, X):
  """Logistic-regression hypothesis: sigmoid(X @ theta).

  Args:
    theta: Parameter array; its leading dimension must match the number of
      columns of ``X`` (matrix-product rules of ``np.matmul`` apply).
    X: Feature matrix with one example per row.

  Returns:
    The element-wise sigmoid of the matrix product ``X @ theta``.
  """
  # The previous version also built an unused nested Python list sized by X
  # and theta; that was dead work and has been removed.
  return sigmoid(np.matmul(X, theta))

def cost_function(X, y, theta, m):
  """Average binary cross-entropy (log-loss) of the logistic model.

  Args:
    X: Feature matrix, one example per row.
    y: Array of 0/1 labels; reshaped internally to a column vector.
    theta: Parameter array passed to ``hPredict``.
    m: Number of examples used to normalise the summed loss.

  Returns:
    The summed loss over rows divided by ``m``. With a column-vector
    hypothesis this is an array of shape (1,).
  """
  y = y.reshape(y.shape[0], 1)
  H = hPredict(theta, X)
  # Keep predictions strictly inside (0, 1): a saturated sigmoid returns
  # exactly 0.0 or 1.0, and log(0) would turn the cost into inf/nan.
  eps = np.finfo(float).eps
  H = np.clip(H, eps, 1 - eps)
  # Vectorised column sum (axis=0) instead of the Python builtin sum(), which
  # iterated over the rows one by one; the result shape is unchanged.
  return np.sum(y * np.log(H) + (1 - y) * np.log(1 - H), axis=0) / (-m)

def gradient_descent(theta, X, y, alpha, m):
  """Perform ONE batch gradient-descent update of ``theta``.

  Args:
    theta: Current parameter array.
    X: Feature matrix, one example per row.
    y: 0/1 labels, either flat (m,) or as a column vector (m, 1).
    alpha: Learning rate.
    m: Number of examples used to average the gradient.

  Returns:
    The updated parameters, with the same shape as ``theta``.
  """
  H = hPredict(theta, X).reshape(-1)
  # Flatten the labels as well: subtracting a (m, 1) column from the flat
  # (m,) predictions would broadcast to an (m, m) matrix instead of a
  # per-example residual.
  diff = H - np.asarray(y).reshape(-1)
  grad = np.matmul(np.transpose(X), diff).reshape(theta.shape)
  return theta - (alpha / m) * grad

def train(X, y, theta, alfa, m, num_iter, log_every=200):
  """Fit ``theta`` by running ``num_iter`` gradient-descent updates.

  Args:
    X: Feature matrix, one example per row.
    y: 0/1 labels for the rows of ``X``.
    theta: Initial parameter array.
    alfa: Learning rate passed to ``gradient_descent``.
    m: Number of examples (used for averaging gradient and cost).
    num_iter: Number of update steps to perform.
    log_every: Print the current cost every ``log_every`` iterations
      (default 200, the previously hard-coded interval). Pass 0 or None to
      disable logging.

  Returns:
    The parameters after the final update.
  """
  for i in range(num_iter):
    theta = gradient_descent(theta, X, y, alfa, m)
    if log_every and i % log_every == 0:
      print("Cost: ", cost_function(X, y, theta, m))
  return theta

def predict(X, theta, threshold = 0.5):
  """Turn model probabilities into hard 0/1 class labels.

  Args:
    X: Feature matrix, one example per row.
    theta: Trained parameter array.
    threshold: Probabilities at or above this value become 1, the rest 0.

  Returns:
    The array returned by ``hPredict`` with its entries overwritten by 1 / 0.
  """
  labels = hPredict(theta, X)
  is_positive = labels >= threshold
  labels[is_positive] = 1
  # Evaluated after the positives were written, exactly like the two
  # sequential masked assignments it replaces.
  labels[labels < threshold] = 0
  return labels

def score(y1, y2):
  """Percentage of predictions that agree with the true 0/1 labels.

  Args:
    y1: The correct answers (0/1 array).
    y2: The labels calculated by the model (0/1 array).

  Returns:
    Accuracy as a percentage in the range 0-100.
  """
  n_samples = y1.shape[0]
  truth = y1.reshape(n_samples, 1)
  guess = y2.reshape(y2.shape[0], 1)

  # An entry is 1 when both labels are 0 (first term) or both are 1 (second
  # term), i.e. a correct prediction; it is 0 for a wrong prediction.
  both_zero = (1 - truth).reshape(n_samples, 1) * (1 - guess).reshape(n_samples, 1)
  both_one = truth * guess
  agreement = both_zero + both_one

  n_correct = np.count_nonzero(agreement == 1)
  return (n_correct / n_samples) * 100

# Problem dimensions and hyper-parameters for the training run.
m = X_train.shape[0]  # number of training examples (rows)
n = X_train.shape[1]  # number of feature columns, including the 'Ones' bias column
theta = np.zeros((n, 1))  # start from an all-zero parameter column vector
num_iter = 8000  # number of gradient-descent updates
alpha = 0.1  # learning rate

import timeit

# Time the whole pipeline: training, prediction on the held-out set, scoring
# and printing the accuracy all happen before the timer is stopped.
start = timeit.default_timer()
opt_theta = train(X_train, y_train, theta, alpha, m, num_iter)
y_ = predict(X_test, opt_theta)  # hard 0/1 labels at the default 0.5 threshold
print("Accuracy: ", score(y_test, y_))  # score() returns a percentage
stop = timeit.default_timer()

print('Time: ', stop - start)  # elapsed wall-clock seconds

Cost:  [0.67999471]
Cost:  [0.38619149]
Cost:  [0.37740623]
Cost:  [0.37544211]
Cost:  [0.3747799]
Cost:  [0.37451843]
Cost:  [0.37440808]
Cost:  [0.37436009]
Cost:  [0.37433891]
Cost:  [0.37432948]
Cost:  [0.37432527]
Cost:  [0.37432338]
Cost:  [0.37432253]
Cost:  [0.37432215]
Cost:  [0.37432198]
Cost:  [0.3743219]
Cost:  [0.37432187]
Cost:  [0.37432185]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Cost:  [0.37432184]
Accuracy:  88.0
Time:  2.3215680409999777
