In [0]:
# For cancer dataset
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split 

In [0]:
def sigmoid(X, coeff):
  dot = np.dot(X, coeff)
  return 1.0/(1 + np.exp(-dot))

In [0]:
# Feature scaling
def mean(a):
  mean = float(np.mean(a))
  func = np.vectorize(lambda t: t - mean)
  return func(a)

def std_var(a):
  std = float(np.std(a))
  func = np.vectorize(lambda t: t / std)
  return func(a)

def feature_scaling(X):
  X = np.apply_along_axis(mean, 0, X)
  X = np.apply_along_axis(std_var, 0, X)
  return X

In [0]:
def normalizeX(X):
#   Change as per notation to get: w0x0 + w1x1 + w2x2 + ...
# i.e. add an extra feature vector x0 for bias
# Also do feature scaling
  
  shape = (X.shape[0], 1)
  
  newX = feature_scaling(X)
  newX = np.hstack((np.ones(shape), newX))
  
  return newX
  

In [0]:
#@title Default title text
# To find theta: Gradient descent
def gradient(X, y, theta):
  m = y.shape[0]
  sig = sigmoid(X, theta) 
  loss = sig - y
  grad =  1/m * np.dot(X.T, loss)
  return grad 

def cost_function(X, y, theta):
  sig = sigmoid(X, theta) 
  c1 = y * np.log(sig) 
  c2 = (1 - y) * np.log(1 - sig) 
  final = -c1 - c2 
  me = np.mean(final)
  return me

def gradient_descent(X, y, coeff, learning_rate = 0.01, min_cost_change = 0.001, max_epochs = 10000):
  epoch = 1 
  current_cost = cost_function(X, y, coeff) 

  while epoch < max_epochs:
    prev_cost = current_cost
    grad = gradient(X, y, coeff)
    coeff = coeff - (learning_rate * grad) 
    current_cost = cost_function(X, y, coeff) 
    cost_change = prev_cost - current_cost
#     print("cost_change:", cost_change)
    epoch += 1
  
  return coeff, epoch 

In [0]:
def predict_y(X, coeff): 
    pred_prob = sigmoid(X, coeff)
    pred_value = np.where(pred_prob >= 0.5, 1, 0) 
    return pred_value

In [0]:
if __name__ == "__main__":
  dataset = datasets.load_breast_cancer()
  X = dataset.data 
  y = dataset.target
  
#   dataset = datasets.load_iris()
#   X = dataset.data
#   y = dataset.target
  
  
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 
  
  print("X_train.shape: ", X_train.shape)
  print("y_train.shape: ", y_train.shape)
  
  X_train = normalizeX(X_train)
  y_train = y_train.reshape((-1,1))
  y_test = y_test.reshape((-1,1))
  print("New X_train.shape after normalization: ", X_train.shape)
  print("New y_train.shape after normalization: ", y_train.shape)
  
# #   Initial values
#   coeff = np.zeros((X_train.shape[1], 1))
#   coeff, epoch = gradient_descent(X_train, y_train, coeff)
#   print("\ncoeff.shape: ", coeff.shape)
#   print("coefficients: ", coeff)
#   print("epoch: ",epoch)
  

X_train.shape:  (398, 30)
y_train.shape:  (398,)
New X_train.shape after normalization:  (398, 31)
New y_train.shape after normalization:  (398, 1)


In [0]:
X_test = normalizeX(X_test)
y_pred = predict_y(X_test, coeff).reshape((-1,1))
# assert(y_pred.shape == (341,1))


In [0]:
y_true = y_test
def true_pos(y, y_pred):
    return np.sum((y == 1) & (y_pred == 1))

def true_neg(y, y_pred):
    return np.sum((y == 0) & (y_pred == 0))
  
accuracy = (true_pos(y_true, y_pred) + true_neg(y_true, y_pred))/len(y_true)
print("Accuracy:", accuracy)

Accuracy: 0.9590643274853801


In [0]:
from sklearn.linear_model import LogisticRegression

X2_train, X2_test, y2_train, y2_test = train_test_split(X, y, test_size=0.4) 
model = LogisticRegression(solver = 'lbfgs', max_iter=10000)
model.fit(X, y)
y2_pred = model.predict(X2_train)
accuracy2 = (true_pos(y2_train, y2_pred) + true_neg(y2_train, y2_pred))/len(y2_train)
print(accuracy2)

0.9648093841642229
