In [41]:
# Question 3
import numpy as np
import matplotlib.pyplot as plt

class LR_miniSGD:
  def __init__(self, learn_rate = 0.01, batch_size = 30, max_iterat = 1000, weights = None):
    self.learn_rate = learn_rate
    self.batch_size = batch_size
    self.max_iterat = max_iterat
    self.weights = weights

  #sigmoid function
  def sigmoid(self, z):
    return 1 / (1 + np.exp(-z))

  #negative log-likelihood
  def loss(self, X, y):
    m = X.shape[0]
    predict = self.sigmoid(np.dot(X, self.weights))
    loss = -np.mean(y*np.log(predict + (1e-10)) + (1-y)*np.log(1-predict + (1e-10)))
    return loss

  # training model based on mini-batch SGD
  def fit(self, X, y):
    features = X.shape[1]
    if self.weights is None:
      self.weights = np.random.randn(features)

    #shuffling data before each epoch
    for iteration in range(self.max_iterat):
      indices = np.arange(X.shape[0])
      np.random.shuffle(indices)
      X = X[indices]
      y = y[indices]

    for start in range(0, X.shape[0], self.batch_size):
      end = start + self.batch_size
      X_batch = X[start:end]
      y_batch = y[start:end]

      predict = self.sigmoid(np.dot(X_batch, self.weights))
      gradient = np.dot(X_batch.T, (predict - y_batch)) / X_batch.shape[0]

      self.weights -= self.learn_rate *gradient

    #predicting the prob estimates for binary class
  def predict_prob(self, X):
    return self.sigmoid(np.dot(X, self.weights))

    #predict binary class labels

  def predict(self, X, threshold = 0.5):
    prob = self.predict_prob(X)
    return (prob >= threshold).astype(int)



In [21]:
# Question 4

# part a
import pandas as pd
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer(return_X_y=True)
X, y = data
df = pd.DataFrame(X, columns = load_breast_cancer().feature_names)
df['Diagnostics'] = y
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Diagnostics
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [22]:
# part b
from sklearn.model_selection import train_test_split

train_data, temp_data = train_test_split(df, test_size=0.25, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.4, random_state=42)

In [25]:
# part c
train_data['Diagnostics'].value_counts()
# for train data Diagnostic class 1 count is 268 and class 0 is 158
val_data['Diagnostics'].value_counts()
# for validation data Diagnostic class 1 count is 52 and class 0 is 33



Unnamed: 0_level_0,count
Diagnostics,Unnamed: 1_level_1
1,52
0,33


In [80]:
# part d
from sklearn.preprocessing import StandardScaler

X_train = train_data.drop(columns = ['Diagnostics']).to_numpy()
y_train = train_data['Diagnostics'].to_numpy()

X_val = val_data.drop(columns = ['Diagnostics']).to_numpy()
y_val = val_data['Diagnostics'].to_numpy()

X_test = test_data.drop(columns = ['Diagnostics']).to_numpy()
y_test = test_data['Diagnostics'].to_numpy()

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

model = LR_miniSGD(learn_rate = 0.0001, batch_size = 50, max_iterat = 1000)
model.fit(X_train_scaled, y_train)

In [81]:
from sklearn.metrics import accuracy_score

y_val_pred = model.predict(X_val_scaled)
test_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy

0.7529411764705882

The accuracy seems to improve with a smaller learning rate or a larger batch size. The batch size also cannot be too large or the model performs worse.

In [86]:
# part e

def acc_score(y_true, y_pred):
  return np.mean(y_true == y_pred)

def prec_score(y_true, y_pred):
  tru_pos = np.sum((y_true == 1) & (y_pred == 1))
  fal_pos = np.sum((y_true == 0) & (y_pred == 1))
  return tru_pos / (tru_pos + fal_pos) if (tru_pos + fal_pos) > 0 else 0.0

def recall_score(y_true, y_pred):
  tru_pos = np.sum((y_true == 1) & (y_pred == 1))
  fal_neg = np.sum((y_true == 1) & (y_pred == 0))
  return tru_pos / (tru_pos + fal_neg) if (tru_pos + fal_neg) > 0 else 0.0

def f1_score(y_true, y_pred):
  prec = prec_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  return 2 * (prec * recall) / (prec + recall) if (prec + recall) > 0 else 0.0

y_test_pred = model.predict(X_test_scaled)

test_accuracy = acc_score(y_test, y_test_pred)

test_precision = prec_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(test_precision)
print(test_recall)
print(test_f1)
print(test_accuracy)

0.8571428571428571
0.8108108108108109
0.8333333333333334
0.7931034482758621


Part f
The model had a low false positive rate. It predicted class 1 out of all the times 85.71% of the time. There was alos a high recall value, suggesting a higher false negative rate which may not be good. This can be modified based on circumstances and if we would prefer a higher false negative or false positive. The F1 score is relatively close to 1 indiciating the model is performing well. The accuracy is also showing that most times there was correct classification.