In [1]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal

def conf_matrix(y,pred):
  m = np.zeros(4)
  for i in range(len(y)):
    if y[i]==0 and pred[i]==0:
      m[0] += 1
    if y[i]==0 and pred[i]==1:
      m[1] += 1
    if y[i]==1 and pred[i]==0:
      m[2] += 1
    if y[i]==1 and pred[i]==1:
      m[3] += 1
  return m

In [2]:
data = pd.read_excel("/content/drive/MyDrive/NNFL Assignments (Aug 2021)/Assignment 1/data_q4_q5.xlsx")
data = data.sample(frac=1) #print(data.head())
y = np.array(data['diagnosis'], ndmin=1).T 
y = np.where(y=='M',1,0) # class labels
data.pop('diagnosis')
data.insert(0, "x0", pd.Series(np.ones(len(y)))) # appending ones
x = np.array(data) # feature matrix

In [3]:
m = len(y)
nf = 5 #number of folds
x_subsets = np.array_split(x, nf)
y_subsets = np.array_split(y, nf)

In [4]:
# LRT Classifier
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
np.seterr(divide='ignore', invalid='ignore')

accuracy_vals = []
sensitivity_vals = []
specificity_vals = []

for fold in range(nf):
  # test-train split
  x_test = x_subsets[fold]
  y_test = y_subsets[fold]

  x_train = np.concatenate(np.delete(x_subsets, fold, 0), axis=0)
  y_train = np.concatenate(np.delete(y_subsets, fold, 0), axis=0)

  m_train = len(y_train)
  m_test = len(y_test)

  # normalizing input data
  pp = np.amax(np.abs(x_train), axis=0)
  x_train = x_train/pp
  x_test = x_test/pp

  # data for likelihood
  x_class0_train = x_train[np.where(y_train==0)]
  x_class1_train = x_train[np.where(y_train==1)]

  y_class0_train = y_train[np.where(y_train==0)]
  y_class1_train = y_train[np.where(y_train==1)]

  # likelihood modeled by multivariate gaussian
  pdfXgiven0 = multivariate_normal.pdf(x_test, np.mean(x_class0_train, axis=0), np.cov(x_class0_train, rowvar=False), allow_singular=True)
  pdfXgiven1 = multivariate_normal.pdf(x_test, np.mean(x_class1_train, axis=0), np.cov(x_class1_train, rowvar=False), allow_singular=True)

  pofy0 = len(y_class0_train)/m_train
  pofy1 = len(y_class1_train)/m_train
  
  ratio = pofy1/pofy0
  delta = pdfXgiven0/pdfXgiven1
  
  # prediction
  pred = []
  for i in range(m_test):
    if delta[i]>ratio:
      pred.append(0)
    else:
      pred.append(1)

  # performance measures
  tn,fp,fn,tp = conf_matrix(y_test, pred)
  sensitivity = tp/(tp+fn)
  specificity = tn/(tn+fp)
  accuracy = (tp+tn)/(tp+tn+fp+fn)

  accuracy_vals.append(accuracy)
  sensitivity_vals.append(sensitivity)
  specificity_vals.append(specificity)

print("mean accuracy of LRT classifier = {}".format(np.mean(accuracy_vals)))
print("mean sensitivity of LRT classifier = {}".format(np.mean(sensitivity_vals)))
print("mean specificity of LRT classifier = {}".format(np.mean(specificity_vals)))

mean accuracy of LRT classifier = 0.9595404440304302
mean sensitivity of LRT classifier = 0.9575546165933542
mean specificity of LRT classifier = 0.962544061302682
