In [None]:
import pandas as pd
import sklearn
from sklearn.metrics import confusion_matrix
import math

class NaiveClassifer:
  """Naive Bayes classifier over 0/1 feature columns and a 0/1 ``Label`` column.

  Both datasets are read from CSV files. Constructing an instance trains it
  immediately; ``predict`` then scores the test file and prints metrics.

  Attributes:
    traincsv: path of the training CSV.
    testcsv: path of the test CSV.
    traincols: names of the feature columns to use.
    yprobs: ``[P(Y=0), P(Y=1)]`` estimated from the training labels.
    condprob0: feature name -> ``[P(X=0|Y=0), P(X=1|Y=0)]``.
    condprob1: feature name -> ``[P(X=0|Y=1), P(X=1|Y=1)]``.
  """
  # Class-level defaults. __init__ rebinds every one of them, so the mutable
  # containers here are never shared between instances.
  traincsv = ""
  testcsv = ""
  traincols = []
  yprobs = []
  condprob0 = {}
  condprob1 = {}

  def __init__(self, traincsv, testcsv, traincols):
    """Store the configuration, reset the learned tables and train.

    Args:
      traincsv: path of the training CSV (needs ``Label`` and every feature column).
      testcsv: path of the test CSV read by ``predict``.
      traincols: list of feature column names.
    """
    self.traincsv = traincsv
    self.testcsv = testcsv
    self.traincols = traincols
    self.condprob0 = {}
    self.condprob1 = {}
    self.yprobs = []
    self.train()

  @staticmethod
  def _log(p):
    """Natural log that maps a zero probability to -inf instead of raising."""
    return math.log(p) if p > 0 else float("-inf")

  def train(self):
    """Estimate the class priors and the smoothed conditional probabilities.

    Fills ``yprobs``, ``condprob0`` and ``condprob1`` and prints the estimate
    of P(Y=1).

    Raises:
      ValueError: if the training file contains no rows.
    """
    train = pd.read_csv(self.traincsv)
    if len(train) == 0:
      raise ValueError(f"training file {self.traincsv!r} contains no rows")

    labels = train["Label"]
    class_masks = [labels == 0, labels == 1]
    # Counting through a boolean mask gives 0 for a class that never occurs,
    # where value_counts()[k] would raise KeyError.
    class_counts = [int(mask.sum()) for mask in class_masks]

    #calculate the approx values of P(Y=0) and P(Y=1)
    py_0 = class_counts[0] / len(train)
    py_1 = class_counts[1] / len(train)

    print(f'The probability for P(Y=1) is {py_1}')

    self.yprobs = [py_0, py_1]

    #calculate the conditional probabilties using the MAP estimate, and save in the dictionaries
    # (add-one smoothing: +1 on the count, +2 on the class size)
    for col in self.traincols:
      values = train[col]
      tables = []
      for mask, n_class in zip(class_masks, class_counts):
        tables.append([
          (int(((values == x) & mask).sum()) + 1) / (n_class + 2)
          for x in (0, 1)
        ])
      self.condprob0[col], self.condprob1[col] = tables

  def predict(self):
    """Classify every row of the test CSV and print evaluation metrics.

    Each row is scored by summing the log prior and the log conditional
    probability of every feature value that is 0 or 1; the class with the
    higher score wins and ties go to class 1. Features listed in
    ``traincols`` but missing from the test file are ignored.

    Prints the confusion matrix and accuracy. When the test file has a
    ``Demographic`` column it also prints, for D=0 and D=1, the smoothed rate
    of positive predictions ("parity") and the smoothed rate of correct
    predictions (labelled "calibration" in the output).

    Returns:
      The test DataFrame with an added ``prediction`` column.
    """
    test = pd.read_csv(self.testcsv)

    log_prior0 = self._log(self.yprobs[0])
    log_prior1 = self._log(self.yprobs[1])

    # One entry per usable feature: its column values plus the log tables,
    # computed once instead of once per row.
    features = []
    for col in self.traincols:
      if col in test.columns:
        features.append((
          test[col].to_numpy(),
          [math.log(p) for p in self.condprob0[col]],
          [math.log(p) for p in self.condprob1[col]],
        ))

    #using the values in the dictionary, add the log transformed conditional probabilties to find which
    # y value has the higher probability, and choose this y-value as the prediction
    predictions = []
    for i in range(len(test)):
      p_y0 = log_prior0
      p_y1 = log_prior1
      for values, log0, log1 in features:
        value = values[i]
        # anything other than 0 or 1 (e.g. a missing value) contributes nothing
        if value == 0 or value == 1:
          idx = int(value)
          p_y0 += log0[idx]
          p_y1 += log1[idx]
      predictions.append(0 if p_y0 > p_y1 else 1)

    test["prediction"] = predictions

    #print confusion matrix and accuracy
    cm = confusion_matrix(test["Label"], predictions)

    print(cm)
    print(f'Accuracy is {sklearn.metrics.accuracy_score(test["Label"], predictions)}')

    if "Demographic" not in test.columns:
      # The fairness figures need the group column; report instead of raising KeyError.
      print("No 'Demographic' column in the test data; skipping parity and calibration.")
      return test

    predicted_positive = test["prediction"] == 1
    predicted_correctly = test["prediction"] == test["Label"]
    parity = []
    calibration = []
    for group in (0, 1):
      in_group = test["Demographic"] == group
      group_size = int(in_group.sum())
      parity.append((int((in_group & predicted_positive).sum()) + 1) / (group_size + 2))
      calibration.append((int((in_group & predicted_correctly).sum()) + 1) / (group_size + 2))

    #calculating parity:
    d0, d1 = parity
    print(f"Parity Calculation: When D=0: {d0}. When D=1: {d1}.")

    #calculating calibration:
    c0, c1 = calibration
    print(f"Calibration Calculation: When D=0: {c0}. When D=1: {c1}.")

    return test

In [None]:
# Load the simple dataset and train a Naive Bayes classifier on features x1 and x2.
# simpletest / simpletrain are not referenced again in the cells shown here; the
# classifier reads both CSV files itself from the paths it is given.
simpletest = pd.read_csv("/content/simple-test.csv")
simpletrain = pd.read_csv("/content/simple-train.csv")

simpleclassifier = NaiveClassifer("/content/simple-train.csv", "/content/simple-test.csv", ['x1', 'x2'])

The probability for P(Y=1) is 0.5


In [None]:
# Score the simple test file; predict() prints its metrics and returns the test
# frame with a "prediction" column added.
predictions = simpleclassifier.predict()

In [None]:
# Give the ancestry test data the training data's column names and save the
# renamed copy to the path the classifiers below read from.
ancestry = pd.read_csv("/content/ancestry-train.csv")
ancestrytest = pd.read_csv("/content/ancestry-test.csv")
# NOTE(review): assumes the test CSV has the same columns, in the same order, as
# the training CSV — confirm against the raw files.
ancestrytest.columns = ancestry.columns
# index=False keeps the row index out of the file, so re-reading it does not
# produce an extra "Unnamed: 0" column.
ancestrytest.to_csv("/content/newancestry.csv", index=False)

In [None]:
# Train Naive Bayes on the 20 ancestry features (col0 .. col19) and evaluate it on
# the renamed test file. This rebinds `ancestry`, which held the training
# DataFrame in the previous cell.
ancestry = NaiveClassifer("/content/ancestry-train.csv", "/content/newancestry.csv", ['col0', 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
       'col9', 'col10', 'col11', 'col12', 'col13', 'col14', 'col15', 'col16',
       'col17', 'col18', 'col19'])
results = ancestry.predict()

In [None]:
# Train Naive Bayes on the 22 heart features (A.1 .. E.4) and evaluate it on the
# heart test file.
heart = NaiveClassifer("/content/heart-train.csv", "/content/heart-test.csv",['A.1', 'A.2', 'A.3', 'A.4', 'B.1', 'B.2', 'B.3', 'B.4', 'B.5', 'C.1',
       'C.2', 'C.3', 'C.4', 'C.5', 'D.1', 'D.2', 'D.3', 'D.4', 'E.1', 'E.2',
       'E.3', 'E.4'])
heartpredict = heart.predict()

The probability for P(Y=1) is 0.5
[[ 10   5]
 [ 42 130]]
Accuracy is 0.7486631016042781
Parity Calculation: When D=0: 0.5106382978723404. When D=1: 0.7847222222222222.
Calibration Calculation: When D=0: 0.5957446808510638. When D=1: 0.7916666666666666.


In [None]:
# Train Naive Bayes on the 19 movie columns of the small Netflix training file and
# evaluate it on the Netflix test file.
netflixclassifier = NaiveClassifer("/content/netflix-small-train.csv", "/content/netflix-test.csv", ['3 Idiots', 'Bourne Identity', 'Bruce Almighty', 'Forest Gump',
       'How to Lose a Guy in 10 Days', 'I Robot', 'Independence Day',
       'La Vita E Bella', 'Lord of the Rings', 'Oceans 11', 'Patriot',
       'Pearl Harbor', 'Pirates', 'Pulp Fiction', 'Rat Race', 'Shrek',
       'Star Wars', 'What Women Want', 'When Harry Met Sally'])
netflixpredict = netflixclassifier.predict()

The probability for P(Y=1) is 0.59
[[265   0]
 [234   1]]
Accuracy is 0.532
Parity Calculation: When D=0: 0.003787878787878788. When D=1: 0.008333333333333333.
Calibration Calculation: When D=0: 0.5492424242424242. When D=1: 0.5125.


In [None]:
# Display the learned [P(X=0|Y=0), P(X=1|Y=0)] table for every movie.
# NOTE(review): the saved output below shows whole numbers (e.g. 16) as the first
# entry of each pair; train() as written stores a smoothed ratio there, so that
# output appears to come from an earlier version of the code — re-run to refresh.
netflixclassifier.condprob0

{'3 Idiots': [16, 0.6046511627906976],
 'Bourne Identity': [10, 0.7441860465116279],
 'Bruce Almighty': [18, 0.5581395348837209],
 'Forest Gump': [16, 0.6046511627906976],
 'How to Lose a Guy in 10 Days': [21, 0.4883720930232558],
 'I Robot': [15, 0.627906976744186],
 'Independence Day': [19, 0.5348837209302325],
 'La Vita E Bella': [15, 0.627906976744186],
 'Lord of the Rings': [6, 0.8372093023255814],
 'Oceans 11': [9, 0.7674418604651163],
 'Patriot': [13, 0.6744186046511628],
 'Pearl Harbor': [16, 0.6046511627906976],
 'Pirates': [7, 0.813953488372093],
 'Pulp Fiction': [14, 0.6511627906976745],
 'Rat Race': [23, 0.4418604651162791],
 'Shrek': [20, 0.5116279069767442],
 'Star Wars': [8, 0.7906976744186046],
 'What Women Want': [23, 0.4418604651162791],
 'When Harry Met Sally': [31, 0.2558139534883721]}

In [None]:
def getpsetanswers(classifer):
  """Print summary figures for a trained Naive Bayes classifier.

  Reads the training CSV named by ``classifer.traincsv`` and prints, in order:
  the P(Y=1) estimate, P(X_i=1|Y=1) for every feature, the last three feature
  names when ranked by the ratio computed below, and the full ratio table.

  Args:
    classifer: object exposing ``traincsv``, ``yprobs`` and ``condprob1``
      (feature name -> [P(X=0|Y=1), P(X=1|Y=1)]).
  """
  training = pd.read_csv(classifer.traincsv)
  given_y1 = classifer.condprob1

  print(f'The estimate for P(Y=1) is {classifer.yprobs[1]}.')

  print("Values of P(X_i=1|Y=1) for all values of i:")
  for feature, probs in given_y1.items():
    print(f'      {feature}: {probs[1]}')

  # ratio = (P(X=1|Y=1) * #rows with X=0) / (P(X=0|Y=1) * #rows with X=1)
  relprobs = {}
  for feature, probs in given_y1.items():
    column = training[feature]
    rows_with_0 = int((column == 0).sum())
    rows_with_1 = int((column == 1).sum())
    relprobs[feature] = (probs[1] * rows_with_0) / (probs[0] * rows_with_1)

  print("The top three indicative features were:")
  ranked = sorted(relprobs.keys(), key=lambda name: relprobs[name])
  print(ranked[-3:])
  print(relprobs)


In [None]:
# Report the Naive Bayes figures for the heart model. This needs `heart` to be the
# NaiveClassifer from above: a later cell rebinds `heart` to a LogisticRegression,
# which has no condprob1 table.
getpsetanswers(heart)

The estimate for P(Y=1) is 0.5.
Values of P(X_i=1|Y=1) for all values of i:
      A.1: 0.4523809523809524
      A.2: 0.2619047619047619
      A.3: 0.38095238095238093
      A.4: 0.3333333333333333
      B.1: 0.35714285714285715
      B.2: 0.19047619047619047
      B.3: 0.40476190476190477
      B.4: 0.4523809523809524
      B.5: 0.2857142857142857
      C.1: 0.38095238095238093
      C.2: 0.3333333333333333
      C.3: 0.38095238095238093
      C.4: 0.5714285714285714
      C.5: 0.30952380952380953
      D.1: 0.14285714285714285
      D.2: 0.3333333333333333
      D.3: 0.21428571428571427
      D.4: 0.16666666666666666
      E.1: 0.2619047619047619
      E.2: 0.3333333333333333
      E.3: 0.38095238095238093
      E.4: 0.47619047619047616
The top three indicative features were:
['D.3', 'D.4', 'C.4']
{'A.1': 1.4527736131934033, 'A.2': 1.8287841191066998, 'A.3': 1.7289377289377288, 'A.4': 1.8529411764705885, 'B.1': 1.296296296296296, 'B.2': 1.6470588235294117, 'B.3': 1.9104761904761904, '

In [None]:
import math
import numpy as np
import sklearn
from sklearn.metrics import confusion_matrix
import pandas as pd

class LogisticRegression:
  """Binary logistic regression fitted by batch gradient ascent.

  Both datasets are read from CSV files that contain a 0/1 ``Label`` column.
  A constant column named ``theta_naught`` is added to the data so the
  intercept is learned like any other weight. Constructing an instance trains
  it immediately.

  Attributes:
    parameters: list of weights, one per entry of ``cols`` (intercept last).
    traincsv: path of the training CSV.
    testcsv: path of the test CSV.
    cols: feature column names followed by ``"theta_naught"``.
    lr: learning rate applied to the summed gradient.
    N: number of gradient steps.
    accuracy: accuracy of the most recent ``predict`` call (0 before that).
  """
  # Class-level defaults; __init__ rebinds every one of them per instance.
  parameters = []
  traincsv = ""
  testcsv = ""
  cols = []
  lr = 0
  N = 1000
  accuracy = 0

  # initialize the parameters, and add the col of 1s
  def __init__(self, train, test, col, lr, iters):
    """Store the configuration, zero the weights and train.

    Args:
      train: path of the training CSV.
      test: path of the test CSV.
      col: list of feature column names; it is copied, not modified.
      lr: learning rate.
      iters: number of gradient steps.
    """
    self.traincsv = train
    self.testcsv = test

    # Build a new list: appending to `col` itself would leave "theta_naught" in
    # the caller's list and break any second model built from the same list.
    self.cols = list(col) + ["theta_naught"]
    self.parameters = [0] * len(self.cols)

    self.accuracy = 0
    self.lr = lr
    self.N = iters
    self.train()

  #sigmoid function for later use
  def sigmoid(self, x):
    """Return 1 / (1 + e^-x) as a float without overflowing for large |x|."""
    if x >= 0:
      return 1 / (1 + math.exp(-x))
    # For negative x, e^-x can exceed the float range; e^x / (1 + e^x) is the
    # same value and only ever exponentiates a negative number.
    ex = math.exp(x)
    return ex / (1 + ex)

  def train(self):
    """Fit ``parameters`` with ``N`` steps of batch gradient ascent.

    Every step uses all training rows: the gradient for each weight is the sum
    over rows of ``feature * (Label - sigmoid(row . parameters))``, and the
    weights move by ``lr`` times that sum.
    """
    traindf = pd.read_csv(self.traincsv)

    # add a column of 1s to make calculations for theta_0 more straightforward
    traindf["theta_naught"] = [1]*len(traindf)
    features = traindf[self.cols].to_numpy(dtype=float)
    labels = traindf["Label"].to_numpy(dtype=float)
    theta = np.asarray(self.parameters, dtype=float)

    for _ in range(self.N):
      # One dot product per row per step; the weights are fixed within a step,
      # so this value is shared by every feature's gradient term.
      predicted = np.array([self.sigmoid(score) for score in features @ theta])
      gradient = features.T @ (labels - predicted)

      # update the parameters given the gradient calculated above
      theta = theta + self.lr * gradient

    self.parameters = theta.tolist()

  def predict(self):
    """Classify every row of the test CSV and print evaluation metrics.

    A row is labelled 0 when its sigmoid output is below 0.5 and 1 otherwise.
    Prints the confusion matrix and accuracy and stores the accuracy in
    ``self.accuracy``.

    Returns:
      The test DataFrame with added ``theta_naught`` and ``prediction`` columns.
    """
    test = pd.read_csv(self.testcsv)
    test["theta_naught"] = [1]*len(test)

    # using the previously calculated parameters, calcualte P(Y=y|x) using the sigmoid function
    scores = test[self.cols].to_numpy(dtype=float) @ np.asarray(self.parameters, dtype=float)

    # if prob is less than 0.5, val is 0 and if greater than 0.5, val is 1
    predictions = [0 if self.sigmoid(score) < 0.5 else 1 for score in scores]

    test["prediction"] = predictions

    #print confusion matrix and accuracy
    cm = confusion_matrix(test["Label"], predictions)
    self.accuracy = sklearn.metrics.accuracy_score(test["Label"], predictions)

    print(cm)
    print(f'Accuracy is {self.accuracy}')
    return test

In [None]:
# Train logistic regression on the simple dataset (learning rate 0.0001, 1000 steps).
# Rebinds `simpleclassifier`, which previously held the Naive Bayes model.
simpleclassifier = LogisticRegression("/content/simple-train.csv", "/content/simple-test.csv", ['x1', 'x2'], 0.0001, 1000)

In [None]:
# Score the simple test file with the logistic model; prints the confusion matrix and accuracy.
result = simpleclassifier.predict()

[[2 0]
 [0 2]]
Accuracy is 1.0


In [None]:
# Show the fitted weights, in the order x1, x2, theta_naught (the intercept is last).
simpleclassifier.parameters

[0.09759347169504434, -0.0011685590390117962, -0.002356868930802323]

In [None]:
# Train logistic regression on the 19 movie columns (learning rate 0.00625, 100 steps).
# Rebinds `netflixclassifier`, which previously held the Naive Bayes model.
netflixclassifier = LogisticRegression("/content/netflix-small-train.csv", "/content/netflix-test.csv", ['3 Idiots', 'Bourne Identity', 'Bruce Almighty', 'Forest Gump',
       'How to Lose a Guy in 10 Days', 'I Robot', 'Independence Day',
       'La Vita E Bella', 'Lord of the Rings', 'Oceans 11', 'Patriot',
       'Pearl Harbor', 'Pirates', 'Pulp Fiction', 'Rat Race', 'Shrek',
       'Star Wars', 'What Women Want', 'When Harry Met Sally'], 0.00625, 100)

In [None]:
# Score the Netflix test file with the logistic model; prints the confusion matrix and accuracy.
netflixresults = netflixclassifier.predict()

[[139 126]
 [ 43 192]]
Accuracy is 0.662


In [None]:
def newpsetans(logistic):
  """Print summary figures for a trained logistic regression model.

  Prints the last three feature names when the non-intercept weights are
  sorted in ascending order, then the training-set log-likelihood with all
  weights set to zero and with the trained weights.

  Args:
    logistic: object exposing ``cols`` (feature names with the intercept
      column last), ``parameters``, ``traincsv`` and ``sigmoid``.
  """
  print("The three most indicative features were:")

  # pair every feature (intercept excluded) with its weight
  weight_of = dict(zip(logistic.cols[:-1], logistic.parameters[:-1]))
  by_weight = sorted(weight_of, key=weight_of.get)
  print(by_weight[-3:])

  train = pd.read_csv(logistic.traincsv)
  train["theta_naught"] = [1]*len(train)
  traincut = train[logistic.cols]
  zero_weights = [0] * len(logistic.cols)

  def log_likelihood(weights):
    # sum over rows of y*log(p) + (1-y)*log(1-p), with p = sigmoid(weights . row)
    total = 0
    for row_number in range(len(traincut)):
      label = train.loc[row_number, "Label"]
      prob = logistic.sigmoid(np.dot(weights, traincut.iloc[row_number]))
      total += label * np.log(prob) + (1 - label) * np.log(1 - prob)
    return total

  ll0 = log_likelihood(zero_weights)
  lla = log_likelihood(logistic.parameters)

  print(f'Log Liklihood with parameters of 0: {ll0}.')
  print(f'Log Liklihood with trained parameters: {lla}.')



In [None]:
# Report the top-weighted movies and the log-likelihoods for the Netflix logistic model.
newpsetans(netflixclassifier)

The three most indicative features were:
['Rat Race', 'How to Lose a Guy in 10 Days', 'When Harry Met Sally']
Log Liklihood with parameters of 0: -69.31471805599459.
Log Liklihood with trained parameters: -44.53409947084053.


In [None]:
# testing out different learning rates and how they fare
for i in range(0,5):
  lr  = 0.1 / 10**i
  heart = LogisticRegression("/content/heart-train.csv", "/content/heart-test.csv",['A.1', 'A.2', 'A.3', 'A.4', 'B.1', 'B.2', 'B.3', 'B.4', 'B.5', 'C.1',
       'C.2', 'C.3', 'C.4', 'C.5', 'D.1', 'D.2', 'D.3', 'D.4', 'E.1', 'E.2',
       'E.3', 'E.4'],lr,1000)
  heartpredict = heart.predict()
  print(f'Learning Rate is {lr}, accuracy is {heart.accuracy}.')

[[ 13   2]
 [ 57 115]]
Accuracy is 0.6844919786096256
Learning Rate is 0.1, accuracy is 0.6844919786096256.
[[ 13   2]
 [ 52 120]]
Accuracy is 0.7112299465240641
Learning Rate is 0.01, accuracy is 0.7112299465240641.
[[ 12   3]
 [ 39 133]]
Accuracy is 0.7754010695187166
Learning Rate is 0.001, accuracy is 0.7754010695187166.
[[  9   6]
 [ 29 143]]
Accuracy is 0.8128342245989305
Learning Rate is 0.0001, accuracy is 0.8128342245989305.
[[  7   8]
 [  4 168]]
Accuracy is 0.9358288770053476
Learning Rate is 1e-05, accuracy is 0.9358288770053476.


In [None]:
# Train logistic regression on the 20 ancestry features (learning rate 0.0001,
# 1000 steps) and evaluate it on the renamed test file. Rebinds `ancestry` and
# `results`, which earlier held the Naive Bayes model and its results.
ancestry = LogisticRegression("/content/ancestry-train.csv", "/content/newancestry.csv", ['col0', 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
       'col9', 'col10', 'col11', 'col12', 'col13', 'col14', 'col15', 'col16',
       'col17', 'col18', 'col19'],0.0001,1000)
results = ancestry.predict()

[[98 11]
 [31 44]]
Accuracy is 0.7717391304347826


In [None]:
# Report the top-weighted features and the log-likelihoods for the ancestry logistic model.
newpsetans(ancestry)

The three most indicative features were:
['col12', 'col6', 'col10']
Log Liklihood with parameters of 0: -196.1606520984649.
Log Liklihood with trained parameters: -105.42099699171574.
