#CS460G – Machine Learning
Assignment # 2
Perceptron and Naïve Bayes


1. Implement Perceptron algorithm from scratch (preferably in python) and test your code on a tiny dataset of
your choice. You might want to break down the full algorithm into small functions.```



In [6]:
import numpy as np

def perceptron_fit(X, y, learning_rate=0.01, max_iterations=1000):
    weights = np.zeros(X.shape[1])
    bias = 0

    for _ in range(max_iterations):
        for i in range(X.shape[0]):
            prediction = perceptron_predict(X[i], weights, bias)
            if y[i] * prediction <= 0:
                weights += learning_rate * y[i] * X[i]
                bias += learning_rate * y[i]

    return weights, bias

def perceptron_predict(x, weights, bias):
    return np.sign(np.dot(weights, x) + bias)

def calculate_accuracy(X, y, weights, bias):
    predictions = [perceptron_predict(x, weights, bias) for x in X]
    correct_predictions = sum([1 for i in range(len(y)) if y[i] == predictions[i]])
    return correct_predictions / len(y) * 100

# Sample dataset (AND gate)
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([-1, -1, -1, 1])

# Train the Perceptron and get weights and bias
trained_weights, trained_bias = perceptron_fit(X, y, learning_rate=0.1, max_iterations=100)
print("Trained Weights: ", trained_weights)
print("Trained Bias: ", trained_bias)
# Test the Perceptron
test_data = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
predictions = [perceptron_predict(x, trained_weights, trained_bias) for x in test_data]
accuracy = calculate_accuracy(X, y, trained_weights, trained_bias)
print("Test Data:")
print(test_data)
print("Predictions:")
print(predictions)
print("Accuracy:", accuracy, "%")

Trained Weights:  [0.2 0.1]
Trained Bias:  -0.20000000000000004
Test Data:
[[0 0]
 [0 1]
 [1 0]
 [1 1]]
Predictions:
[-1.0, -1.0, -1.0, 1.0]
Accuracy: 100.0 %


Implement Naïve Bayes algorithm from scratch (preferably in python) with the explicit assumption about the
dataset that it is drawn from a Gaussian distribution and test your code on a tiny dataset of your choice. You
might want to break down the full algorithm into small functions.

In [32]:
from math import sqrt
from math import pi
from math import exp

def separate_by_class(dataset):
  separated = dict()
  for i in range(len(dataset)):
    vector = dataset[i]
    class_value = vector[-1]
    if (class_value not in separated):
      separated[class_value] = list()
    separated[class_value].append(vector)
  return separated

def mean(numbers):
  return sum(numbers)/float(len(numbers))

def stdev(numbers):
  avg = mean(numbers)
  variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
  return sqrt(variance)

def summarize_dataset(dataset):
  summaries = [(mean(feature), stdev(feature), len(feature))
              for feature in zip(*dataset)]
  del(summaries[-1]) # remove stats for calss label
  return summaries

def summarize_by_class(dataset):
  separated = separate_by_class(dataset)
  summaries = dict()
  for class_value, rows in separated.items():
    summaries[class_value] = summarize_dataset(rows)
  return summaries

def calculate_probability(x, mean, stdev):
  exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
  return (1 / (sqrt(2 * pi) * stdev)) * exponent

def calculate_class_probabilities(summaries, row):
  total_rows = sum([summaries[label][0][2] for label in summaries])
  probabilities = dict()
  for class_value, class_summaries in summaries.items():
    probabilities[class_value] = \
    summaries[class_value][0][2]/float(total_rows)
    for i in range(len(class_summaries)):
      mean, stdev, count = class_summaries[i]
      probabilities[class_value] *= calculate_probability(row[i],mean,stdev)
  return probabilities

def naive_bayes(train, test):
    separated = separate_by_class(train)
    summaries = summarize_by_class(train)
    probabilities = calculate_class_probabilities(summaries, test)
    class_probabilities = {}  # Dictionary to store class probabilities
    for class_value, probability in probabilities.items():
        if class_value not in class_probabilities:
            class_probabilities[class_value] = []
        class_probabilities[class_value].append(probability)
    return class_probabilities



In [33]:
# Test separating data by class
dataset = [[3.393533211,2.331273381,0],
[3.110073483,1.781539638,0],
[1.343808831,3.368360954,0],
[3.582294042,4.67917911,0],
[2.280362439,2.866990263,0],
[7.423436942,4.696522875,1],
[5.745051997,3.533989803,1],
[9.172168622,2.511101045,1],
[7.792783481,3.424088941,1],
[7.939820817,0.791637231,1]]

test_data = [3.393533211, 2.331273381]

separated = separate_by_class(dataset)
for label in separated:
  print(label)
  for row in separated[label]:
    print(row)

summary = summarize_dataset(dataset)
print(summary)

#Let's test our step 3
summary = summarize_by_class(dataset)
for label in summary:
  print(label)
  for row in summary[label]:
    print(row)


# Test Gaussian PDF
print(calculate_probability(1.0, 1.0, 1.0))
print(calculate_probability(2.0, 1.0, 1.0))
print(calculate_probability(0.0, 1.0, 1.0))

#testing point
test = dataset[0] # first training example
probabilities = calculate_class_probabilities(summary, test)
print("The probabilities are: ", probabilities)

predicted_class = naive_bayes(dataset, test_data)
print("Predicted Class:", predicted_class)

class_probabilities = naive_bayes(dataset, test_data)

# Calculate and print the average probabilities for each class
for class_value, probabilities in class_probabilities.items():
    average_probability = sum(probabilities) / len(probabilities)
    print(f"Average Probability for Class {class_value}: {average_probability:.4f}")

0
[3.393533211, 2.331273381, 0]
[3.110073483, 1.781539638, 0]
[1.343808831, 3.368360954, 0]
[3.582294042, 4.67917911, 0]
[2.280362439, 2.866990263, 0]
1
[7.423436942, 4.696522875, 1]
[5.745051997, 3.533989803, 1]
[9.172168622, 2.511101045, 1]
[7.792783481, 3.424088941, 1]
[7.939820817, 0.791637231, 1]
[(5.178333386499999, 2.7665845055177263, 10), (2.9984683241, 1.218556343617447, 10)]
0
(2.7420144012, 0.9265683289298018, 5)
(3.0054686692, 1.1073295894898725, 5)
1
(7.6146523718, 1.2344321550313704, 5)
(2.9914679790000003, 1.4541931384601618, 5)
0.3989422804014327
0.24197072451914337
0.24197072451914337
The probabilities are:  {0: 0.05032427673372076, 1: 0.00011557718379945765}
Predicted Class: {0: [0.05032427673372076], 1: [0.00011557718379945765]}
Average Probability for Class 0: 0.0503
Average Probability for Class 1: 0.0001


3. Adapt your Perceptron implementation for the IRIS (https://archive.ics.uci.edu/ml/datasets/iris) dataset.
Please recall that perceptron algorithm can only perform binary classification where IRIS has 3 classes. So, you
might consider formulating the problem as one vs rest classification and develop multiple classifiers. 50 pts
  1. Split your dataset into 3 sets (training/validation/test) and report your results for Accuracy, Precision,
Recall, F1-score (micro/macro/weighted). Feel free to use any library that provides off-the-shelf
implementation.

In [55]:
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# 80/10/10
totalSamples = len(X)
trainSamples = int(0.8 * totalSamples)
valSamples = int(0.1 * totalSamples)


XTrain, yTrain = X[:trainSamples], y[:trainSamples]
XVal, yVal = X[trainSamples:trainSamples + valSamples], y[trainSamples:trainSamples + valSamples]
XTest, yTest = X[trainSamples + valSamples:], y[trainSamples + valSamples:]

# Function to perform Perceptron training and prediction
def perceptronFitPredict(XTrain, yTrain, XVal, yVal, learningRate=0.01, maxIterations=1000):
    weights = []
    biases = []
    classes = np.unique(yTrain)

    for c in classes:
        binaryY = np.where(yTrain == c, 1, -1)
        weightsC, biasC = perceptronFit(XTrain, binaryY, learningRate, maxIterations)
        weights.append(weightsC)
        biases.append(biasC)

    # Predict using the trained models
    predictions = []
    for i in range(len(XVal)):
        scores = []
        for c in classes:
            scores.append(perceptronPredict(XVal[i], weights[c], biases[c]))
        predictedClass = classes[np.argmax(scores)]
        predictions.append(predictedClass)

    return predictions

def perceptronFit(X, y, learningRate=0.01, maxIterations=1000):
    weights = np.zeros(X.shape[1])
    bias = 0

    for _ in range(maxIterations):
        for i in range(X.shape[0]):
            prediction = perceptronPredict(X[i], weights, bias)
            if y[i] * prediction <= 0:
                weights += learningRate * y[i] * X[i]
                bias += learningRate * y[i]

    return weights, bias

def perceptronPredict(x, weights, bias):
    return np.sign(np.dot(weights, x) + bias)

# Train and predict using the Perceptron for multi-class classification
predictedVal = perceptronFitPredict(XTrain, yTrain, XVal, yVal, learningRate=0.01, maxIterations=1000)

# Calculate and print the metrics for validation set
accuracy = accuracy_score(yVal, predictedVal)
precisionMicro = precision_score(yVal, predictedVal, average='micro')
precisionMacro = precision_score(yVal, predictedVal, average='macro')
recallMicro = recall_score(yVal, predictedVal, average='micro')
recallMacro = recall_score(yVal, predictedVal, average='macro')
f1Micro = f1_score(yVal, predictedVal, average='micro')
f1Macro = f1_score(yVal, predictedVal, average='macro')

print("Validation Results:")
print("Accuracy:", accuracy)
print("Precision (micro):", precisionMicro)
print("Precision (macro):", precisionMacro)
print("Recall (micro):", recallMicro)
print("Recall (macro):", recallMacro)
print("F1-score (micro):", f1Micro)
print("F1-score (macro):", f1Macro)

# Test the model on the test set
predictedTest = perceptronFitPredict(XTrain, yTrain, XTest, yTest, learningRate=0.01, maxIterations=1000)

# Calculate and print the metrics for the test set
accuracyTest = accuracy_score(yTest, predictedTest)
precisionMicroTest = precision_score(yTest, predictedTest, average='micro')
precisionMacroTest = precision_score(yTest, predictedTest, average='macro')
recallMicroTest = recall_score(yTest, predictedTest, average='micro')
recallMacroTest = recall_score(yTest, predictedTest, average='macro')
f1MicroTest = f1_score(yTest, predictedTest, average='micro')
f1MacroTest = f1_score(yTest, predictedTest, average='macro')

print("\nTest Results:")
print("Accuracy:", accuracyTest)
print("Precision (micro):", precisionMicroTest)
print("Precision (macro):", precisionMacroTest)
print("Recall (micro):", recallMicroTest)
print("Recall (macro):", recallMacroTest)
print("F1-score (micro):", f1MicroTest)
print("F1-score (macro):", f1MacroTest)

  _warn_prf(average, modifier, msg_start, len(result))


Validation Results:
Accuracy: 0.8
Precision (micro): 0.8
Precision (macro): 0.5
Recall (micro): 0.8
Recall (macro): 0.4
F1-score (micro): 0.8000000000000002
F1-score (macro): 0.4444444444444445

Test Results:
Accuracy: 1.0
Precision (micro): 1.0
Precision (macro): 1.0
Recall (micro): 1.0
Recall (macro): 1.0
F1-score (micro): 1.0
F1-score (macro): 1.0


Adapt your Naïve Bayes implementation for the spam
(https://archive.ics.uci.edu/ml/datasets/sms+spam+collection) dataset. 30 pts
1. Use k-fold validation where k=5. Report your results for Accuracy, Precision, Recall, F1-score
(micro/macro/weighted). Feel free to use any library that provides off-the-shelf implementation.


In [39]:
# Naive Bayes On The Iris Dataset
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
# you have seen below code already


In [40]:
import pandas as pd

spamFile = pd.read_csv('SMSSpamCollection', sep = '\t', header = None, names = ['Label', 'Message'])
print (spamFile.shape)
spamFile.head()


(5572, 2)


Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [41]:
spamFile['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [42]:
#want to split 80/10/10
firstSplit = round(len(spamFile) * 0.8)
secondSplit = round(len(spamFile)* 0.10) + firstSplit

trainingData = spamFile[:firstSplit].reset_index(drop=True)
validationData = spamFile[firstSplit:secondSplit].reset_index(drop=True)
print(trainingData.shape)
print(validationData.shape)

trainingData['Label'].value_counts(normalize=True)
trainingData['Label'].value_counts(normalize=True)

#get rid of punctuation
trainingData['Message'] = trainingData['Message'].str.replace(
   '\W', ' ') # Removes punctuation
trainingData['Message'] = trainingData['Message'].str.lower()
trainingData.head(3)


(4458, 2)
(557, 2)


  trainingData['Message'] = trainingData['Message'].str.replace(


Unnamed: 0,Label,Message
0,ham,go until jurong point crazy available only ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...


In [43]:
trainingData['Message'] = trainingData['Message'].str.split()
vocabulary = []
for sms in trainingData['Message']:
   for word in sms:
      vocabulary.append(word)

vocabulary = list(set(vocabulary))
print (vocabulary)



In [44]:
print("the length of our vocab is:", len(vocabulary))

the length of our vocab is: 7813


In [45]:
wordCountPerMessage = {uniqueWord: [0] * len(trainingData['Message']) for uniqueWord in vocabulary}

for index, sms in enumerate(trainingData['Message']):
   for word in sms:
      wordCountPerMessage[word][index] += 1

wordCounts = pd.DataFrame(wordCountPerMessage)
wordCountsClean = pd.concat([trainingData, wordCounts], axis=1)
wordCountsClean.head()

Unnamed: 0,Label,Message,kittum,remind,trade,corporation,sharing,baig,evening,bribe,...,ploughing,digi,dammit,dontmatter,leadership,trackmarque,08000839402,end,str,randy
0,ham,"[go, until, jurong, point, crazy, available, o...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[ok, lar, joking, wif, u, oni]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,"[u, dun, say, so, early, hor, u, c, already, t...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[nah, i, don, t, think, he, goes, to, usf, he,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# Isolating spam and ham messages first
spamMessages = wordCountsClean[wordCountsClean['Label'] == 'spam']
hamMessages = wordCountsClean[wordCountsClean['Label'] == 'ham']

# P(Spam) and P(Ham)
pSpam = len(spamMessages) / len(wordCountsClean)
pHam = len(hamMessages) / len(wordCountsClean)

# N_Spam
nWordsSpamMessage = spamMessages['Message'].apply(len)
nSpam = nWordsSpamMessage.sum()

# N_Ham
nWordsHamMessage = hamMessages['Message'].apply(len)
nHam = nWordsHamMessage.sum()

# N_Vocabulary
nVocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1

# Initiate parameters
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
   n_word_given_spam = spamMessages[word].sum() # spam_messages already defined
   p_word_given_spam = (n_word_given_spam + alpha) / (nSpam + alpha*nVocabulary)
   parameters_spam[word] = p_word_given_spam

   n_word_given_ham = hamMessages[word].sum() # ham_messages already defined
   p_word_given_ham = (n_word_given_ham + alpha) / (nHam + alpha*nVocabulary)
   parameters_ham[word] = p_word_given_ham

In [52]:
import re

def classify(message):
   '''
   message: a string
   '''

   message = re.sub('\W', ' ', message)
   message = message.lower().split()

   p_spam_given_message = pSpam
   p_ham_given_message = pHam

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham:
         p_ham_given_message *= parameters_ham[word]

   print('P(Spam|message):', p_spam_given_message)
   print('P(Ham|message):', p_ham_given_message)

   if p_ham_given_message > p_spam_given_message:
      print('Label: Ham')
   elif p_ham_given_message < p_spam_given_message:
      print('Label: Spam')
   else:
      print('Equal probability ')

In [53]:
#testing
classify("You won a new phone!!! Congrats")
classify("I got the email you sent, looks great")

P(Spam|message): 8.024304510719332e-17
P(Ham|message): 6.79845452530546e-18
Label: Spam
P(Spam|message): 2.5430745037618846e-27
P(Ham|message): 7.546719346356956e-22
Label: Ham


In [54]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

nFolds = 5
data = validationData.reset_index(drop=True)  # Reset the index for easier splitting

accuracies = []
precisions = []
recalls = []
f1Micros = []
f1Macros = []
f1Weighteds = []

foldSize = len(data) // nFolds

for fold in range(nFolds):
    startIdx = fold * foldSize
    endIdx = (fold + 1) * foldSize if fold < nFolds - 1 else len(data)

    testData = data[startIdx:endIdx].copy()  # Make a copy to avoid the SettingWithCopyWarning
    trainData = data.drop(index=testData.index).copy()  # Make a copy of the training data

    # Train your classifier on trainData and make predictions on testData
    trainData['predicted'] = trainData['Message'].apply(classify_test_set)
    testData['predicted'] = testData['Message'].apply(classify_test_set)

    precision = precision_score(testData['Label'], testData['predicted'], average='weighted')
    recall = recall_score(testData['Label'], testData['predicted'], average='weighted')
    accuracy = accuracy_score(testData['Label'], testData['predicted'])
    f1Micro = f1_score(testData['Label'], testData['predicted'], average='micro')
    f1Macro = f1_score(testData['Label'], testData['predicted'], average='macro')
    f1Weighted = f1_score(testData['Label'], testData['predicted'], average='weighted')

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1Micros.append(f1Micro)
    f1Macros.append(f1Macro)
    f1Weighteds.append(f1Weighted)

averagePrecision = sum(precisions) / nFolds
averageRecall = sum(recalls) / nFolds
averageF1Micro = sum(f1Micros) / nFolds
averageF1Macro = sum(f1Macros) / nFolds
averageF1Weighted = sum(f1Weighteds) / nFolds
averageAccuracy = sum(accuracies) / nFolds

print("The 5 accuracies are: ", accuracies)
print('Average Accuracy: ', averageAccuracy)
print('Average Precision:', averagePrecision)
print('Average Recall:', averageRecall)
print('Average F1 Micro:', averageF1Micro)
print('Average F1 Macro:', averageF1Macro)
print('Average F1 Weighted:', averageF1Weighted)


The 5 accuracies are:  [0.972972972972973, 0.9819819819819819, 0.990990990990991, 0.9819819819819819, 0.9911504424778761]
Average Accuracy:  0.9838156740811608
Average Precision: 0.9838278336046544
Average Recall: 0.9838156740811608
Average F1 Micro: 0.9838156740811608
Average F1 Macro: 0.9619170594734122
Average F1 Weighted: 0.9837078744444578
