In [552]:
# Cell 1: Imports

!wget https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# from google.colab import files
# uploaded = files.upload()
english = [i.strip() for i in open('english.txt') if i.strip()]
dutch = [i.strip() for i in open('dutch.txt') if i.strip()]

--2025-04-30 04:51:38--  https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘spambase.data.86’

spambase.data.86        [  <=>               ] 686.47K  1.69MB/s    in 0.4s    

2025-04-30 04:51:38 (1.69 MB/s) - ‘spambase.data.86’ saved [702942]



In [553]:
# Cell 2: Perceptron

# influenced by lecture 07, slide 27 pseudocode
class Perceptron:
    def __init__(self, max_iter=10):
        self.max_iter = max_iter #iterations
        self.w = None # weights
        self.b = 0 # bias

    def train(self, D, Y):
        n, d = D.shape
        self.w = np.zeros(d) # w ← 0 for all d
        self.b = 0 # b ← 0

        for iter in range(1, self.max_iter + 1):
            for i in range(n): # for all (x, y) in D
                x, y = D[i], Y[i]
                a = np.dot(self.w, x) + self.b # activation
                if y * a <= 0: # if y*a is <= 0 then update weights and bias
                    self.w += y * x
                    self.b += y

    def predict(self, D): # returns trained model
        return np.sign(np.dot(D, self.w) + self.b)

In [554]:
# Cell 3: spambase data

data = np.loadtxt('spambase.data', delimiter=',')

# split into features (X) and labels (y)
X = data[:, :-1]
y = np.where(data[:, -1] == 0, -1, 1)  # convert 0 to -1

# 80% training set for D, and rest is dev and test (10% each)
D_80, D_20, Y_80, Y_20 = train_test_split(X, y, test_size =0.2) # could add a state for consistent results

# 10% dev, 10% test sets
D_dev, D_test, Y_dev, Y_test = train_test_split(D_20, Y_20, test_size=0.5)

# prints shapes
print("training set:", D_80.shape)
print("development set:", D_dev.shape)
print("test set:", D_test.shape)

training set: (3680, 57)
development set: (460, 57)
test set: (461, 57)


In [555]:
# Cell 4: training and results for spambase

# train perceptron model
p_spambase = Perceptron(max_iter=100)
p_spambase.train(D_80, Y_80)
# give the predictions from test set
p_results = p_spambase.predict(D_test)
print("Perceptron Accuracy:", round(np.mean(p_results == Y_test) * 100, 2), "%")
print("Perceptron Confusion Matrix:\n", confusion_matrix(Y_test, p_results))
print("\n")

# train logistic regression model
LR_spambase = LogisticRegression(max_iter=1000)
LR_spambase.fit(D_80, Y_80)
# give the predictions from test set
LR_results = LR_spambase.predict(D_test)
# print results
print("Logistic Regression Accuracy:", round(np.mean(LR_results == Y_test) * 100, 2), "%")
print("Logistic Regression Confusion Matrix:\n", confusion_matrix(Y_test, LR_results))
print("\n")

# train linear SVC model
SVC_spambase = LinearSVC(max_iter=1000)
SVC_spambase.fit(D_80, Y_80)
# give the predictions from test set
SVC_results = SVC_spambase.predict(D_test)
# print results
print("Linear SVC Accuracy:", round(np.mean(SVC_results == Y_test) * 100, 2), "%")
print("Linear SVC Confusion Matrix:\n", confusion_matrix(Y_test, SVC_results))
print("\n")

Perceptron Accuracy: 83.51 %
Perceptron Confusion Matrix:
 [[235  30]
 [ 46 150]]


Logistic Regression Accuracy: 92.62 %
Logistic Regression Confusion Matrix:
 [[255  10]
 [ 24 172]]


Linear SVC Accuracy: 91.97 %
Linear SVC Confusion Matrix:
 [[254  11]
 [ 26 170]]




STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [556]:
# Cell 5: features for language

# influenced by document suggestions and class
def feats(s):
    v = np.zeros(12)
    s = s.lower()
    v[0:3] = [s.count(x) for x in 'eao']
    v[3:5] = [s.count(' '), len(s)]
    v[5:10] = ['the ' in s, 'de ' in s or 'het ' in s, 'ij' in s or 'sch' in s,
        s.endswith('ing'), s.endswith('lijk')]
    v[10] = sum(s.count(x) for x in 'ëéèäöü')
    v[11] = any(w in s for w in ['freedom', 'rights', 'vrijheid', 'rechten'])
    return v.astype(int)

# change them to vectors
X_eng = np.array([feats(s) for s in english])
X_dut = np.array([feats(s) for s in dutch])

# make labels for english and dutch, 1, -1
y_eng = np.ones(len(X_eng))
y_dut = -np.ones(len(X_dut))
X = np.vstack((X_eng, X_dut))
y = np.concatenate((y_eng, y_dut))

# 80% training set for D, and rest is dev and test (10% each)
D_80, D_20, Y_80, Y_20 = train_test_split(X, y, test_size =0.2) # could add a state for consistent results

# 10% dev, 10% test sets
D_dev, D_test, Y_dev, Y_test = train_test_split(D_20, Y_20, test_size=0.5)

# prints shapes
print("training set:", D_80.shape)
print("development set:", D_dev.shape)
print("test set:", D_test.shape)

training set: (110, 12)
development set: (14, 12)
test set: (14, 12)


In [557]:
# Cell 6: training and results of language files

# train perceptron model
p_language = Perceptron(max_iter=100)
p_language.train(D_80, Y_80)
p_results = p_language.predict(D_test)
print("Perceptron Language Accuracy:", round(np.mean(p_results == Y_test) * 100, 2), "%")
print("Perceptron Confusion Matrix:\n", confusion_matrix(Y_test, p_results))
print("\n")

# train logistic regression model
LR_language = LogisticRegression(max_iter=1000)
LR_language.fit(D_80, Y_80)
LR_results = LR_language.predict(D_test)
print("Logistic Regression Accuracy:", round(np.mean(LR_results == Y_test) * 100, 2), "%")
print("Logistic Confusion Matrix:\n", confusion_matrix(Y_test, LR_results))
print("\n")

# train linear SVC model
SVC_language = LinearSVC(max_iter=1000)
SVC_language.fit(D_80, Y_80)
SVC_results = SVC_language.predict(D_test)
print("SVC Language Accuracy:", round(np.mean(SVC_results == Y_test) * 100, 2), "%")
print("SVC Confusion Matrix:\n", confusion_matrix(Y_test, SVC_results))
print("\n")



Perceptron Language Accuracy: 92.86 %
Perceptron Confusion Matrix:
 [[7 0]
 [1 6]]


Logistic Regression Accuracy: 100.0 %
Logistic Confusion Matrix:
 [[7 0]
 [0 7]]


SVC Language Accuracy: 100.0 %
SVC Confusion Matrix:
 [[7 0]
 [0 7]]




In [558]:
# Cell 7: Evaluate models on new sentences

# new sentences, 40 in each as I need 20 per set
# from randomwordgenerator.com/sentence.php (and translate for dutch)
new_english = [
    "Jerry liked to look at paintings while eating garlic ice cream.",
    "I'm not a party animal, but I do like animal parties.",
    "He shaved the peach to prove a point.", "He excelled at firing people nicely.",
    "We will not allow you to bring your pet armadillo along.",
    "I trust everything that's written in purple ink.",
    "Joe made the sugar cookies; Susan decorated them.",
    "There's an art to getting your way, and spitting olive pits across the table isn't it.",
    "The best key lime pie is still up for debate.",
    "Some bathing suits just shouldn’t be worn by some people.",
    "This is the last random sentence I will be writing and I am going to stop mid-sent",
    "The complicated school homework left the parents trying to help their kids quite confused.",
    "I was starting to worry that my pet turtle could tell what I was thinking.",
    "The pet shop stocks everything you need to keep your anaconda happy.",
    "Thirty years later, she still thought it was okay to put the toilet paper roll under rather than over.",
    "As she walked along the street and looked in the gutter, she realized facemasks had become the new cigarette butts.",
    "It took him a while to realize that everything he decided not to change, he was actually choosing.",
    "Being unacquainted with the chief raccoon was harming his prospects for promotion.",
    "He would only survive if he kept the fire going and he could hear thunder in the distance.",
    "The gloves protect my feet from excess work.", "He was sitting in a trash can with high street class.",
    "Mothers spend months of their lives waiting on their children.",
    "Her daily goal was to improve on yesterday.", "The near-death experience brought new ideas to light.",
    "Henry couldn't decide if he was an auto mechanic or a priest.",
    "Grape jelly was leaking out the hole in the roof.",
    "He set out for a short walk, but now all he could see were mangroves and water were for miles.",
    "He wondered why at 18 he was old enough to go to war, but not old enough to buy cigarettes.",
    "He enjoys practicing his ballet in the bathroom.",
    "There was no telling what thoughts would come from the machine.",
    "They finished building the road they knew no one would ever use.",
    "It would have been a better night if the guys next to us weren't in the splash zone.",
    "There's a message for you if you look up.", "Iguanas were falling out of the trees.",
    "They ran around the corner to find that they had traveled back in time.",
    "There were a lot of paintings of monkeys waving bamboo sticks in the gallery.",
    "He barked orders at his daughters but they just stared back with amusement.",
    "8% of 25 is the same as 25% of 8 and one of them is much easier to do in your head.",
    "The secret ingredient to his wonderful life was crime.",
    "25 years later, she still regretted that specific moment."
]


new_dutch = [
    "Liefde is niet zoals pizza.", "Dit boek zal zeker je brein vloeibaar maken.",
    "Ze stond erop dat het opruimen van je kast de sleutel was tot goed rijden.",
    "Hij besloot al het zand op het strand te tellen als hobby.",
    "Ze was verdrietig te horen dat vuurvliegjes met uitsterven bedreigd worden door kunstlicht, verlies van habitat en pesticiden.",
    "Hij hield ervan zijn bananen in hotdogbroodjes te eten.",
    "Patricia houdt van het geluid van nagels die stevig tegen het schoolbord worden gedrukt.",
    "Waarheid in reclame en dinosaurussen met skateboards hebben veel gemeen.",
    "Hij was het enige lid van de club die geen pruimenpudding lustte.",
    "Hij vond regen fascinerend maar onaangenaam.",
    "Terwijl hij aan het touw bungelde diep in de kloof...",
    "De vleugels van de kolibrie vervaagden terwijl hij gretig de suikerwater uit de voeder dronk.",
    "Het is niet moeilijk om een handstand te doen als je gewoon op je handen staat.",
    "Je vindt niet vaak een zompige banaan op straat.",
    "Kevin omarmde zijn vermogen om op de verkeerde plaats op het verkeerde moment te zijn.",
    "Ze spreekt altijd met hem in een luide stem.",
    "We zijn nog nooit in Azië geweest, noch hebben we Afrika bezocht.",
    "Eindelijk...",
    "Ze zag geen ironie in mij vragen te veranderen maar wilde dat ik haar accepteerde zoals ze is.",
    "Erin creëerde per ongeluk een nieuw universum.",
    "Het bord zei dat er wegwerkzaamheden waren, dus besloot hij te versnellen.",
    "Het zou een betere nacht zijn geweest als de jongens naast ons niet in de spatzone zaten.",
    "Combineer je designer cowboyhoed met duikuitrusting voor een gedenkwaardige gelegenheid.",
    "Terwijl hij uit het raam keek, zag hij een clown voorbijlopen.",
    "Mijn moeder probeert cool te zijn door te zeggen dat ze van dezelfde dingen houdt als ik.",
    "Soms staar ik naar een deur of een muur en vraag me af wat deze realiteit is, waarom ik leef en waar dit allemaal over gaat.",
    "De ironie van de situatie ging niet aan iemand in de kamer voorbij.",
    "Ik woonde vroeger in de vijver van mijn buurman, maar het esthetische beviel me niet.",
    "Eerlijk gezegd gaf ik niet veel om het eerste seizoen, dus heb ik het tweede niet gekeken.",
    "Pat bestelde een spookpepertaart.",
    "Ze hadden dringend een andere drummer nodig, aangezien de huidige alleen bongo's kon spelen.",
    "Jason leefde zijn leven volgens het motto: 'Alles wat de moeite waard is om te doen, is de moeite waard om slecht te doen.'",
    "Blauw klonk destijds te koud, maar het leek te werken voor gin.",
    "Ik denk dat ik de rode auto zal kopen, of ik lease de blauwe.",
    "Hoewel hij dacht dat de wereld plat was, zag hij de ironie niet in van de wens om de wereld rond te reizen.",
    "Ze zeggen dat mensen belangrijke momenten in hun leven goed herinneren, toch herinnert niemand zich zijn eigen geboorte.",
    "Ik ben blij je donatie aan te nemen; elk bedrag wordt zeer gewaardeerd.",
    "Hij droeg het chirurgisch masker in het openbaar niet om een virus te vermijden, maar om mensen van zich weg te houden.",
    "Hij had per ongeluk ingebroken op de server van zijn bedrijf.",
    "Ondanks meerdere complicaties en haar bijna-doodervaring..."
]

# development set, first 20 English + 20 Dutch
devX = np.array([feats(s) for s in (new_english[:20] + new_dutch[:20])])
devY = np.concatenate((np.ones(20), -np.ones(20)))

# test set, next 20 English + 20 Dutch
testX = np.array([feats(s) for s in (new_english[20:] + new_dutch[20:])])
testY = np.concatenate((np.ones(20), -np.ones(20)))

# Evaluate on dev set
print("DEVELOPMENT SETS")
print("Perceptron Development Accuracy:", round(np.mean(p_language.predict(devX) == devY) * 100, 2), "%")
print("Perceptron Confusion Matrix:\n", confusion_matrix(devY, p_language.predict(devX)))
print("LogReg Dev Accuracy:", round(np.mean(LR_language.predict(devX) == devY) * 100, 2), "%")
print("LogReg Confusion Matrix:\n", confusion_matrix(devY, LR_language.predict(devX)))
print("SVC Dev Accuracy:", round(np.mean(SVC_language.predict(devX) == devY) * 100, 2), "%")
print("SVC Confusion Matrix:\n", confusion_matrix(devY, SVC_language.predict(devX)))

# Evaluate on test set
print("\nTEST SETS")
print("Perceptron Test Accuracy:", round(np.mean(p_language.predict(testX) == testY) * 100, 2), "%")
print("Perceptron Confusion Matrix:\n", confusion_matrix(testY, p_language.predict(testX)))
print("LogReg Test Accuracy:", round(np.mean(LR_language.predict(testX) == testY) * 100, 2), "%")
print("LogReg Confusion Matrix:\n", confusion_matrix(testY, LR_language.predict(testX)))
print("SVC Test Accuracy:", round(np.mean(SVC_language.predict(testX) == testY) * 100, 2), "%")
print("SVC Confusion Matrix:\n", confusion_matrix(testY, SVC_language.predict(testX)))


DEVELOPMENT SETS
Perceptron Development Accuracy: 77.5 %
Perceptron Confusion Matrix:
 [[18  2]
 [ 7 13]]
LogReg Dev Accuracy: 87.5 %
LogReg Confusion Matrix:
 [[17  3]
 [ 2 18]]
SVC Dev Accuracy: 90.0 %
SVC Confusion Matrix:
 [[18  2]
 [ 2 18]]

TEST SETS
Perceptron Test Accuracy: 80.0 %
Perceptron Confusion Matrix:
 [[19  1]
 [ 7 13]]
LogReg Test Accuracy: 90.0 %
LogReg Confusion Matrix:
 [[18  2]
 [ 2 18]]
SVC Test Accuracy: 92.5 %
SVC Confusion Matrix:
 [[18  2]
 [ 1 19]]


In [559]:
readme = """
Some Linear Classification Models
By: Amir Noori

Overview:
This homework uses different classification models being a perceptron, logistic regression, and linear SVC to test their accuracies with two different databases. The databases being a spam email checklist and a language model that differentiates english and dutch with added features.

Key Features:
First there is a perceptron implementation.
Then a spambase dataset is loaded and split with the differing classification models tested against each other.
This process is repeated with the language model and once again with personally implemented data of sentences.
This is done through training the data under the classification models and printing and tracking their results.

Files:
- `english.txt` / `dutch.txt`: Source text files
- `spambase.data`: Public UCI dataset for spam classification

To run this:
1. Open the Colab notebook.
2. Run each cell from top to bottom (or all at once using the Runtime --> Run All shortcut).
3. Ensure the files are uploaded properly and are in the same filepaths if running locally.
  - If there is an error there are commented lines in cell 1 that can be uncommented and run to upload the files and fix the issue.
4. View resulting model accuracy and confusion matrices for both tasks.

"""
with open("README.txt", "w") as f:
    f.write(readme)
