# 2 Single 2 Perceptron

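## Before running the perceptron
## Run these commands in a terminal shell:
1. pip install gensim
2. pip install nltk
3. pip install kaggle
4. pip install kagglehub pandas numpy scikit-learn tqdm (also imported by the first code cell)

The pretrained vectors must already be present at `GoogleModel/GoogleNews-vectors-negative300.bin`.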

In [1]:
# Package imports

from src.utils.recorder_util import ModelResults
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from tqdm import tqdm
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
import kagglehub

In [2]:
# Initializing recorder results
#
# NOTE(review): ModelResults is a project helper (src.utils.recorder_util);
# what it does with these three values is not visible in this notebook.

model_name = "2_Single_2Perceptron_Word2Vec"
author = "Noah Sher"
seed = 123  # also fed to np.random.seed() before training
recorder = ModelResults(model_name, author, seed)

In [3]:
# Loading datasets
#
# NOTE(review): train_data and test_data are both read from
# "testData/testData.csv". The model is therefore evaluated on the very rows
# it was trained on, and the classification report at the end of the notebook
# describes training-set fit, not held-out performance. Point train_data at
# the real training split (its path is not visible here -- TODO confirm)
# before trusting those numbers.

train_data = pd.read_csv("testData/testData.csv")
test_data = pd.read_csv("testData/testData.csv")
# Alternative evaluation set, currently disabled:
#test_data = pd.read_csv("CHATGPT_test.csv")

In [4]:
# Preview the first rows of the raw training frame (rendered as the cell output).
print("Unprocessed train_data:")
train_data.head()

Unprocessed train_data:


Unnamed: 0,label,source,text
0,1,1,"Hey, Mrs. Johnson! Here's my essay on whether ..."
1,1,1,I believe zoos are very good for the world. Th...
2,1,1,I believe that the former British Prime Minis...
3,1,1,Limiting car usage has numerous advantages th...
4,1,1,"Okay, so libaries are like books and stuf but ..."


In [5]:
# Preview the first rows of the raw test frame (rendered as the cell output).
print("Unprocessed test_data:")
test_data.head()

Unprocessed test_data:


Unnamed: 0,label,source,text
0,1,1,"Hey, Mrs. Johnson! Here's my essay on whether ..."
1,1,1,I believe zoos are very good for the world. Th...
2,1,1,I believe that the former British Prime Minis...
3,1,1,Limiting car usage has numerous advantages th...
4,1,1,"Okay, so libaries are like books and stuf but ..."


In [6]:
# Preprocessing text by:
#   1. lowercasing all letters and
#   2. splitting all strings into a list of words.
# Cells that are not strings (e.g. NaN from an empty CSV field) become [].

def preprocess_data(text):
    """Lowercase ``text`` and split it on whitespace into a list of tokens.

    Parameters
    ----------
    text : str or any
        One cell of the ``text`` column. Anything that is not a ``str``
        (``None``, ``float('nan')``, ...) is treated as an empty document
        instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    list of str
        Lowercased whitespace-separated tokens; ``[]`` for empty or
        non-string input.

    Notes
    -----
    Punctuation is left attached to tokens ("hey," stays "hey,").
    NOTE(review): such tokens, and lowercased proper nouns, presumably miss
    the pretrained embedding vocabulary and are then skipped downstream --
    confirm whether a real tokenizer was intended.
    """
    # pd.read_csv yields float NaN for empty fields; guard before .lower().
    if not isinstance(text, str):
        return []
    return text.lower().split()

# Tokenise every document element-wise and keep the result alongside the raw
# text in a new "preprocessed_text" column of each frame.
train_data["preprocessed_text"] = train_data["text"].map(preprocess_data)
test_data["preprocessed_text"] = test_data["text"].map(preprocess_data)

In [7]:
# Preview the training frame again, now including the "preprocessed_text" column.
print("Preprocessed train_data:")
train_data.head()

Preprocessed train_data:


Unnamed: 0,label,source,text,preprocessed_text
0,1,1,"Hey, Mrs. Johnson! Here's my essay on whether ...","[hey,, mrs., johnson!, here's, my, essay, on, ..."
1,1,1,I believe zoos are very good for the world. Th...,"[i, believe, zoos, are, very, good, for, the, ..."
2,1,1,I believe that the former British Prime Minis...,"[i, believe, that, the, former, british, prime..."
3,1,1,Limiting car usage has numerous advantages th...,"[limiting, car, usage, has, numerous, advantag..."
4,1,1,"Okay, so libaries are like books and stuf but ...","[okay,, so, libaries, are, like, books, and, s..."


In [8]:
# Preview the test frame again, now including the "preprocessed_text" column.
print("Preprocessed test_data:")
test_data.head()

Preprocessed test_data:


Unnamed: 0,label,source,text,preprocessed_text
0,1,1,"Hey, Mrs. Johnson! Here's my essay on whether ...","[hey,, mrs., johnson!, here's, my, essay, on, ..."
1,1,1,I believe zoos are very good for the world. Th...,"[i, believe, zoos, are, very, good, for, the, ..."
2,1,1,I believe that the former British Prime Minis...,"[i, believe, that, the, former, british, prime..."
3,1,1,Limiting car usage has numerous advantages th...,"[limiting, car, usage, has, numerous, advantag..."
4,1,1,"Okay, so libaries are like books and stuf but ...","[okay,, so, libaries, are, like, books, and, s..."


In [9]:
# Loading Google's pretrained Word2Vec vectors
#
# No embedding model is trained on train_data here: the vectors are read
# as-is from a binary word2vec-format file that must already exist at `path`
# (this cell does not download it).

path = "GoogleModel/GoogleNews-vectors-negative300.bin"
word2Vec = KeyedVectors.load_word2vec_format(path, binary = True)
print("Google's Word2Vec model loaded successfully.")

Google's Word2Vec model loaded successfully.


In [10]:
# Converting texts to embeddings

def embeddings(text, model):
    """Return one fixed-size vector for a tokenised document.

    The document vector is the element-wise mean of the vectors of every
    token that `model` contains; tokens missing from `model` are ignored.
    If no token is found (including an empty document), a zero vector of
    length `model.vector_size` is returned instead.

    `model` only needs to support `token in model`, `model[token]` and a
    `vector_size` attribute.
    """
    known = [model[token] for token in text if token in model]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis = 0)
        
# One averaged Word2Vec vector per document, stacked into a 2-D feature
# matrix; the labels come straight from the "label" column.
x_train = np.array(
    [embeddings(tokens, word2Vec) for tokens in train_data["preprocessed_text"]]
)
y_train = train_data["label"].values

x_test = np.array(
    [embeddings(tokens, word2Vec) for tokens in test_data["preprocessed_text"]]
)
y_test = test_data["label"].values

In [11]:
# Scaling the data
#
# The scaler's statistics are learned from the training matrix only and then
# reused unchanged for the test matrix.

scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [12]:
# initializing weights and bias

# Seed NumPy's global RNG so the np.random.shuffle calls during training
# produce the same example order on every run.
np.random.seed(seed)
n_examples, n_features = x_train_scaled.shape
# Start from an all-zero weight vector (one weight per feature) and zero bias.
w = np.zeros(n_features)
b = 0

In [13]:
# Training the perceptron
#
# Online perceptron rule: visit the training examples in a freshly shuffled
# order each epoch, predict 1 when x.w + b > 0 (else 0), and only on a
# mistake move (w, b) towards the example if its label is 1 or away from it
# otherwise. Training stops early if an epoch finishes with no mistakes.

n_epochs = 10
indices = np.arange(n_examples)
recorder.record_training_start()
for epoch in range(n_epochs):
    np.random.shuffle(indices)
    n_errors = 0
    for i in tqdm(indices, desc = f'epoch {epoch + 1}'):
        x, y_true = x_train_scaled[i], y_train[i]
        score = x @ w + b
        y_pred = 1 if score > 0 else 0
        if y_true == y_pred:
            continue  # correctly classified: no update
        # Misclassified: +1 step for a label-1 example, -1 step otherwise.
        step = 1 if y_true == 1 else -1
        w += step * x
        b += step
        n_errors += 1
    print(f"Epoch {epoch + 1}: Errors = {n_errors}")
    if n_errors == 0:
        print("No errors, early stopping.")
        break
recorder.record_training_stop()

epoch 1: 100%|██████████| 6870/6870 [00:00<00:00, 405580.45it/s]


Epoch 1: Errors = 616


epoch 2: 100%|██████████| 6870/6870 [00:00<00:00, 421257.69it/s]


Epoch 2: Errors = 438


epoch 3: 100%|██████████| 6870/6870 [00:00<00:00, 434122.31it/s]


Epoch 3: Errors = 364


epoch 4: 100%|██████████| 6870/6870 [00:00<00:00, 437915.93it/s]


Epoch 4: Errors = 370


epoch 5: 100%|██████████| 6870/6870 [00:00<00:00, 436661.69it/s]


Epoch 5: Errors = 316


epoch 6: 100%|██████████| 6870/6870 [00:00<00:00, 438182.31it/s]


Epoch 6: Errors = 349


epoch 7: 100%|██████████| 6870/6870 [00:00<00:00, 432883.17it/s]


Epoch 7: Errors = 323


epoch 8: 100%|██████████| 6870/6870 [00:00<00:00, 437729.67it/s]


Epoch 8: Errors = 323


epoch 9: 100%|██████████| 6870/6870 [00:00<00:00, 432909.19it/s]


Epoch 9: Errors = 319


epoch 10: 100%|██████████| 6870/6870 [00:00<00:00, 428696.99it/s]

Epoch 10: Errors = 307





In [14]:
# Testing the perceptron
#
# Score every test row in one matrix-vector product; a row is predicted
# positive when its score is strictly greater than zero. y_pred is therefore
# a boolean array (True = positive class).

recorder.record_testing_start()
scores = np.dot(x_test_scaled, w) + b
y_pred = scores > 0
recorder.record_testing_stop()

In [15]:
# Recording results
#
# Print the per-class precision / recall / F1 table, then hand the same
# labels and predictions to the recorder and write to "model_results.csv".

class_labels = ["Not Bot", "Bot"]  # display names passed as target_names, in this order
report = classification_report(y_test, y_pred, target_names = class_labels)
print(report)
recorder.record_performance(
    y_true = y_test,
    y_pred = y_pred,
    target_names = class_labels,
)
recorder.write("model_results.csv")

              precision    recall  f1-score   support

     Not Bot       0.97      0.95      0.96      3435
         Bot       0.95      0.97      0.96      3435

    accuracy                           0.96      6870
   macro avg       0.96      0.96      0.96      6870
weighted avg       0.96      0.96      0.96      6870

