# 2 Single 2 Perceptron

In [1]:
!pip install gensim
!pip install nltk
!pip install kagglehub



In [2]:
# Package imports

from src.utils.recorder_util import ModelResults
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from tqdm import tqdm
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
import kagglehub
import os

In [3]:
# Initializing recorder results

model_name = "2_Single_2Perceptron_Word2Vec"
author = "Noah Sher"
seed = 123
recorder = ModelResults(model_name, author, seed)

In [4]:
# Loading datasets

train_data = pd.read_csv("trainData/trainData.csv")
test_data = pd.read_csv("testData/testData.csv")
#test_data = pd.read_csv("CHATGPT_test.csv")

In [5]:
print("Unprocessed train_data:")
train_data.head()

Unprocessed train_data:


Unnamed: 0,label,source,text
0,0,1,Help wanted!\n\nThe Seagoing Cowboys program i...
1,0,1,The system of the Electoral College is a widel...
2,1,1,The renowned British statesman Winston Churchi...
3,0,1,"My grandfather would always say ""creativity is..."
4,1,1,In my pursuit to become an assistant manager a...


In [6]:
print("Unprocessed test_data:")
test_data.head()

Unprocessed test_data:


Unnamed: 0,label,source,text
0,1,1,"Hey, Mrs. Johnson! Here's my essay on whether ..."
1,1,1,I believe zoos are very good for the world. Th...
2,1,1,I believe that the former British Prime Minis...
3,1,1,Limiting car usage has numerous advantages th...
4,1,1,"Okay, so libaries are like books and stuf but ..."


In [7]:
# Preprocessing text by:
#   1. lowercasing all letters and
#   2. splitting all strings into a list of words.

def preprocess_data(text):
    lower_text = text.lower()
    preprocessed = lower_text.split()
    return preprocessed

train_data["preprocessed_text"] = train_data["text"].apply(preprocess_data)
test_data["preprocessed_text"] = test_data["text"].apply(preprocess_data)

In [8]:
print("Preprocessed train_data:")
train_data.head()

Preprocessed train_data:


Unnamed: 0,label,source,text,preprocessed_text
0,0,1,Help wanted!\n\nThe Seagoing Cowboys program i...,"[help, wanted!, the, seagoing, cowboys, progra..."
1,0,1,The system of the Electoral College is a widel...,"[the, system, of, the, electoral, college, is,..."
2,1,1,The renowned British statesman Winston Churchi...,"[the, renowned, british, statesman, winston, c..."
3,0,1,"My grandfather would always say ""creativity is...","[my, grandfather, would, always, say, ""creativ..."
4,1,1,In my pursuit to become an assistant manager a...,"[in, my, pursuit, to, become, an, assistant, m..."


In [9]:
print("Preprocessed test_data:")
test_data.head()

Preprocessed test_data:


Unnamed: 0,label,source,text,preprocessed_text
0,1,1,"Hey, Mrs. Johnson! Here's my essay on whether ...","[hey,, mrs., johnson!, here's, my, essay, on, ..."
1,1,1,I believe zoos are very good for the world. Th...,"[i, believe, zoos, are, very, good, for, the, ..."
2,1,1,I believe that the former British Prime Minis...,"[i, believe, that, the, former, british, prime..."
3,1,1,Limiting car usage has numerous advantages th...,"[limiting, car, usage, has, numerous, advantag..."
4,1,1,"Okay, so libaries are like books and stuf but ...","[okay,, so, libaries, are, like, books, and, s..."


In [10]:
# Downloading pretrained Word2Vec model

dataset = kagglehub.dataset_download("adarshsng/googlenewsvectors")
path = os.path.join(dataset, "GoogleNews-vectors-negative300.bin")
word2Vec = KeyedVectors.load_word2vec_format(path, 
                                             binary = True)
print("Google's Word2Vec model loaded successfully.")

Google's Word2Vec model loaded successfully.


In [11]:
# Converting texts to embeddings

def embeddings(text, model):
    vectors = []
    for word in text:
        if word in model:
            vectors.append(model[word])
    if vectors:
        return np.mean(vectors, 
                       axis = 0)
    else:
        return np.zeros(model.vector_size)
        
x_train = []
for text in train_data["preprocessed_text"]:
    embedding = embeddings(text, 
                           word2Vec)
    x_train.append(embedding)
x_train = np.array(x_train)
y_train = train_data["label"].values

x_test = []
for text in test_data["preprocessed_text"]:
    embedding = embeddings(text, 
                           word2Vec)
    x_test.append(embedding)
x_test = np.array(x_test)
y_test = test_data["label"].values

In [12]:
# Scaling the data

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [13]:
# initializing weights and bias

np.random.seed(seed)
n_examples, n_features = x_train_scaled.shape
w = np.zeros(n_features)
b = 0

In [14]:
# Training the perceptron

n_epochs = 20
indices = np.arange(n_examples)
recorder.record_training_start()
for epoch in range(n_epochs):
    np.random.shuffle(indices)
    for i in tqdm(indices, desc = f'epoch {epoch + 1}'):
        x = x_train_scaled[i]
        y_true = y_train[i]
        score = x @ w + b
        y_pred = 1 if score > 0 else 0
        if y_true != y_pred:
            if y_true == 1:
                w += x
                b += 1
            else:
                w -= x
                b -= 1
recorder.record_training_stop()

epoch 1: 100%|██████████| 36997/36997 [00:00<00:00, 455351.97it/s]
epoch 2: 100%|██████████| 36997/36997 [00:00<00:00, 532530.29it/s]
epoch 3: 100%|██████████| 36997/36997 [00:00<00:00, 543882.21it/s]
epoch 4: 100%|██████████| 36997/36997 [00:00<00:00, 550775.76it/s]
epoch 5: 100%|██████████| 36997/36997 [00:00<00:00, 551798.11it/s]
epoch 6: 100%|██████████| 36997/36997 [00:00<00:00, 552849.87it/s]
epoch 7: 100%|██████████| 36997/36997 [00:00<00:00, 551129.83it/s]
epoch 8: 100%|██████████| 36997/36997 [00:00<00:00, 552682.50it/s]
epoch 9: 100%|██████████| 36997/36997 [00:00<00:00, 547211.75it/s]
epoch 10: 100%|██████████| 36997/36997 [00:00<00:00, 551259.05it/s]
epoch 11: 100%|██████████| 36997/36997 [00:00<00:00, 554390.47it/s]
epoch 12: 100%|██████████| 36997/36997 [00:00<00:00, 543449.83it/s]
epoch 13: 100%|██████████| 36997/36997 [00:00<00:00, 543430.80it/s]
epoch 14: 100%|██████████| 36997/36997 [00:00<00:00, 539962.79it/s]
epoch 15: 100%|██████████| 36997/36997 [00:00<00:00, 5503

In [15]:
# Testing the perceptron

recorder.record_testing_start()
y_pred = (np.dot(x_test_scaled, w) + b) > 0
recorder.record_testing_stop()

In [16]:
# Recording results

report = classification_report(y_test, 
                               y_pred, 
                               target_names = ["Not Bot", "Bot"])
print(report)
recorder.record_performance(y_true = y_test,
                            y_pred = y_pred,
                            target_names = ["Not Bot", "Bot"])
recorder.write("model_results.csv")

              precision    recall  f1-score   support

     Not Bot       0.95      0.96      0.96      3435
         Bot       0.96      0.95      0.96      3435

    accuracy                           0.96      6870
   macro avg       0.96      0.96      0.96      6870
weighted avg       0.96      0.96      0.96      6870

