# 2 Single 2 Perceptron

In [1]:
# Package imports

from src.utils.recorder_util import ModelResults
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from nltk.tokenize import word_tokenize

In [2]:
# Initializing recorder results

model_name = "2_Single_2Perceptron_Word2Vec"
author = "Noah Sher"
seed = 123

recorder = ModelResults(model_name, author, seed)

In [3]:
# Loading datasets

train_data = pd.read_csv("trainData/trainData.csv")
test_data = pd.read_csv("testData/testData.csv")

print("Unprocessed data:")
train_data.head()

Unprocessed data:


Unnamed: 0,label,source,text
0,0,1,Help wanted!\n\nThe Seagoing Cowboys program i...
1,0,1,The system of the Electoral College is a widel...
2,1,1,The renowned British statesman Winston Churchi...
3,0,1,"My grandfather would always say ""creativity is..."
4,1,1,In my pursuit to become an assistant manager a...


In [4]:
# Preprocessing text by:
#   1. lowercasing all letters and
#   2. splitting all strings into a list of words.

def preprocess_data(text):
    lower_text = text.lower()
    preprocessed = lower_text.split()
    return preprocessed

train_data["preprocessed_text"] = train_data["text"].apply(preprocess_data)
test_data["preprocessed_text"] = test_data["text"].apply(preprocess_data)

print("Preprocessed data:")
train_data.head()

Preprocessed data:


Unnamed: 0,label,source,text,preprocessed_text
0,0,1,Help wanted!\n\nThe Seagoing Cowboys program i...,"[help, wanted!, the, seagoing, cowboys, progra..."
1,0,1,The system of the Electoral College is a widel...,"[the, system, of, the, electoral, college, is,..."
2,1,1,The renowned British statesman Winston Churchi...,"[the, renowned, british, statesman, winston, c..."
3,0,1,"My grandfather would always say ""creativity is...","[my, grandfather, would, always, say, ""creativ..."
4,1,1,In my pursuit to become an assistant manager a...,"[in, my, pursuit, to, become, an, assistant, m..."


In [5]:
# Training Word2Vec on the train_data

word2Vec = Word2Vec(
    sentences = train_data["preprocessed_text"],
    vector_size = 100,
    window = 5,
    min_count = 1,
    workers = 4
)

def embeddings(text, model):
    vectors = []
    for word in text:
        if word in model.wv:
            vectors.append(model.wv[word])
    if vectors:
        return np.mean(vectors, axis = 0)
    else:
        return np.zeros(model.vector_size)

In [6]:
# Converting training texts to embeddings

x_train = []
for text in train_data["preprocessed_text"]:
    embedding = embeddings(text, word2Vec)
    x_train.append(embedding)
x_train = np.array(x_train)
y_train = train_data["label"].values

In [7]:
# Converting testing texts to embeddings

x_test = []
for text in test_data["preprocessed_text"]:
    embedding = embeddings(text, word2Vec)
    x_test.append(embedding)
x_test = np.array(x_test)
y_test = test_data["label"].values

In [8]:
# Scaling the data

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [9]:
# initializing weights and bias

np.random.seed(seed)
n_examples, n_features = x_train_scaled.shape
w = np.zeros(n_features)
b = 0

In [10]:
# Training the perceptron

n_epochs = 10
indices = np.arange(n_examples)
recorder.record_training_start()
for epoch in range(n_epochs):
    n_errors = 0
    np.random.shuffle(indices)
    for i in tqdm(indices, desc=f'epoch {epoch + 1}'):
        x = x_train_scaled[i]
        y_true = y_train[i]
        score = x @ w + b
        y_pred = 1 if score > 0 else 0
        if y_true != y_pred:
            if y_true == 1:
                w += x
                b += 1
            else:
                w -= x
                b -= 1
            n_errors += 1
    print(f"Epoch {epoch + 1}: Errors = {n_errors}")
    if n_errors == 0:
        break
recorder.record_training_stop()

epoch 1: 100%|██████████| 36997/36997 [00:00<00:00, 459294.16it/s]


Epoch 1: Errors = 1211


epoch 2: 100%|██████████| 36997/36997 [00:00<00:00, 466970.60it/s]


Epoch 2: Errors = 996


epoch 3: 100%|██████████| 36997/36997 [00:00<00:00, 465262.47it/s]


Epoch 3: Errors = 987


epoch 4: 100%|██████████| 36997/36997 [00:00<00:00, 465018.47it/s]


Epoch 4: Errors = 914


epoch 5: 100%|██████████| 36997/36997 [00:00<00:00, 465752.63it/s]


Epoch 5: Errors = 979


epoch 6: 100%|██████████| 36997/36997 [00:00<00:00, 464684.27it/s]


Epoch 6: Errors = 919


epoch 7: 100%|██████████| 36997/36997 [00:00<00:00, 467234.94it/s]


Epoch 7: Errors = 926


epoch 8: 100%|██████████| 36997/36997 [00:00<00:00, 464559.07it/s]


Epoch 8: Errors = 925


epoch 9: 100%|██████████| 36997/36997 [00:00<00:00, 466428.80it/s]


Epoch 9: Errors = 920


epoch 10: 100%|██████████| 36997/36997 [00:00<00:00, 463636.04it/s]

Epoch 10: Errors = 922





In [11]:
# Testing the perceptron

recorder.record_testing_start()
y_pred = (np.dot(x_test_scaled, w) + b) > 0
recorder.record_testing_stop()

In [12]:
def binary_classification_report(y_true, y_pred):
    # count true positives, false positives, true negatives, and false negatives
    tp = fp = tn = fn = 0
    for gold, pred in zip(y_true, y_pred):
        if pred == True:
            if gold == True:
                tp += 1
            else:
                fp += 1
        else:
            if gold == False:
                tn += 1
            else:
                fn += 1
    # calculate precision and recall
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    # calculate f1 score
    fscore = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    # calculate accuracy
    accuracy = (tp + tn) / len(y_true)
    # number of positive labels in y_true
    support = sum(y_true)
    return {
        "precision": precision,
        "recall": recall,
        "f1-score": fscore,
        "support": support,
        "accuracy": accuracy,
    }

In [13]:
# Recording results

report = binary_classification_report(y_test == 1, y_pred)
print(report)
recorder.record_performance(
    y_true=y_test,
    y_pred=y_pred,
    target_names=["Not Bot", "Bot"],
)
recorder.write("model_results.csv")

{'precision': 0.9792215393620135, 'recall': 0.9740902474526929, 'f1-score': 0.9766491535318155, 'support': 3435, 'accuracy': 0.9767103347889374}
