In [1]:
# Package imports

from src.utils.recorder_util import ModelResults
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [2]:
# Run configuration: the identifying metadata and the seed are declared
# together, then handed to the results recorder in that order.

model_name, author, seed = "Single_Perceptron_Word2Vec", "Noah Sher", 123

recorder = ModelResults(model_name, author, seed)

In [3]:
# Reading the train and test splits from their CSV files
# (paths are relative to the notebook's working directory).

train_data = pd.read_csv(filepath_or_buffer = "trainData/trainData.csv")
test_data = pd.read_csv(filepath_or_buffer = "testData/testData.csv")

# Quick look at the raw rows before any processing.
print("Unprocessed data:")
train_data.head()

Unprocessed data:


Unnamed: 0,label,source,text
0,0,1,Help wanted!\n\nThe Seagoing Cowboys program i...
1,0,1,The system of the Electoral College is a widel...
2,1,1,The renowned British statesman Winston Churchi...
3,0,1,"My grandfather would always say ""creativity is..."
4,1,1,In my pursuit to become an assistant manager a...


In [4]:
# Preprocessing text by:
#   1. lowercasing all letters and
#   2. splitting all strings into a list of words.

def preprocess_data(text):
    """Lowercase a document and split it into whitespace-delimited tokens.

    Parameters
    ----------
    text : str or scalar
        Raw document text. Missing values (None, NaN, pd.NA), which is how
        pandas represents empty CSV cells, are accepted; any other
        non-string scalar is converted with ``str()`` first.

    Returns
    -------
    list of str
        Lowercased tokens; an empty list for empty or missing text.
    """
    if not isinstance(text, str):
        # An empty CSV cell is read as float NaN, which has no .lower();
        # treat it as a document with no tokens instead of crashing.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return []
        text = str(text)
    # str.split() with no separator splits on any whitespace run and
    # drops leading/trailing whitespace, so no empty tokens are produced.
    return text.lower().split()

# Tokenise the "text" column of both splits into a new "preprocessed_text" column.
test_data["preprocessed_text"] = test_data["text"].apply(preprocess_data)
train_data["preprocessed_text"] = train_data["text"].apply(preprocess_data)

# Preview the training rows with the new token-list column.
print("Preprocessed data:")
train_data.head()

Preprocessed data:


Unnamed: 0,label,source,text,preprocessed_text
0,0,1,Help wanted!\n\nThe Seagoing Cowboys program i...,"[help, wanted!, the, seagoing, cowboys, progra..."
1,0,1,The system of the Electoral College is a widel...,"[the, system, of, the, electoral, college, is,..."
2,1,1,The renowned British statesman Winston Churchi...,"[the, renowned, british, statesman, winston, c..."
3,0,1,"My grandfather would always say ""creativity is...","[my, grandfather, would, always, say, ""creativ..."
4,1,1,In my pursuit to become an assistant manager a...,"[in, my, pursuit, to, become, an, assistant, m..."


In [5]:
# Training Word2Vec on the tokenised train_data only (test_data is never
# shown to the embedding model).
#
# The notebook-level `seed` is forwarded so the embedding initialisation is
# tied to the same seed that is given to the recorder and the Perceptron.
# NOTE: per the gensim documentation, a fully deterministic run additionally
# requires workers = 1 (multi-threaded training introduces ordering jitter);
# workers is left at 4 here to keep training time unchanged.

word2Vec = Word2Vec(
    sentences = train_data["preprocessed_text"],
    vector_size = 100,
    window = 5,
    min_count = 1,
    seed = seed,
    workers = 4)

def get_embedding(text, model):
    """Average the word vectors of a document's in-vocabulary tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.
    model : object exposing ``wv``
        Trained embedding model; ``model.wv`` must support ``token in wv``
        and ``wv[token]`` (as the Word2Vec model trained above does).

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of every token found in
        ``model.wv``. If no token is found, a zero vector whose length is
        ``model.wv.vector_size`` (100 when that attribute is absent).
    """
    keyed_vectors = model.wv
    vectors = [keyed_vectors[word] for word in text if word in keyed_vectors]
    if vectors:
        return np.mean(vectors, axis = 0)
    # Size the fallback from the model rather than a hard-coded 100 so that
    # every row has the same length whatever vector_size the model was
    # trained with; 100 is kept as the default for objects without the attribute.
    return np.zeros(getattr(keyed_vectors, "vector_size", 100))

In [6]:
# Turning every tokenised document into one averaged embedding vector,
# and pulling the labels out as plain arrays, for both splits.

x_train = np.array(
    [get_embedding(tokens, word2Vec) for tokens in train_data["preprocessed_text"]]
)
y_train = train_data["label"].values

x_test = np.array(
    [get_embedding(tokens, word2Vec) for tokens in test_data["preprocessed_text"]]
)
y_test = test_data["label"].values

In [7]:
# Standardising the embedding features.
# The scaler is fitted on the training matrix only; the same fitted
# transformation is then applied to both the train and the test matrices.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [8]:
# Building the single-perceptron classifier; it shares the notebook seed
# and is allowed at most 1000 passes over the training data.

model = Perceptron(
    max_iter = 1000,
    random_state = seed,
)

In [9]:
# Training on the scaled train_data.
# The fit call is bracketed by the recorder's training start/stop hooks, so
# statement order matters here.
# NOTE(review): presumably the hooks timestamp the training phase — confirm
# against ModelResults in src/utils/recorder_util.

recorder.record_training_start()
model.fit(x_train_scaled, y_train)
recorder.record_training_stop()

In [10]:
# Predicting the test_data.
# Only the predict call sits between the recorder's testing start/stop hooks;
# the scaled test features were prepared in an earlier cell.
# NOTE(review): presumably the hooks timestamp the inference phase — confirm
# against ModelResults in src/utils/recorder_util.

recorder.record_testing_start()
y_prediction = model.predict(x_test_scaled)
recorder.record_testing_stop()

In [11]:
# Scoring the predictions against the true test labels and storing the
# result on the recorder. Label 0 is reported as "Not Bot", label 1 as "Bot".
# NOTE(review): the saved output of this cell shows
# "NameError: name 'y_pred' is not defined". This call only uses y_pred as a
# keyword, so the error presumably comes from an earlier revision of the cell
# or from inside record_performance — re-run on a fresh kernel to confirm.
target_names = ["Not Bot", "Bot"]
recorder.record_performance(y_true = y_test, y_pred = y_prediction, target_names = target_names)

NameError: name 'y_pred' is not defined

In [None]:
# Persisting the recorded results to CSV, then echoing the performance summary.

recorder.write("model_results.csv")
print("Performance Summary:", recorder.performance, sep = "\n")