## Setup

In [2]:
# RUN SETUP.SH BEFORE RUNNING THIS IPYNB
# REQUIREMENTS FOR SETUP.SH:
# python 3.11.8
# pip 23.3.1

import pandas as pd
import pickle
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
import re
import numpy as np
import nltk
from nltk import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from collections import Counter

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hong\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
SEED = 42 # seed random state for comparison, testing
PARTITION_SIZE = 500 # Adjust lower if potato PC and higher if gaming rig or want results closer to actual
enable_all_data = True # SET TO FALSE IF PREPROCESSING TAKES A LONG TIME (True = test on PARTITION_SIZE training and PARTITION_SIZE testing samples)
enable_features_from_savefile = True
enable_validation_data = False
enable_model_from_savefile = True
use_external_dataset = True

## Pre-processing

In [4]:
df = pd.read_csv('../raw_data/synonym_augmented_train.csv', header=None, index_col = False)
# df.head()

In [5]:
df = df if enable_all_data else df.sample(n=PARTITION_SIZE, random_state=SEED)

X_train = df.iloc[:, 1] 
y_train = df.iloc[:, 0]

# print(X_train)
# print(y_train)

# print(len(X_train))
# print(len(y_train))

# y_train.value_counts()

### Train-Validation Split

In [None]:
# if enable_validation_data:
#     X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED)

### Doc2Vec

In [6]:
X_train = X_train.apply(word_tokenize)
X_train.head()

0    [A, little, less, than, a, decade, ago, hockey...
1    [The, writers, of, the, HBO, series, The, Sopr...
2    [Despite, claims, from, the, TV, news, outlet,...
3    [After, receiving, subpar, service, and, exper...
4    [After, watching, his, beloved, Seattle, Marin...
Name: 1, dtype: object

In [7]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_train)]

In [8]:
d2v = Doc2Vec(epochs=10, seed=SEED, workers=4)
d2v.build_vocab(documents)
d2v.train(documents, total_examples=d2v.corpus_count, epochs=d2v.epochs)

In [19]:
model = KMeans(n_clusters=10*4, init='k-means++', random_state=SEED)
model.fit(d2v.dv.vectors)

In [68]:
model = KNeighborsClassifier(n_neighbors=20)
model.fit(d2v.dv.vectors, y_train)

In [20]:
# cluster : label mappings
cluster_labels = {}
for i, label in enumerate(model.labels_):
    if label not in cluster_labels:
        cluster_labels[label] = []
    cluster_labels[label].append(y_train[i])

for cluster, cluster_data_labels in cluster_labels.items():
    majority_label = Counter(cluster_data_labels).most_common(1)[0][0]
    cluster_labels[cluster] = majority_label


## Validation

## Test Data

In [11]:
# TEST DATA 
if use_external_dataset:
    test_df = pd.read_csv('../external-dataset/opensources_fakenewscorpus_modified_undersampled.csv', header=None, index_col = False)
else:
    test_df = pd.read_csv('../raw_data/balancedtest.csv', index_col = False)
    test_df = test_df if enable_all_data else test_df.sample(PARTITION_SIZE, random_state=SEED)

# print(test_df)
print(test_df.head())

   0                                                  1
0  1  Overpaid council boss refreshingly open about ...
1  1  "Hazing is an important part of joining any ex...
2  1  A former #doctor with the #Bronx-Lebanon #Hosp...
3  1  Pop music sensation, #Justin Bieber, 23, has h...
4  1  DISAPPOINTED Daniel Bryan fans have been asked...


In [12]:
X_test = test_df.iloc[:, 1]
y_test = test_df.iloc[:, 0]

# print(X_test.head())
# print(y_test.head())

In [13]:
X_test = X_test.apply(word_tokenize)

### Doc2Vec

In [69]:
X_test_final = X_test.apply(d2v.infer_vector)
X_test_final.head()

0    [-0.17470889, -0.7265898, 0.092705764, -0.3004...
1    [0.21645771, 0.103275545, 0.48571834, -0.20144...
2    [0.40915078, -0.101906, -0.40269232, -0.732907...
3    [0.5386072, -0.0770307, 0.3462495, -0.42573756...
4    [0.45966715, 0.11958309, -0.16861692, -0.75403...
Name: 1, dtype: object

### Metrics

#### Accuracy, precision, recall, F1

In [70]:
# obtain predictions on test data
y_pred = model.predict(np.stack(X_test_final.values))
print(pd.Series(y_pred).value_counts())

4    5543
3    5105
1    2042
2    1310
Name: count, dtype: int64


In [65]:
# map cluster assignments to labels
y_pred = np.array(list(map(lambda x: cluster_labels[x], y_pred)))
print(pd.Series(y_pred).value_counts())

In [71]:
# evaluate model training metrics with macro f1 score
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
test_accuracy = accuracy_score(y_test, y_pred)

In [72]:
print(f'Test Accuracy: {test_accuracy:.8f},\tTest Precision: {test_precision:.8f},\tTest Recall: {test_recall:.8f},\tTest F1: {test_f1:.8f}')

class_test_precision, class_test_recall, class_test_f1, class_ = precision_recall_fscore_support(y_test, y_pred)
for i in range(4):
    print(f'Class {i}:\tTest Precision: {class_test_precision[i]:.8f},\tTest Recall: {class_test_recall[i]:.8f},\tTest f1: {class_test_f1[i]:.8f}')

pd.Series(y_pred).value_counts()

Test Accuracy: 0.41035714,	Test Precision: 0.36296636,	Test Recall: 0.41035714,	Test F1: 0.36706886
Class 0:	Test Precision: 0.59745348,	Test Recall: 0.34857143,	Test f1: 0.44027427
Class 1:	Test Precision: 0.00152672,	Test Recall: 0.00057143,	Test f1: 0.00083160
Class 2:	Test Precision: 0.46699314,	Test Recall: 0.68114286,	Test f1: 0.55409646
Class 3:	Test Precision: 0.38589212,	Test Recall: 0.61114286,	Test f1: 0.47307310


4    5543
3    5105
1    2042
2    1310
Name: count, dtype: int64