In [72]:
# !pip install numpy
# !pip install pandas
# !pip install scikit-learn
# !pip install numba
import numpy as np
import pandas as pd
import re
import timeit
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import LabelEncoder
from numba import njit

In [73]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
valid_df = pd.read_csv('data/valid.csv')

#pre-cleaning EDA
# print("---------------Pre-cleaning EDA---------------")
# print("- Training data shape: ", train_df.shape)
# print("- Training data null count:\n", train_df.isnull().sum())
# print("- Training data duplicate count:", train_df.duplicated().sum())
counts = train_df['labels'].value_counts()
train_df['text_length'] = train_df['text'].str.split().str.len()

#clean
stopwords = [
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am',
    'an', 'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because',
    'been', 'before', 'being', 'below', 'between', 'both', 'but',
    'by', "can't", 'cannot', 'could', "couldn't", 'did', "didn't",
    'do', 'does', "doesn't", 'doing', "don't", 'down', 'during',
    'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has',
    "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll",
    "he's", 'her', 'here', "here's", 'hers', 'herself', 'him',
    'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm",
    "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its',
    'itself', "let's", 'me', 'more', 'most', "mustn't", 'my',
    'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only',
    'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out',
    'over', 'own', 'same', "shan't", 'she', "she'd", "she'll",
    "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than',
    'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves',
    'then', 'there', "there's", 'these', 'they', "they'd", "they'll",
    "they're", "they've", 'this', 'those', 'through', 'to', 'too',
    'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd",
    "we'll", "we're", "we've", 'were', "weren't", 'what', "what's",
    'when', "when's", 'where', "where's", 'which', 'while', 'who',
    "who's", 'whom', 'why', "why's", 'with', "won't", 'would',
    "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your',
    'yours', 'yourself', 'yourselves'
]

def remove_stopwords(text):
    if not text:
        return ""
    words = text.split()
    filtered = []
    for w in words:
        if w not in stopwords and len(w) > 2 and w.isalpha():
            filtered.append(w)
    return ' '.join(filtered)

def clean_text(text):
    text = text.lower()
    text = text.strip()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    text = re.sub(r'\s+', ' ', text)
    return text

for df in [train_df, valid_df, test_df]:
    df['text'] = df['text'].apply(clean_text).apply(remove_stopwords)
    df['title'] = df['title'].apply(clean_text).apply(remove_stopwords)
    df['full'] = df['text'] + ' ' + df['title']
    df['full'] = df['full'].str.strip()
train_df = train_df.drop_duplicates(subset='full', keep='first')

#tackle labels imbalance
train_df = pd.concat([train_df, train_df[train_df['labels'] == 'true']], ignore_index=True)
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

#post-cleaning EDA
# print("---------------Post-cleaning EDA---------------")
# print("- Training data shape: ", train_df.shape)
# print("- Training data null count:\n", train_df.isnull().sum())
# print("- Training data duplicate count:", train_df.duplicated().sum())

#feature extract
bow = CountVectorizer(stop_words=stopwords, max_features=5000)
X_train = bow.fit_transform(train_df['full']).toarray()
X_test = bow.transform(test_df['full']).toarray()
X_valid = bow.transform(valid_df['full']).toarray()
Y_train = np.array(train_df['labels'])
Y_test = np.array(test_df['labels'])
Y_valid = np.array(valid_df['labels'])

# print(X_train)



# Multinomial Naive Bayes

In [74]:
class MultinomialNaiveBayes:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.classes = None
        self.class_prior = None
        self.prob = None

    def fit(self, X, Y):
        self.classes = np.unique(Y)

        class_cnt = []
        for c in self.classes:
            class_cnt.append(len(Y[Y==c]))
        self.class_prior = np.log(np.array(class_cnt) / len(Y))

        words_cnt = []
        for c in self.classes:
            Xc = X[Y == c]
            cnt = np.array(Xc.sum(axis=0)).ravel() + self.alpha
            words_cnt.append(cnt)
        total = np.array(words_cnt).sum(axis=1).reshape(-1, 1)
        self.prob = np.log(words_cnt / total)
    
    def predict(self, X):
        prediction = X.dot(self.prob.T) + self.class_prior
        return self.classes[np.argmax(prediction, axis=1)]

In [75]:
modelMNB = MultinomialNaiveBayes()
time_taken = timeit.timeit(
    stmt="modelMNB.fit(X_train, Y_train)",
    setup="from __main__ import modelMNB, X_train, Y_train",
    number=50
)
modelMNB.fit(X_train, Y_train)
y_valid_predMNB = modelMNB.predict(X_valid)
print("- Multinomial NB accuracy valid: ", accuracy_score(Y_valid, y_valid_predMNB))
print("- Average training time:", time_taken / 50)
y_test_predMNB = modelMNB.predict(X_test)
print("- Test report: \n", classification_report(Y_test, y_test_predMNB))

#fine tuning
best_alpha = 0
best_acc = 0
best_pred = None
alphas = np.arange(1.5, 3.01, 0.05)
for a in alphas:
    modelMNB = MultinomialNaiveBayes(alpha=a)
    modelMNB.fit(X_train, Y_train)
    y_test_predMNB = modelMNB.predict(X_test)
    curr_acc = accuracy_score(Y_test, y_test_predMNB)
    if (curr_acc > best_acc):
        best_acc = curr_acc
        best_alpha = a
        best_pred = y_test_predMNB
print("- Multinomial NB accuracy test after tuning: ", best_acc)
print("-Best parameter: alpha =", best_alpha)
print("-Test report after tunning:\n", classification_report(Y_test, y_test_predMNB))

- Multinomial NB accuracy valid:  0.9023516772114122
- Average training time: 0.4288096660003066
- Test report: 
               precision    recall  f1-score   support

        fake       0.69      0.93      0.79      1655
        true       0.98      0.90      0.94      6723

    accuracy                           0.90      8378
   macro avg       0.83      0.91      0.86      8378
weighted avg       0.92      0.90      0.91      8378

- Multinomial NB accuracy test after tuning:  0.9031988541417999
-Best parameter: alpha = 2.8500000000000014
-Test report after tunning:
               precision    recall  f1-score   support

        fake       0.69      0.93      0.79      1655
        true       0.98      0.90      0.94      6723

    accuracy                           0.90      8378
   macro avg       0.84      0.91      0.86      8378
weighted avg       0.92      0.90      0.91      8378



# K-Means Clustering

In [None]:
@njit
def distance(x, y):
    if hasattr(x, 'toarray'):
        x = x.toarray()
    x = x.ravel()
    y = y.ravel()
    return np.sqrt(np.sum((x - y) ** 2))

def to_labels(clusters, y_train, k):
    mapping = {}
    for cl in range(k):
        mask = (clusters == cl)
        label_cnt = np.bincount(y_train[mask])
        if len(label_cnt == 0):
            mapping[cl] = -1
        else:
            mapping[cl] = np.argmax(label_cnt)
    prediction = np.zeros(len(y_train), dtype=int)
    for cl in range(k):
        mask = (clusters == cl)
        prediction[mask] = mapping[cl]
    return prediction

class KMC:
    def __init__(self, k=3, max_iters=100, tol=0.0001):
        self.k = k
        self.max_iters = max_iters
        self.tol = tol
        self.centroids = None
    
    def assign_clusters(self, X):
        temp = []
        for x in X:
            min_dist = 1000000000.0
            closest_cluster = -1
            for i, c in enumerate(self.centroids):
                curr_dist = distance(x, c)
                if curr_dist < min_dist:
                    min_dist = curr_dist
                    closest_cluster = i
            temp.append(closest_cluster)
        return np.array(temp)

    def compute_centroids(self, X, clusters):
        new_centroids = np.zeros((self.k, X.shape[1]))
        for i in range(self.k):
            points = X[clusters == i]
            if len(points) > 0:
                new_centroids[i] = points.mean(axis=0)
            else:
                new_centroids[i] = self.centroids[i]
        return new_centroids
        
    def fit(self, X):
        if hasattr(X, 'toarray'):
            X.toarray()
        centroids_id = np.random.choice(X.shape[0], size=self.k, replace=False)
        self.centroids = X[centroids_id]
        for i in range(self.max_iters):
            clusters = self.assign_clusters(X)
            new_centroids = self.compute_centroids(X, clusters)
            if np.linalg.norm(new_centroids - self.centroids) < self.tol:
                break
            self.centroids = new_centroids

    def predict(self, X):
        if hasattr(X, 'toarray'):
            X.toarray()
        return self.assign_clusters(X)

In [78]:
modelKMC = KMC(k=5)
modelKMC.fit(X_train)
clusters = modelKMC.predict(X_valid)
l = LabelEncoder()

encoded = l.fit_transform(Y_valid)
y_valid_predKMC = to_labels(clusters, encoded, k=3)
y_valid_predKMC_original = l.inverse_transform(y_valid_predKMC)
print("KMC accuracy valid: ", accuracy_score(Y_valid, y_valid_predKMC_original))

y_test_clusters = modelKMC.predict(X_test)
y_test_predKMC = to_labels(y_test_clusters, l.transform(Y_test), k=5)
y_test_predKMC_original = l.inverse_transform(y_test_predKMC)
print("- Test report: \n", classification_report(Y_test, y_test_predKMC_original))

#tuning
ks = np.arange(5, 9, 2)
best_k = 0
best_acc = 0
for k in ks:
    model = KMC(k=k)
    model.fit(X_train)
    clusters_valid = model.predict(X_valid)
    y_valid_pred = to_labels(clusters_valid, encoded, k=k)
    y_valid_pred_og = l.inverse_transform(y_valid_pred)
    curr_acc = accuracy_score(Y_valid, y_valid_pred_og)
    if curr_acc > best_acc:
        best_acc = curr_acc
        best_k = k

print("- KMC accuracy valid after tuning: ", best_acc)
print("Best k: ", best_k)

model_best = KMC(k=best_k)
model_best.fit(X_train)
clusters_test_best = model_best.predict(X_test)
y_test_pred_best = to_labels(clusters_test_best, l.transform(Y_test), k=best_k)
y_test_pred_best_original = l.inverse_transform(y_test_pred_best)

print("- Test report after tuning:\n", classification_report(Y_test, y_test_pred_best_original))

KeyboardInterrupt: 