In [147]:
# !pip install numpy
# !pip install pandas
# !pip install scikit-learn
import numpy as np
import pandas as pd
import re
import timeit
# import plotly.express as px
# import plotly.graph_objects as go
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [148]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
valid_df = pd.read_csv('data/valid.csv')

#pre-cleaning EDA
print("---------------Pre-cleaning EDA---------------")
print("- Training data shape: ", train_df.shape)
print("- Training data null count:\n", train_df.isnull().sum())
print("- Training data duplicate count:", train_df.duplicated().sum())
counts = train_df['labels'].value_counts()
# fig1 = go.Figure(go.Bar(x=counts.index, y=counts.values))
# fig1.update_layout(title="Label counts")
# fig1.write_html("fig1.html")
train_df['text_length'] = train_df['text'].str.split().str.len()
# fig2 = px.histogram(train_df, x='text_length', nbins=50, title="Text Length Distribution")
# fig2.write_html("fig2.html")
# fig2.show()

#clean
def clean_text(text):
    text = text.lower()
    text = text.strip()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    text = re.sub(r'\s+', ' ', text)
    return text

for df in [train_df, valid_df, test_df]:
    df['text'] = df['text'].apply(clean_text)
    df['title'] = df['title'].apply(clean_text)
    df['full'] = df['text'] + ' ' + df['title']
    df['full'] = df['full'].str.strip()
train_df = train_df.drop_duplicates(subset='full', keep='first')

#post-cleaning EDA
print("---------------Post-cleaning EDA---------------")
print("- Training data shape: ", train_df.shape)
print("- Training data null count:\n", train_df.isnull().sum())
print("- Training data duplicate count:", train_df.duplicated().sum())

#feature extract
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = tfidf.fit_transform(train_df['full'])
X_test = tfidf.transform(test_df['full'])
X_valid = tfidf.transform(valid_df['full'])
Y_train = np.array(train_df['labels'])
Y_test = np.array(test_df['labels'])
Y_valid = np.array(valid_df['labels'])

---------------Pre-cleaning EDA---------------
- Training data shape:  (21527, 4)
- Training data null count:
 title         0
text          0
year_month    0
labels        0
dtype: int64
- Training data duplicate count: 1
---------------Post-cleaning EDA---------------
- Training data shape:  (21526, 6)
- Training data null count:
 title          0
text           0
year_month     0
labels         0
text_length    0
full           0
dtype: int64
- Training data duplicate count: 0


# Multinomial Naive Bayes

In [149]:
class MultinomialNaiveBayes:
    def __init__(self, alpha = 1.0):
        self.alpha = alpha
        self.classes = None
        self.class_logprior = None      
        self.feature_logprob = None  
    
    def fit(self, X, Y):
        self.classes = np.unique(Y)

        temp = []
        for c in self.classes:   
            temp.append(np.sum(Y == c))
        class_cnt = np.array(temp)
        self.class_logprior = np.log(class_cnt / Y.shape[0])

        smoothed_fc = []
        for c in self.classes:
            X_c = X[Y == c]
            fc = np.array(X_c.sum(axis=0)).ravel() + self.alpha
            smoothed_fc.append(fc)
        smoothed_fc = np.array(smoothed_fc)
        norm = smoothed_fc.sum(axis=1).reshape(-1, 1)
        self.feature_logprob = np.log(smoothed_fc / norm)
        return self
    
    def predict(self, X):
        jll = X.dot(self.feature_logprob.T) + self.class_logprior
        return self.classes[np.argmax(jll, axis=1)]

In [None]:
modelMNB = MultinomialNaiveBayes()
time_taken = timeit.timeit(
    stmt="modelMNB.fit(X_train, Y_train)",
    setup="from __main__ import modelMNB, X_train, Y_train",
    number=50
)
modelMNB.fit(X_train, Y_train)
y_valid_predMNB = modelMNB.predict(X_valid)
print("- Multinomial NB accuracy valid: ", accuracy_score(Y_valid, y_valid_predMNB))
print("- Average training time:", time_taken / 50)
y_test_predMNB = modelMNB.predict(X_test)
print("- Test report: \n", classification_report(Y_test, y_test_predMNB))

#fine tuning
best_alpha = 0
best_acc = 0
best_pred = None
alphas = np.arange(1.5, 3.01, 0.05)
for a in alphas:
    modelMNB = MultinomialNaiveBayes(alpha=a)
    modelMNB.fit(X_train, Y_train)
    y_test_predMNB = modelMNB.predict(X_test)
    curr_acc = accuracy_score(Y_test, y_test_predMNB)
    if (curr_acc > best_acc):
        best_acc = curr_acc
        best_alpha = a
        best_pred = y_test_predMNB
print("- Multinomial NB accuracy test after tuning: ", best_acc)
print("-Best parameter: alpha =", best_alpha)
print("-Test report after tunning:\n", classification_report(Y_test, y_test_predMNB))

- Multinomial NB accuracy valid:  0.8670168318013609
- Average training time: 0.015014388000126928
- Test report: 
               precision    recall  f1-score   support

        fake       0.61      0.94      0.74      1655
        true       0.98      0.85      0.91      6723

    accuracy                           0.87      8378
   macro avg       0.79      0.89      0.82      8378
weighted avg       0.91      0.87      0.88      8378

- Multinomial NB accuracy test after tuning:  0.8704941513487706
-Best parameter: alpha = 2.500000000000001
-Test report after tunning:
               precision    recall  f1-score   support

        fake       0.61      0.93      0.74      1655
        true       0.98      0.85      0.91      6723

    accuracy                           0.87      8378
   macro avg       0.80      0.89      0.83      8378
weighted avg       0.91      0.87      0.88      8378



# K Nearest Neighbours

In [144]:
class KNN:
    def __init__(self, k=3):
        self.k = k
        self.X_train = None
        self.Y_train = None
    
    def fit(self, X, Y):
        self.X_train = X
        self.Y_train = Y
    
    @staticmethod
    def dist_pp(z, x):
        z = np.array(z.toarray()).ravel()
        x = np.array(x.toarray()).ravel()
        d = z - x
        return np.sum(d * d)
    
    @staticmethod
    def dist_ps_naive(z, X):
        N = X.shape[0]
        res = np.zeros((1, N))
        for i in range(N):
            res[0][i] = KNN.dist_pp(z, X[i])
        return res
    
    def predict1(self, x):
        dist = KNN.dist_ps_naive(x, self.X_train).ravel()
        k_indices = np.argsort(dist)[:self.k]
        k_nearest_labels = [self.Y_train[i] for i in k_indices]
        values, cnt = np.unique(k_nearest_labels, return_counts=True)
        return values[np.argmax(cnt)]
    
    def predict(self, X):
        return np.array([self.predict1(x) for x in X])

In [145]:
modelKNN = KNN()
modelKNN.fit(X_train, Y_train)
y_valid_predKNN = modelKNN.predict(X_valid)
print("KNN accuracy: ", accuracy_score(Y_valid, y_valid_predKNN))

KeyboardInterrupt: 