In [85]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from sklearn.utils import resample

In [86]:
# Read the labelled training set and discard rows whose 'News/Comment' text is
# missing, since every later feature is derived from that column.
df = pd.read_csv('labelled_train_set.csv').dropna(subset=['News/Comment'])

In [87]:
# Rows labelled 'MOSTLY TRUE' are resampled (with replacement) to exactly two
# rows, then re-attached to all remaining rows. The 'ID' column is dropped.
# NOTE(review): this resampling happens before the train/test split, so copies
# of the same row could land on both sides of the split — confirm this is intended.
is_mostly_true = df['Type'] == 'MOSTLY TRUE'
df_minority = df.loc[is_mostly_true]
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=2,
    random_state=42,
)
df_upsampled = pd.concat(
    [df.loc[~is_mostly_true], df_minority_upsampled]
).drop(columns=['ID'])

In [88]:
def count_unique_words(text):
    """Return the number of distinct whitespace-separated tokens in ``text``.

    Tokens are compared exactly (case-sensitive, punctuation kept), so
    ``"Hello"`` and ``"hello"`` count as two different words.
    """
    return len(set(text.split()))

def calculate_ttr(text):
    """Return the type-token ratio of ``text``.

    The ratio is (number of distinct whitespace-separated tokens) divided by
    (total number of tokens). Text with no tokens yields 0 rather than
    raising a division-by-zero error.
    """
    tokens = text.split()
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

In [89]:
# Replace the string class labels in 'Type' with integer codes. The fitted
# encoder is kept in `le` so the codes can be mapped back to label names.
le = LabelEncoder()
df_upsampled['Type'] = le.fit_transform(df_upsampled['Type'])

# Hand-crafted text statistics. All three are derived from df_upsampled itself
# so the values never depend on index alignment with a different frame
# (resampling with replacement leaves duplicate index labels in df_upsampled).
df_upsampled['length'] = df_upsampled['News/Comment'].apply(len)  # character count
df_upsampled['unique_words'] = df_upsampled['News/Comment'].apply(count_unique_words)
df_upsampled['ttr_ratio'] = df_upsampled['News/Comment'].apply(calculate_ttr)


In [90]:
# Separate the predictors from the encoded target.
x = df_upsampled.drop(columns=['Type'])
y = df_upsampled['Type']

# Stratified 80/20 hold-out split. random_state is fixed (matching the seed used
# by the other stochastic steps in this notebook) so that a fresh
# Restart & Run All reproduces the same split and therefore the same metrics.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [91]:
import re

In [92]:
# TF-IDF over unigrams and bigrams, capped at 2000 features. Terms that appear
# in more than 70% of documents or in fewer than 2 documents are ignored.
vectorizer = TfidfVectorizer(min_df=2, max_df=0.7, max_features=2000, ngram_range=(1, 2))

# The vocabulary is learned from the training text only; the test text is
# projected onto that vocabulary. Both matrices are densified for later stacking.
train_tfidf = vectorizer.fit_transform(x_train['News/Comment'])
test_tfidf = vectorizer.transform(x_test['News/Comment'])
X_train_vec = train_tfidf.toarray()
X_test_vec = test_tfidf.toarray()

In [93]:
# Append the three hand-crafted text statistics as extra columns to the right
# of the dense TF-IDF matrices.
stat_columns = ['length', 'unique_words', 'ttr_ratio']
additional_features_train = x_train[stat_columns].to_numpy()
additional_features_test = x_test[stat_columns].to_numpy()
X_train_combined = np.concatenate((X_train_vec, additional_features_train), axis=1)
X_test_combined = np.concatenate((X_test_vec, additional_features_test), axis=1)

In [94]:
# Oversample the training set only with SMOTE. k_neighbors=1 is presumably
# needed because the recorded class counts show a class with just two training
# rows — TODO confirm.
# NOTE(review): SMOTE runs here on unscaled features (scaling happens in a later
# cell), so the raw 'length' column may dominate neighbour distances — verify.
smote = SMOTE(k_neighbors=1, random_state=42)
X_train_res, y_train_res = smote.fit_resample(X=X_train_combined, y=y_train)

In [95]:
# Per-class row counts of the training labels before and after oversampling
# (np.bincount indexes by the integer class code).
for _caption, _labels in (
    ('Original dataset shape:', y_train),
    ('Resampled dataset shape:', y_train_res),
):
    print(_caption, np.bincount(_labels))

Original dataset shape: [902 110 184   2  30]
Resampled dataset shape: [902 902 902 902 902]


In [96]:
# Standardise every feature column using statistics learned from the
# (already oversampled) training matrix, then apply the same transform to the
# held-out features.
# NOTE(review): the scaler is fitted after SMOTE, so its mean/variance reflect
# the synthetic rows too — confirm this ordering is intended.
scaler = StandardScaler().fit(X_train_res)
X_train_res = scaler.transform(X_train_res)
X_test = scaler.transform(X_test_combined)

In [97]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters; the seed is fixed so the
# fitted trees are reproducible for a given training matrix.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train_res, y=y_train_res)

In [98]:
# Predict class codes for the held-out, scaled feature matrix.
y_pred = model.predict(X_test)

from sklearn.metrics import classification_report, f1_score, accuracy_score

# Some classes receive no predictions at all, which makes their precision (and
# F1) ill-defined. zero_division=0 reports those scores as 0 explicitly instead
# of relying on the default, which yields the same 0 but emits an
# UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, y_pred, zero_division=0))
print('Macro F1 Score:', f1_score(y_test, y_pred, average='macro', zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.99      0.85       226
           1       1.00      0.04      0.07        28
           2       0.00      0.00      0.00        46
           4       0.00      0.00      0.00         8

    accuracy                           0.73       308
   macro avg       0.43      0.26      0.23       308
weighted avg       0.63      0.73      0.63       308

Macro F1 Score: 0.22856213402732597
Accuracy: 0.7305194805194806
