In [None]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as px
import nltk
import os
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from gensim.models import FastText
from xgboost import XGBClassifier

# Reading the dataset

In [None]:
# Load the tab-separated training file. It has no header row, so the two
# columns are named explicitly: first the label flag, then the article text.
raw_training_df = pd.read_csv('./dataset/training_data.csv', sep='\t', header=None)
training_df = raw_training_df.set_axis(['fake_news_flag', 'news_content'], axis=1)

# Put the text column first and the label second
training_df = training_df.loc[:, ['news_content', 'fake_news_flag']]

training_df

In [None]:
# Count articles per class. value_counts() orders by frequency, whereas the
# count annotations and tick labels below are positional, so the counts are
# re-sorted by label value to give bars, annotations and tick labels one
# shared, data-independent ordering.
value_counts = training_df['fake_news_flag'].value_counts().sort_index()

# Display names for the flag values (0 is treated as fake news elsewhere in this notebook)
flag_names = {0: "Fake", 1: "Not Fake"}
tick_labels = [flag_names.get(flag, str(flag)) for flag in value_counts.index]

# Set the plot style
sns.set_style("whitegrid")

# Create a bar plot
plt.figure(figsize=(8, 6))
ax = sns.barplot(x=value_counts.index, y=value_counts.values, palette="coolwarm", hue=value_counts.index, legend=False)

# Add count labels on top of each bar (position i matches the sorted label order)
for i, value in enumerate(value_counts.values):
    plt.text(i, value + 0.5, str(value), ha='center', va='bottom', fontsize=15)

# Add title and labels
plt.title('Distribution of Fake News Flags', fontsize=16)
plt.xlabel('News Label', fontsize=12)
plt.ylabel('Count', fontsize=12)

# Customize tick labels; names are derived from the index instead of being hard-coded by position
plt.xticks(ticks=range(len(value_counts.index)), labels=tick_labels, fontsize=10)
plt.yticks(fontsize=10)

# Show the plot
plt.tight_layout()
plt.show()


# Data Cleaning

In [None]:
def fn_clean_text(texts):
    """
    Strip every text down to its alphabetic, non-stop-word tokens.

    The texts are streamed through the module-level `nlp` object's `pipe`
    method, so `nlp` must be defined before this function is called.

    Parameters
    ----------
    texts : iterable of str
        Raw documents to clean.

    Returns
    -------
    list of str
        One string per input document, holding the kept tokens joined by
        single spaces (an empty string when no token is kept).
    """
    cleaned_texts = []
    for parsed in nlp.pipe(texts):  # batch processing
        kept_tokens = [tok.text for tok in parsed if tok.is_alpha and not tok.is_stop]
        cleaned_texts.append(' '.join(kept_tokens))
    return cleaned_texts

# Load the spaCy model used by fn_clean_text
nlp = spacy.load("en_core_web_sm")

# Convert the case to lower
training_df['news_content'] = training_df['news_content'].str.lower()

# Apply the cleaning function
training_df['news_content_clean'] = fn_clean_text(training_df['news_content'])

# Count the number of words in each cleaned article.
# split() with no separator is used on purpose: ''.split(' ') yields [''],
# which would report 1 word for an article whose cleaned text is empty.
training_df['num_words'] = training_df['news_content_clean'].str.split().str.len()

# Display the result
training_df


# Data Split

In [None]:
# Features are the cleaned article text; the target is the 'fake_news_flag' column
X = training_df['news_content_clean']
y = training_df['fake_news_flag']

# 70/30 train-test split with a fixed seed; the row index is split alongside
# so each held-out row can be traced back to training_df
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X,
    y,
    training_df.index,
    test_size=0.3,
    random_state=42,
)
print("Training Size : {}".format(len(X_train)))
print("Test Size : {}".format(len(X_test)))

# Text Vectorization - Tf-Idf vs FastText

In [None]:
# TF-IDF Vectorization
# The vocabulary and IDF weights are learned from the training split only.
# The test split is projected with transform() so that its columns line up
# with the features the models are trained on; calling fit_transform() on the
# test data would rebuild the vocabulary and leak test statistics.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Whitespace-tokenise each split on its own so no test text reaches training
train_corpus = list(map(str.split, X_train))
test_corpus = list(map(str.split, X_test))

# Fit the FastText model on the training tokens only
fasttext_model = FastText(
    sentences=train_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    epochs=10,
)

# Document embeddings: one fixed-size vector per tokenised document
def get_document_embedding_fasttext(doc, model):
    """
    Embed a tokenised document as the mean of its word vectors.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document.
    model : object
        Must expose `wv` (supporting `token in wv` and `wv[token]`) and
        `vector_size`.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens for which
        `token in model.wv` holds; a zero vector of length
        `model.vector_size` when no token qualifies.

    Notes
    -----
    NOTE(review): whether an out-of-vocabulary token is actually skipped
    depends on how `model.wv` implements membership - confirm for the
    installed gensim version.
    """
    keyed_vectors = model.wv
    found_vectors = [keyed_vectors[token] for token in doc if token in keyed_vectors]
    if not found_vectors:
        return np.zeros(model.vector_size)
    return np.mean(found_vectors, axis=0)

# Embed every document of both splits with the FastText model fitted on the training corpus
X_train_embeddings, X_test_embeddings = (
    np.array([get_document_embedding_fasttext(tokens, fasttext_model) for tokens in corpus])
    for corpus in (train_corpus, test_corpus)
)

# Model Comparison

## Random Forest

In [None]:
# Random Forest on each text representation: (printed heading, train features, test features)
rf_feature_sets = [
    ("TF-IDF Accuracy:", X_train_tfidf, X_test_tfidf),
    ("FastText Accuracy:", X_train_embeddings, X_test_embeddings),
]

for heading, train_features, test_features in rf_feature_sets:
    # A fresh forest per representation, with identical settings and seed
    clf = RandomForestClassifier(n_estimators=200, random_state=42)
    clf.fit(train_features, y_train)

    # Evaluate the model on the held-out split
    y_pred = clf.predict(test_features)
    accuracy = accuracy_score(y_test, y_pred)
    print(heading, accuracy)

## XGBoost

In [None]:
# XGBoost on each text representation: (printed heading, train features, test features)
xgb_feature_sets = [
    ("TF-IDF Accuracy:", X_train_tfidf, X_test_tfidf),
    ("FastText Accuracy:", X_train_embeddings, X_test_embeddings),
]

for heading, train_features, test_features in xgb_feature_sets:
    # A fresh booster per representation, with identical hyperparameters and seed
    xgb = XGBClassifier(n_estimators=500, learning_rate=0.1, random_state=42, reg_lambda=1)
    xgb.fit(train_features, y_train)

    # Evaluate the model on the held-out split
    y_pred = xgb.predict(test_features)
    accuracy = accuracy_score(y_test, y_pred)
    print(heading, accuracy)

## LGBM Classifier

In [None]:
from lightgbm import LGBMClassifier

# LightGBM on each text representation: (printed heading, train features, test features)
lgb_feature_sets = [
    ("TF_IDF Accuracy:", X_train_tfidf, X_test_tfidf),
    ("FastText Accuracy:", X_train_embeddings, X_test_embeddings),
]

for heading, train_features, test_features in lgb_feature_sets:
    # A fresh model per representation, with identical hyperparameters and seed
    lgb = LGBMClassifier(n_estimators=500, learning_rate=0.1, random_state=42, reg_lambda=1, verbose=0)
    lgb.fit(train_features, y_train)

    # Evaluate the model on the held-out split
    y_pred = lgb.predict(test_features)
    accuracy = accuracy_score(y_test, y_pred)
    print(heading, accuracy)

# Model Classifier

In [None]:
# Final classical model: XGBoost trained on the FastText document embeddings
final_xgb_params = dict(
    n_estimators=500,
    learning_rate=0.1,
    reg_lambda=1,
    random_state=42,
)
xgb = XGBClassifier(**final_xgb_params)
xgb.fit(X_train_embeddings, y_train)

# Predictions for the held-out split, evaluated in the next cell
y_pred = xgb.predict(X_test_embeddings)

In [None]:
# Score the current predictions against the held-out labels
cf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)
accuracy = accuracy_score(y_test, y_pred)

# Text summary
print("Model Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Axis names for the heatmap, listed in the order Fake then Real
class_labels = ['Fake News', 'Real News']

# Confusion matrix heatmap (rows: true labels, columns: predictions)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()



In [None]:
# Collect the held-out texts with their true and predicted labels.
# reset_index() moves the row labels carried over from the split into an
# 'index' column, so each row remains traceable to its source row.
test_results = pd.DataFrame(
    {
        'news_content': X_test,
        'true_label': y_test,
        'predicted_label': y_pred,
    }
).reset_index()

In [None]:
# What kinds of emotions does fake news evoke?
# What type of information is generally faked?

# Hugging Face Classifier

In [None]:
from transformers import pipeline

# Pre-trained fake-news checkpoint; the same identifier is used for the model and its tokenizer
MODEL = "jaranohaal/distilbert-base-uncased-finetuned-fake-news"

# Note: this rebinds `clf`, which earlier cells used for the Random Forest models
clf = pipeline(task="text-classification", model=MODEL, tokenizer=MODEL)

In [None]:
# Predict using the pre-trained model.
# truncation=True is forwarded to the tokenizer so that articles longer than
# the model's maximum sequence length are cut to fit instead of making the
# forward pass fail on over-long inputs.
y_pred = clf(X_test.to_list(), truncation=True)

In [None]:
# Turn the pipeline output (one record per text, with 'label' and 'score') into a frame.
# The class id is read from the last character of the label string.
# NOTE(review): assumes labels end in the class digit (e.g. "LABEL_1") and that
# this digit uses the same coding as fake_news_flag - confirm for this model.
predictions = pd.DataFrame(y_pred)
predictions = predictions.assign(
    predicted_label=predictions['label'].map(lambda tag: int(tag[-1]))
).loc[:, ['predicted_label', 'score']]

# Held-out texts and true labels, re-indexed 0..n-1 so they line up row by row with the predictions
held_out = pd.DataFrame({'news_content': X_test, 'true_label': y_test}).reset_index(drop=True)

# Merge the results side by side
test_results = pd.concat([held_out, predictions], axis=1)

test_results

In [None]:
# Score the pre-trained model.
# Note: this rebinds y_test and y_pred to columns of test_results.
y_pred = test_results['predicted_label']
y_test = test_results['true_label']

cf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)
accuracy = accuracy_score(y_test, y_pred)

# Text summary
print("Model Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Axis names for the heatmap, listed in the order Fake then Real
class_labels = ['Fake News', 'Real News']

# Confusion matrix heatmap (rows: true labels, columns: predictions)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()



# Sentiment Analysis of Fake News

In [None]:
# Cleaned text of the articles flagged as fake (fake_news_flag == 0); the result is a Series
is_fake = training_df['fake_news_flag'] == 0
fake_news_df = training_df.loc[is_fake, 'news_content_clean']
fake_news_df

In [None]:
import tensorflow as tf

In [None]:
from transformers import pipeline

# Emotion classifier. top_k=None asks the pipeline for a score for every
# label; it replaces the deprecated return_all_scores=True argument.
classifier = pipeline("text-classification", model="cirimus/modernbert-large-go-emotions", top_k=None)

# Smoke-test sentence to check the classifier output format
sentences = ["I am not having a great day"]

model_outputs = classifier(sentences)
print(model_outputs[0])
# produces a list of dicts, one per label, each holding the label and its score
