In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from scipy.sparse import hstack
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout
from keras.optimizers import SGD, Adam, RMSprop
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.regularizers import l2
import keras_tuner as kt
import joblib

# Prepare Data

## Load Preprocessed Data

In [2]:
data = pd.read_csv('data/fulltext_preprocessed_data.csv')
data.head()

Unnamed: 0,full_text,full_tokens,full_text_lemmatized,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,"['house', 'aide', 'even', 'see', 'letter', 'ja...",house aide even see letter jason house aide ev...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...","['hillary', 'clinton', 'big', 'woman', 'campus...",hillary clinton big woman campus ever get feel...,0
2,Why the Truth Might Get You Fired Why the Trut...,"['truth', 'might', 'get', 'fired', 'truth', 'm...",truth might get fired truth might get fired oc...,1
3,15 Civilians Killed In Single US Airstrike Hav...,"['single', 'single', 'rate', 'american', 'high...",single single rate american higher engaged act...,1
4,Iranian woman jailed for fictional unpublished...,"['iranian', 'woman', 'fictional', 'unpublished...",iranian woman fictional unpublished story woma...,1


In [3]:
data.shape

(20546, 4)

In [4]:
style_df = pd.read_csv('data/style_features.csv').drop(columns='label')
style_df.head()

Unnamed: 0,average_word_length,average_sentence_length,vocabulary_richness,url_ratio,all_caps_ratio,exclamations_ratio,questions_ratio,digits_ratio,flesch_reading_ease_score,noun_ratio,verb_ratio,adjective_ratio,adverb_ratio,pronoun_ratio,proper_noun_ratio,punctuation_ratio
0,4.737278,25.131579,0.463905,0.0,0.011834,0.0,0.0,0.004387,48.159824,0.136095,0.126627,0.057988,0.057988,0.102959,0.16568,0.020738
1,4.747504,25.030303,0.543509,0.0,0.009986,0.0,0.000954,0.007393,54.114539,0.194009,0.119829,0.064194,0.052782,0.11127,0.10271,0.027427
2,4.934295,25.224138,0.469551,0.00641,0.00641,0.0,0.000519,0.002986,45.803364,0.196314,0.111378,0.085737,0.048878,0.081731,0.103365,0.028171
3,4.732852,23.222222,0.453069,0.001805,0.034296,0.0,0.0,0.009747,52.0866,0.209386,0.120939,0.075812,0.027076,0.050542,0.124549,0.018276
4,5.104294,37.6,0.595092,0.0,0.0,0.0,0.0,0.005848,27.212727,0.257669,0.153374,0.04908,0.01227,0.067485,0.092025,0.021442


In [5]:
style_df.shape

(20546, 16)

In [6]:
X = data['full_text_lemmatized'].astype(str)
y = data['label']

## Train-Test Split

In [7]:
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train_text, X_val_text, y_train, y_val = train_test_split(X_train_text, y_train, test_size=0.2, random_state=42, stratify=y_train)

In [8]:
X_train_style = style_df.loc[X_train_text.index].values
X_val_style = style_df.loc[X_val_text.index].values
X_test_style = style_df.loc[X_test_text.index].values

## TF-IDF Features

In [9]:
tfidf = joblib.load('artifacts/tfidf_vectorizer.pkl')

In [10]:
X_train_tfidf = tfidf.transform(X_train_text)
X_val_tfidf = tfidf.transform(X_val_text)
X_test_tfidf = tfidf.transform(X_test_text)

In [11]:
X_train_tfidf.shape, X_val_tfidf.shape, X_test_tfidf.shape

((13148, 50000), (3288, 50000), (4110, 50000))

# ANN

In [12]:
EPOCHS=500
BATCH_SIZE=128
PATIENCE_EARLYSTOP=10
PATIENCE_REDUCELR=5

## TF-IDF Only

In [13]:
model = Sequential()
model.add(Input(shape=(50000,)))
model.add(Dense(2048, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.3))
model.add(Dense(1024, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.3))
model.add(Dense(512, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.3))
model.add(Dense(256, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.001)))
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

In [14]:
estop = EarlyStopping(monitor='val_loss', mode='min',
                      min_delta=1e-5, patience=PATIENCE_EARLYSTOP,
                      restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=PATIENCE_REDUCELR,
                              min_delta=1e-3, min_lr=1e-5, verbose=1)
model.fit(X_train_tfidf, y_train, 
          validation_data=(X_val_tfidf, y_val),
          epochs=EPOCHS, batch_size=BATCH_SIZE,
          callbacks=[estop, reduce_lr], verbose=1)

Epoch 1/500
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 704ms/step - accuracy: 0.8993 - loss: 1.2001 - val_accuracy: 0.9526 - val_loss: 0.5630 - learning_rate: 0.0010
Epoch 2/500
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 717ms/step - accuracy: 0.9693 - loss: 0.4571 - val_accuracy: 0.9501 - val_loss: 0.4942 - learning_rate: 0.0010
Epoch 3/500
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 726ms/step - accuracy: 0.9721 - loss: 0.4081 - val_accuracy: 0.9419 - val_loss: 0.4732 - learning_rate: 0.0010
Epoch 4/500
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 721ms/step - accuracy: 0.9806 - loss: 0.3249 - val_accuracy: 0.9392 - val_loss: 0.4404 - learning_rate: 0.0010
Epoch 5/500
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 710ms/step - accuracy: 0.9869 - loss: 0.2653 - val_accuracy: 0.9492 - val_loss: 0.3671 - learning_rate: 0.0010
Epoch 6/500
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━

KeyboardInterrupt: 

In [None]:
test_loss, test_accuracy = model.evaluate(X_test_tfidf, y_test)
print("Test accuracy:", test_accuracy)
print("Test loss:", test_loss)

In [None]:
y_pred = np.array([[1] if pred>=0.5 else [0] for pred in model.predict(X_test_tfidf)])

In [None]:
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(3,3))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['real','fake'])
disp.plot(cmap='Blues', colorbar=False, ax=ax)
plt.title(f'Confusion matrix for ANN Model using only TF-IDF Features')
plt.show()

## TF-IDF + Style Features

In [None]:
scaler = MinMaxScaler()
X_train_style_scaled = scaler.fit_transform(X_train_style)
X_val_style_scaled = scaler.transform(X_val_style)
X_test_style_scaled = scaler.transform(X_test_style)

In [None]:
# combining sparse tfidf with dense scaled style features
X_train_combined_scaled = hstack([X_train_tfidf, X_train_style_scaled])
X_val_combined_scaled = hstack([X_val_tfidf, X_val_style_scaled])
X_test_combined_scaled = hstack([X_test_tfidf, X_test_style_scaled])

In [None]:
X_train_combined_scaled.shape, X_val_combined_scaled.shape, X_test_combined_scaled.shape

In [None]:
model2 = Sequential()
model2.add(Input(shape=(50016,)))
model2.add(Dense(2048, activation='relu', kernel_regularizer=l2(0.001)))
model2.add(Dropout(0.3))
model2.add(Dense(1024, activation='relu', kernel_regularizer=l2(0.001)))
model2.add(Dropout(0.3))
model2.add(Dense(512, activation='relu', kernel_regularizer=l2(0.001)))
model2.add(Dropout(0.3))
model2.add(Dense(256, activation='relu', kernel_regularizer=l2(0.001)))
model2.add(Dropout(0.3))
model2.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.001)))
model2.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy',
              metrics=['accuracy'])
model2.summary()

In [None]:
estop = EarlyStopping(monitor='val_loss', mode='min',
                      min_delta=1e-5, patience=PATIENCE_EARLYSTOP,
                      restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=PATIENCE_REDUCELR,
                              min_delta=1e-3, min_lr=1e-5, verbose=1)
model2.fit(X_train_tfidf, y_train, 
          validation_data=(X_val_tfidf, y_val),
          epochs=EPOCHS, batch_size=BATCH_SIZE,
          callbacks=[estop, reduce_lr], verbose=1)

In [None]:
test_loss, test_accuracy = model2.evaluate(X_test_tfidf, y_test)
print("Test accuracy:", test_accuracy)
print("Test loss:", test_loss)

In [None]:
y_pred2 = np.array([[1] if pred>=0.5 else [0] for pred in model2.predict(X_test_tfidf)])

In [None]:
print(classification_report(y_test, y_pred2))
cm = confusion_matrix(y_test, y_pred2)
fig, ax = plt.subplots(figsize=(3,3))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['real','fake'])
disp.plot(cmap='Blues', colorbar=False, ax=ax)
plt.title(f'Confusion matrix for ANN Model using only TF-IDF Features')
plt.show()

## Save Best Model

In [21]:
max_accuracy = float('-inf')
best_model_name = None
for name, model_info in models_trained.items():
    if model_info['test_accuracy'] > max_accuracy:
        max_accuracy = model_info['test_accuracy']
        best_model_name = name
print(f"Best baseline model: {best_model_name}\nBest test accuracy: {max_accuracy}")

Best baseline model: Linear SVM Combined
Best test accuracy: 0.9652068126520681


* As understood from the distribution plots for the stylometric features, they don't contribute significantly in the classification of news, but they do provide minute information which have helped all the 3 models perform slightly better.
* With both TF-IDF only and combined datasets, Linear SVC performed the best in terms of accuracy and average precision, recall, and f1-score.
* The improvement observed after including the scaled stylometric features were almost negligible for logistic regression and linear svc (0.1% and 0.2% increase, respectively), but slightly appreciable for naive bayes (2.5% increase). Naive bayes was also better at detecting real news than the other 2 models, but not so at identifying the fake ones.

In [22]:
# joblib.dump(models_trained[best_model_name]['fitted_model'], 'artifacts/fake_news_baseline_classifier.pkl')

['artifacts/fake_news_baseline_classifier.pkl']