In [1]:
# Chunk 1: Environment Setup and Data Loading

# --- 1.1: Install required libraries (uncomment if running first time) ---
# !pip install pandas numpy sklearn matplotlib seaborn nltk tqdm

# --- 1.2: Imports ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
import nltk
from sklearn.model_selection import train_test_split

# NLTK setup
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# --- 1.3: Load Dataset ---
# Replace with your actual path if local file is available
try:
    df = pd.read_csv('final_data.csv')
except FileNotFoundError:
    # Try loading from an online source (fallback)
    df = pd.read_csv("https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/welfake.csv")

# --- 1.4: Quick View of Dataset ---
print("Shape of dataset:", df.shape)
df.head()


[nltk_data] Downloading package punkt to /home/lab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lab/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lab/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Shape of dataset: (99531, 6)


Unnamed: 0,title,text,Label,text_length,word_count,title_length
0,donald trump sends out embarrassing new year e...,donald trump just could not wish all american ...,0,2283,385,72
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...,0,1673,248,68
2,sheriff david clarke becomes internet joke for...,friday wa revealed that former milwaukee sheri...,0,2643,422,78
3,trump obsessed even ha obama name coded into h...,christmas day donald trump announced that woul...,0,2095,338,62
4,pope francis just called out donald trump duri...,pope francis used his annual christmas day mes...,0,1990,332,69


In [2]:
# Chunk 2: Data Preprocessing

import re
import string
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm



# --- 2.1: Text Cleaning Function ---
def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# --- 2.2: Stopword Removal and Lemmatization ---
# Define custom list of important words to preserve
important_words = {
    'not', 'no', 'never', 'nothing', 'nowhere', 'none', 'nobody',
    'would', 'could', 'should', 'will', 'was', 'is', 'are',
    'you', 'we', 'he', 'they', 'your', 'his', 'her', 'their'
}

# Load NLTK stopwords and remove important words
stop_words = set(stopwords.words('english')) - important_words
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Clean
    text = clean_text(text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and lemmatize
    processed = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return ' '.join(processed)

# --- 2.3: Apply Preprocessing to the Dataset ---

# Fill missing text/title values with empty strings (handles NaN)
if {'title', 'text'}.issubset(df.columns):
    df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')
else:
    df['content'] = df['text'].fillna('')

# Drop duplicate content rows if any
df.drop_duplicates(subset='content', inplace=True)

# Show a sample before preprocessing
print("Before preprocessing:")
print(df['content'].iloc[0])

# Add tqdm progress bar for preprocessing
tqdm.pandas()
df['processed'] = df['content'].progress_apply(preprocess_text)

# Show the result
print("\nAfter preprocessing:")
print(df['processed'].iloc[0])



Before preprocessing:
donald trump sends out embarrassing new year eve message this disturbing donald trump just could not wish all american happy new year and leave that instead had give shout out his enemy hater and the very dishonest fake news medium the former reality show star had just one job and could not our country rapidly grows stronger and smarter want wish all friend supporter enemy hater and even the very dishonest fake news medium happy and healthy new year president angry pant tweeted 2018 will great year for america our country rapidly grows stronger and smarter want wish all friend supporter enemy hater and even the very dishonest fake news medium happy and healthy new year 2018 will great year for america donald trump december 2017trump tweet went down about welll you expect what kind president sends new year greeting like this despicable petty infantile gibberish only trump his lack decency will not even allow him rise above the gutter long enough wish the american c

100%|██████████| 99531/99531 [01:52<00:00, 887.34it/s] 


After preprocessing:
donald trump sends embarrassing new year eve message disturbing donald trump could not wish american happy new year leave instead give shout his enemy hater dishonest fake news medium former reality show star one job could not country rapidly grows stronger smarter want wish friend supporter enemy hater even dishonest fake news medium happy healthy new year president angry pant tweeted will great year america country rapidly grows stronger smarter want wish friend supporter enemy hater even dishonest fake news medium happy healthy new year will great year america donald trump december trump tweet went welll you expect kind president sends new year greeting like despicable petty infantile gibberish trump his lack decency will not even allow rise gutter long enough wish american citizen happy new year bishop talbert swan december no one like you calvin december your impeachment would make great year america will also accept regaining control congress miranda yaver d




In [13]:
# Chunk 3: Linguistic Feature Extraction (LFS)

import textstat
import spacy
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")

# --- 3.1: Function to extract linguistic features ---
def extract_linguistic_features(text):
    doc = nlp(text)
    num_words = len([token for token in doc if token.is_alpha])
    num_chars = sum(len(token.text) for token in doc if token.is_alpha)
    avg_word_length = np.mean([len(token.text) for token in doc if token.is_alpha]) if num_words > 0 else 0
    num_sentences = len(list(doc.sents))
    num_nouns = len([token for token in doc if token.pos_ == "NOUN"])
    num_verbs = len([token for token in doc if token.pos_ == "VERB"])
    num_adj = len([token for token in doc if token.pos_ == "ADJ"])
    num_adv = len([token for token in doc if token.pos_ == "ADV"])
    num_entities = len(doc.ents)
    flesch_read = textstat.flesch_reading_ease(text)

    return [
        num_words,
        num_chars,
        avg_word_length,
        num_sentences,
        num_nouns,
        num_verbs,
        num_adj,
        num_adv,
        num_entities,
        flesch_read
    ]

# --- 3.2: Extract features for all rows ---
linguistic_features = []
for text in tqdm(df['processed'], desc="Extracting LFS"):
    linguistic_features.append(extract_linguistic_features(text))

linguistic_features = np.array(linguistic_features)

# --- 3.3: Convert to DataFrame ---
lf_columns = [
    'word_count', 'char_count', 'avg_word_length', 'sentence_count',
    'noun_count', 'verb_count', 'adj_count', 'adv_count', 'ner_count', 'flesch_readability'
]
lf_df = pd.DataFrame(linguistic_features, columns=lf_columns)



Extracting LFS: 100%|██████████| 99531/99531 [37:46<00:00, 43.92it/s]  


KeyError: 'label'

In [14]:
# --- 3.4: Feature selection using Pearson correlation (via ANOVA F-test as proxy) ---
y = df['Label'].astype(int)  # Ensure labels are integers
selector = SelectKBest(score_func=f_classif, k='all')  # We'll filter manually
selector.fit(lf_df, y)
scores = selector.scores_

# Attach scores to features and sort
feature_scores = pd.DataFrame({'feature': lf_columns, 'score': scores})
feature_scores.sort_values(by='score', ascending=False, inplace=True)

print("Feature correlation scores:")
print(feature_scores)

# --- 3.5: Create 3 odd LFS sets for voting ---
sorted_features = feature_scores['feature'].tolist()

LFS1 = lf_df[[sorted_features[i] for i in range(0, len(sorted_features), 3)]]
LFS2 = lf_df[[sorted_features[i] for i in range(1, len(sorted_features), 3)]]
LFS3 = lf_df[[sorted_features[i] for i in range(2, len(sorted_features), 3)]]

# Normalize feature sets
scaler = StandardScaler()
LFS1_scaled = pd.DataFrame(scaler.fit_transform(LFS1), columns=LFS1.columns)
LFS2_scaled = pd.DataFrame(scaler.fit_transform(LFS2), columns=LFS2.columns)
LFS3_scaled = pd.DataFrame(scaler.fit_transform(LFS3), columns=LFS3.columns)

# Save for later use
df_lfs1 = LFS1_scaled
df_lfs2 = LFS2_scaled
df_lfs3 = LFS3_scaled

print("\nSample LFS1:")
print(df_lfs1.head())


Feature correlation scores:
              feature       score
5          verb_count  566.117891
8           ner_count  512.139475
0          word_count  459.640822
9  flesch_readability  449.838762
4          noun_count  426.845024
1          char_count  411.270524
7           adv_count  249.409724
6           adj_count  248.462142
3      sentence_count  114.577411
2     avg_word_length   16.815967

Sample LFS1:
   verb_count  flesch_readability  adv_count  avg_word_length
0   -0.195475            0.135675   0.189242        -1.993353
1   -0.437571            0.332879  -0.357570         0.785218
2    0.046620           -0.043916   0.006971        -1.382977
3   -0.178183            0.193634   0.249998        -1.143381
4   -0.057135            0.290332  -0.296813        -1.221956


In [15]:
# Chunk 4: Word Embedding + Generate P1 and P2

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# --- 4.1: Prepare train/test split ---
X = df['processed']
y = df['Label'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 4.2: Apply CountVectorizer (CV) ---
cv_vectorizer = CountVectorizer(max_features=5000, ngram_range=(1,2))
X_train_cv = cv_vectorizer.fit_transform(X_train)
X_test_cv = cv_vectorizer.transform(X_test)

# --- 4.3: Apply TF-IDF ---
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# --- 4.4: Train baseline classifier (Logistic Regression) ---
lr_cv = LogisticRegression(max_iter=1000)
lr_tfidf = LogisticRegression(max_iter=1000)

lr_cv.fit(X_train_cv, y_train)
lr_tfidf.fit(X_train_tfidf, y_train)

# --- 4.5: Evaluate accuracy ---
y_pred_cv = lr_cv.predict(X_test_cv)
y_pred_tfidf = lr_tfidf.predict(X_test_tfidf)

acc_cv = accuracy_score(y_test, y_pred_cv)
acc_tfidf = accuracy_score(y_test, y_pred_tfidf)

print(f"CountVectorizer Accuracy (P2): {acc_cv:.4f}")
print(f"TF-IDF Accuracy (P1): {acc_tfidf:.4f}")

# --- 4.6: Decide better embedding ---
best_embedding = 'cv' if acc_cv > acc_tfidf else 'tfidf'
print(f"Better Word Embedding: {best_embedding.upper()}")

# Save P1 and P2
P1 = y_pred_tfidf
P2 = y_pred_cv


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=300).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


CountVectorizer Accuracy (P2): 0.5270
TF-IDF Accuracy (P1): 0.5460
Better Word Embedding: TFIDF


In [None]:
# Chunk 5: Combine Best WE with LFS1, LFS2, LFS3 → Generate P3, P4, P5

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def get_best_model(X_train, X_test, y_train, y_test):
    models = {
        'SVM': SVC(kernel='linear', C=100, gamma=0.0001, probability=True),
        'NB': MultinomialNB(alpha=1.0, fit_prior=True),
        'KNN': KNeighborsClassifier(n_neighbors=2, weights='uniform'),
        'DT': DecisionTreeClassifier(max_features=None, criterion='gini', ccp_alpha=0.02),
        'Bagging': BaggingClassifier(n_estimators=100, bootstrap=True),
        'AdaBoost': AdaBoostClassifier(n_estimators=50, learning_rate=1)
    }

    best_acc = 0
    best_pred = None
    best_name = None
    for name, model in models.items():
        # Use StandardScaler if KNN
        if name == 'KNN':
            pipeline = make_pipeline(StandardScaler(with_mean=False), model)
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        if acc > best_acc:
            best_acc = acc
            best_pred = y_pred
            best_name = name

    return best_pred, best_name, best_acc


# 5.1: Re-split linguistic features to match text indices
X_train_text, X_test_text, lfs_train, lfs_test = train_test_split(
    df['processed'], lf_df, test_size=0.2, random_state=42)

# Use only the best embedding
if best_embedding == 'cv':
    X_embed_train = cv_vectorizer.transform(X_train_text)
    X_embed_test = cv_vectorizer.transform(X_test_text)
else:
    X_embed_train = tfidf_vectorizer.transform(X_train_text)
    X_embed_test = tfidf_vectorizer.transform(X_test_text)

# Prepare predictions P3, P4, P5
P = []
best_models = []

for i, lfs in enumerate([df_lfs1, df_lfs2, df_lfs3]):
    # Align split with current LFS
    lfs_train_part, lfs_test_part = train_test_split(lfs, test_size=0.2, random_state=42)
    
    # Combine LFS with embedding
    from scipy.sparse import hstack
    X_comb_train = hstack([X_embed_train, lfs_train_part])
    X_comb_test = hstack([X_embed_test, lfs_test_part])

    pred, best_model_name, best_model_acc = get_best_model(X_comb_train, X_comb_test, y_train, y_test)
    print(f"LFS{i+1} → Best Model: {best_model_name}, Accuracy: {best_model_acc:.4f}")
    P.append(pred)
    best_models.append(best_model_name)

P3, P4, P5 = P


# Chunk 6: Hard Voting on P3, P4, P5 → Output P6

import numpy as np
from scipy.stats import mode

# Stack predictions from LFS1, LFS2, LFS3
stacked_preds = np.vstack((P3, P4, P5)).T

# Apply hard voting (mode)
P6 = mode(stacked_preds, axis=1).mode.flatten()

# Evaluation of LFS-Enabled Voting (P6)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("Evaluation of LFS-Enabled WE Voting Output (P6):")
print(f"Accuracy:  {accuracy_score(y_test, P6):.4f}")
print(f"Precision: {precision_score(y_test, P6):.4f}")
print(f"Recall:    {recall_score(y_test, P6):.4f}")
print(f"F1 Score:  {f1_score(y_test, P6):.4f}")


In [None]:
# Chunk 7: Final Hard Voting across P1 (TF-IDF), P2 (CV), and P6 (LFS Voting)

# Stack P1, P2, P6 predictions
final_stacked = np.vstack((P1, P2, P6)).T

# Apply final hard voting (mode)
final_prediction = mode(final_stacked, axis=1).mode.flatten()

# Final Evaluation of WELFake Model
print("Final Evaluation of WELFake (P1 + P2 + P6 Voting):")
print(f"Accuracy:  {accuracy_score(y_test, final_prediction):.4f}")
print(f"Precision: {precision_score(y_test, final_prediction):.4f}")
print(f"Recall:    {recall_score(y_test, final_prediction):.4f}")
print(f"F1 Score:  {f1_score(y_test, final_prediction):.4f}")


In [None]:
# Chunk 8.1: Download GloVe embeddings (if not already downloaded)
import os
import zipfile
import requests

glove_dir = "glove.6B"
glove_zip = "glove.6B.zip"
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"

if not os.path.exists(glove_dir):
    print("Downloading GloVe embeddings...")
    r = requests.get(glove_url)
    with open(glove_zip, "wb") as f:
        f.write(r.content)
    with zipfile.ZipFile(glove_zip, "r") as zip_ref:
        zip_ref.extractall(glove_dir)
    print("GloVe download complete.")
else:
    print("GloVe already exists.")

# Chunk 8.2: Prepare GloVe embedding matrix
embedding_index = {}
with open(f"{glove_dir}/glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = coeffs

embedding_dim = 100
word_index = tokenizer_cv.word_index
num_words = min(5000, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i < num_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


Downloading GloVe embeddings...


SSLError: HTTPSConnectionPool(host='downloads.cs.stanford.edu', port=443): Max retries exceeded with url: /nlp/data/glove.6B.zip (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:997)')))

In [None]:
# Chunk 8.3: Define CNN architecture for text classification
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalAveragePooling1D, Dense, Concatenate
from tensorflow.keras.optimizers import Adam

input_layer = Input(shape=(max_seq_len,))
embedding_layer = Embedding(input_dim=num_words,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_seq_len,
                            trainable=False)(input_layer)

# Three Conv1D layers with kernel sizes 2, 3, 4
conv_2 = Conv1D(32, 2, activation="relu")(embedding_layer)
conv_3 = Conv1D(32, 3, activation="relu")(embedding_layer)
conv_4 = Conv1D(32, 4, activation="relu")(embedding_layer)

# Global average pooling
pool_2 = GlobalAveragePooling1D()(conv_2)
pool_3 = GlobalAveragePooling1D()(conv_3)
pool_4 = GlobalAveragePooling1D()(conv_4)

# Concatenate all pooled outputs
concat = Concatenate()([pool_2, pool_3, pool_4])

# Final Dense layers
dense1 = Dense(64, activation="relu")(concat)
output = Dense(1, activation="sigmoid")(dense1)

cnn_model = Model(inputs=input_layer, outputs=output)
cnn_model.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])
cnn_model.summary()


# Chunk 8.4: Train CNN model
history_cnn = cnn_model.fit(X_train_seq, y_train,
                            epochs=5,
                            batch_size=32,
                            validation_data=(X_test_seq, y_test))


# Chunk 8.5: Evaluate CNN performance
y_pred_cnn = (cnn_model.predict(X_test_seq) > 0.5).astype(int)

print("CNN Evaluation Metrics:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_cnn):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_cnn):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred_cnn):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_cnn):.4f}")


In [None]:
# Chunk 9.2: Load BERT tokenizer and encode dataset
from transformers import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def bert_encode(texts, tokenizer, max_len=128):
    return tokenizer(
        list(texts),
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="tf"
    )

X_train_bert = bert_encode(X_train, bert_tokenizer)
X_test_bert = bert_encode(X_test, bert_tokenizer)


In [None]:
# Chunk 9.3: Build BERT model with 5 dense and 3 dropout layers
from transformers import TFBertModel
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input

bert_model = TFBertModel.from_pretrained("bert-base-uncased")

input_ids = Input(shape=(128,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(128,), dtype=tf.int32, name="attention_mask")

bert_output = bert_model(input_ids, attention_mask=attention_mask)[1]  # [CLS] token

x = Dense(256, activation="relu")(bert_output)
x = Dropout(0.3)(x)
x = Dense(128, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(64, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(32, activation="relu")(x)
x = Dropout(0.2)(x)
output = Dense(1, activation="sigmoid")(x)

final_bert_model = Model(inputs=[input_ids, attention_mask], outputs=output)

final_bert_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

final_bert_model.summary()


# Chunk 9.4: Train BERT model
history_bert = final_bert_model.fit(
    x={"input_ids": X_train_bert["input_ids"], "attention_mask": X_train_bert["attention_mask"]},
    y=y_train,
    validation_data=(
        {"input_ids": X_test_bert["input_ids"], "attention_mask": X_test_bert["attention_mask"]},
        y_test,
    ),
    epochs=3,
    batch_size=16
)

# Chunk 9.5: Evaluate BERT model
y_pred_bert_probs = final_bert_model.predict(
    {"input_ids": X_test_bert["input_ids"], "attention_mask": X_test_bert["attention_mask"]}
)
y_pred_bert = (y_pred_bert_probs > 0.5).astype(int)

print("BERT Evaluation Metrics:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_bert):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_bert):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred_bert):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_bert):.4f}")


  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Chunk 10.1: Store TF-IDF, CV, and LFS-Voting (P1, P2, P6)

# P1: TF-IDF-based best model prediction
P1 = y_pred_tfidf

# P2: CV-based best model prediction
P2 = y_pred_cv

# P6: Voting over LFS1+CV, LFS2+CV, LFS3+CV (already done earlier)
P6 = y_pred_vote_lfs


# Chunk 10.2: Final voting among P1, P2, and P6

import numpy as np
from scipy.stats import mode

# Stack predictions into a (n_samples, 3) matrix
final_preds_matrix = np.vstack((P1, P2, P6)).T

# Take mode across axis=1 (row-wise) for hard voting
final_prediction = mode(final_preds_matrix, axis=1)[0].flatten()


# Chunk 10.3: Evaluate Final WELFake Voting Model
print("Final WELFake Prediction Performance:")
print(f"Accuracy:  {accuracy_score(y_test, final_prediction):.4f}")
print(f"Precision: {precision_score(y_test, final_prediction):.4f}")
print(f"Recall:    {recall_score(y_test, final_prediction):.4f}")
print(f"F1 Score:  {f1_score(y_test, final_prediction):.4f}")
