In [4]:
# Chunk 1: Environment Setup and Data Loading

# --- 1.1: Install required libraries (uncomment if running first time) ---
# !pip install pandas numpy matplotlib seaborn nltk tqdm tensorflow scikit-learn spacy 
!pip install transformers datasets textstat textblob

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting textstat
  Downloading textstat-0.7.7-py3-none-any.whl.metadata (15 kB)
Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64

In [2]:


# --- 1.2: Imports ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
import nltk
from sklearn.model_selection import train_test_split

In [3]:
# --- 1.3: Load Dataset ---
# Adjust the path below if the CSV does not sit next to the notebook.
dataset_path = 'final_data.csv'
df = pd.read_csv(dataset_path)

# --- 1.4: Quick View of Dataset ---
# Report the frame dimensions, then show the first rows as the cell output.
print("Shape of dataset:", df.shape)
df.head()


Shape of dataset: (99531, 6)


Unnamed: 0,title,text,Label,text_length,word_count,title_length
0,donald trump sends out embarrassing new year e...,donald trump just could not wish all american ...,0,2283,385,72
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...,0,1673,248,68
2,sheriff david clarke becomes internet joke for...,friday wa revealed that former milwaukee sheri...,0,2643,422,78
3,trump obsessed even ha obama name coded into h...,christmas day donald trump announced that woul...,0,2095,338,62
4,pope francis just called out donald trump duri...,pope francis used his annual christmas day mes...,0,1990,332,69


In [4]:
# Chunk 2: Data Preprocessing

import re
import string
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm



# --- 2.1: Text Cleaning Function ---
def clean_text(text):
    """Normalise a raw string for tokenisation.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is deleted (no space is inserted in its place), and runs of
    whitespace are collapsed to one space with the ends trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    # Delete punctuation, digits and any other non-letter symbol.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    # Squeeze tabs/newlines/multiple blanks into single spaces.
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.strip()

# --- 2.2: Stopword Removal and Lemmatization ---
# The stop-word list, the WordNet lemmatizer and nltk.word_tokenize all read
# NLTK data packages that are not bundled with the library. Fetch them up
# front so the notebook also runs on a fresh environment. The tokenizer and
# WordNet resource names are requested defensively because they have varied
# between NLTK releases (TODO confirm which ones the installed version needs).
for _nltk_resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=True)

# Define custom list of important words to preserve
# (negations, modals/auxiliaries and pronouns are kept in the text instead of
# being stripped as stop words)
important_words = {
    'not', 'no', 'never', 'nothing', 'nowhere', 'none', 'nobody',
    'would', 'could', 'should', 'will', 'was', 'is', 'are',
    'you', 'we', 'he', 'they', 'your', 'his', 'her', 'their'
}

# Load NLTK stopwords and remove important words
stop_words = set(stopwords.words('english')) - important_words
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean, tokenise, filter and lemmatise one document.

    The text is passed through ``clean_text``, split with
    ``nltk.word_tokenize``, tokens found in the module-level ``stop_words``
    set are discarded, and each remaining token is lemmatised with the
    module-level ``lemmatizer``.

    Args:
        text: Raw document string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    kept_lemmas = []
    for token in nltk.word_tokenize(clean_text(text)):
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_lemmas)

# --- 2.3: Apply Preprocessing to the Dataset ---

# Display shape before merging columns
print("Shape before merging columns:")
print(df.shape)

# Merge headline and body into one `content` field; missing values are
# replaced by empty strings so the concatenation never yields NaN.
if {'title', 'text'}.issubset(df.columns):
    df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')
else:
    df['content'] = df['text'].fillna('')

# Display shape after merging columns
print("\nShape after merging columns :")
print(df.shape)

# Remove null values and values with less than 20 characters
df = df.dropna(subset=['content'])
df = df[df['content'].str.len() >= 20]
# Display shape after removing null and short values
print("\nShape after removing null and short values :")
print(df.shape)

# Drop duplicate content rows if any.
# Done without `inplace=True`: at this point `df` is the result of a boolean
# filter, and mutating such a frame in place can raise pandas'
# SettingWithCopyWarning. The index is rebuilt as 0..n-1 so that frames
# created later from plain lists/arrays (default RangeIndex) line up with
# `df` row-for-row by label as well as by position.
df = df.drop_duplicates(subset='content').reset_index(drop=True)

# Display shape after removing duplicates
print("\nShape after removing duplicates :")
print(df.shape)


# Show a sample before preprocessing
print("Before preprocessing:")
print(df['content'].iloc[0])

# Add tqdm progress bar for preprocessing
tqdm.pandas()
df['processed'] = df['content'].progress_apply(preprocess_text)

# Show the result
print("\nAfter preprocessing:")
print(df['processed'].iloc[0])



Shape before merging columns:
(99531, 6)

Shape after merging columns :
(99531, 7)

Shape after removing null and short values :
(99531, 7)

Shape after removing duplicates :
(61450, 7)
Before preprocessing:
donald trump sends out embarrassing new year eve message this disturbing donald trump just could not wish all american happy new year and leave that instead had give shout out his enemy hater and the very dishonest fake news medium the former reality show star had just one job and could not our country rapidly grows stronger and smarter want wish all friend supporter enemy hater and even the very dishonest fake news medium happy and healthy new year president angry pant tweeted 2018 will great year for america our country rapidly grows stronger and smarter want wish all friend supporter enemy hater and even the very dishonest fake news medium happy and healthy new year 2018 will great year for america donald trump december 2017trump tweet went down about welll you expect what kind 

100%|██████████| 61450/61450 [01:19<00:00, 768.73it/s] 


After preprocessing:
donald trump sends embarrassing new year eve message disturbing donald trump could not wish american happy new year leave instead give shout his enemy hater dishonest fake news medium former reality show star one job could not country rapidly grows stronger smarter want wish friend supporter enemy hater even dishonest fake news medium happy healthy new year president angry pant tweeted will great year america country rapidly grows stronger smarter want wish friend supporter enemy hater even dishonest fake news medium happy healthy new year will great year america donald trump december trump tweet went welll you expect kind president sends new year greeting like despicable petty infantile gibberish trump his lack decency will not even allow rise gutter long enough wish american citizen happy new year bishop talbert swan december no one like you calvin december your impeachment would make great year america will also accept regaining control congress miranda yaver d




In [5]:
# Tally how many rows carry each class label (0 and 1)
label_column = df['Label']
label_0_count = int((label_column == 0).sum())
label_1_count = int((label_column == 1).sum())

print(f"Count of Label 0: {label_0_count}")
print(f"Count of Label 1: {label_1_count}")

Count of Label 0: 30642
Count of Label 1: 30808


In [7]:
import spacy

# Fetch the small English pipeline only when it is not installed yet.
# Downloading unconditionally re-installs the package on every run and
# repeats the "restart to reload dependencies" notice shown in the output.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [7]:
import pandas as pd
import numpy as np
import spacy
import textstat
from tqdm import tqdm
from textblob import TextBlob
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_classif
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_selection import SelectKBest
import re

# Load spaCy model
# `nlp` is the module-level pipeline that extract_linguistic_features calls
# for part-of-speech tags, sentence boundaries and document vectors.
nlp = spacy.load("en_core_web_sm")

# --- Full Feature Extraction Function ---
def extract_linguistic_features(row):
    """Compute the linguistic feature vector for one article.

    Args:
        row: Mapping-like record (e.g. a pandas Series produced by
            ``DataFrame.apply(axis=1)``) with the preprocessed article under
            ``'processed'`` and, optionally, its headline under ``'title'``.

    Returns:
        list: 20 values, in this order: special-character count, determiner
        count, all-uppercase token count, short-sentence count (<= 5 words),
        long-sentence count (>= 20 words), Gunning fog, SMOG, ARI, polarity,
        title/body cosine similarity, subjectivity, syllable count, word
        count, (adjectives + adverbs) / words, words per sentence, article
        count, verb count, sentence count, adjective count, adverb count.

    NOTE(review): the features are measured on ``row['processed']``. That text
    has been lower-cased and stripped of non-letters by ``clean_text`` and
    filtered against ``stop_words``, so the special-character and uppercase
    counts (and the article count, if a/an/the are in the stop-word list) come
    out as zero, as the all-0.0 columns in the LFS1 preview suggest. Confirm
    whether these should be measured on the unprocessed text instead.
    """
    text = row['processed']
    title = row['title'] if 'title' in row else ""
    # A missing headline reaches us as NaN/None. NaN is truthy, so without
    # this guard it would pass the `if title:` check and be handed to spaCy.
    if not isinstance(title, str):
        title = ""

    doc = nlp(text)

    # One pass over the tokens gathers every token-level counter.
    pos_counts = {}
    num_words = 0
    num_caps = 0
    num_articles = 0
    for token in doc:
        pos_counts[token.pos_] = pos_counts.get(token.pos_, 0) + 1
        if token.is_alpha:
            num_words += 1
        if token.text.isupper():
            num_caps += 1
        if token.text.lower() in {"a", "an", "the"}:
            num_articles += 1

    num_verbs = pos_counts.get("VERB", 0)
    num_adj = pos_counts.get("ADJ", 0)
    num_adv = pos_counts.get("ADV", 0)
    num_dets = pos_counts.get("DET", 0)

    num_special_chars = len(re.findall(r'[^a-zA-Z0-9\s]', text))

    # Alphabetic-token count per sentence; also yields the sentence count.
    sent_lengths = [sum(1 for token in sent if token.is_alpha) for sent in doc.sents]
    num_sentences = len(sent_lengths)
    num_short_sent = sum(1 for length in sent_lengths if length <= 5)
    num_long_sent = sum(1 for length in sent_lengths if length >= 20)

    # Readability indices (only the ones that are actually returned).
    gunning_fog = textstat.gunning_fog(text)
    smog = textstat.smog_index(text)
    ari = textstat.automated_readability_index(text)
    syllables = textstat.syllable_count(text)

    sentiment = TextBlob(text).sentiment
    polarity = sentiment.polarity
    subjectivity = sentiment.subjectivity

    # Headline/body similarity; 0.0 when there is no headline or no vector.
    sim = 0.0
    if title:
        title_vec = nlp(title).vector
        text_vec = doc.vector
        if len(title_vec) > 0 and len(text_vec) > 0:
            sim = cosine_similarity([title_vec], [text_vec])[0][0]

    adj_adv_rate = (num_adj + num_adv) / num_words if num_words else 0
    words_per_sent = num_words / num_sentences if num_sentences else 0

    return [
        num_special_chars, num_dets, num_caps, num_short_sent, num_long_sent,
        gunning_fog, smog, ari,
        polarity, sim, subjectivity,
        syllables, num_words, adj_adv_rate, words_per_sent,
        num_articles, num_verbs, num_sentences,
        num_adj, num_adv
    ]

# Column names for the list returned by extract_linguistic_features,
# in the same order as that list.
lf_columns = [
    'special_chars', 'determiners', 'capital_letters', 'short_sent', 'long_sent',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'syllables', 'word_count', 'adj_adv_rate', 'words_per_sentence',
    'articles', 'verbs', 'sentences',
    'adjectives', 'adverbs'
]

# Extract features
tqdm.pandas()
linguistic_features = df.progress_apply(extract_linguistic_features, axis=1)
# Keep the row labels of `df` on the feature frame. Building it from a bare
# list would assign a fresh 0..n-1 index, which no longer matches `df` once
# rows have been filtered or de-duplicated.
lf_df = pd.DataFrame(
    linguistic_features.tolist(),
    columns=lf_columns,
    index=linguistic_features.index,
)
assert len(lf_df) == len(df), "expected one feature row per article"

# --- Pearson Correlation Filtering ---
# A feature is dropped when its absolute Pearson correlation with any feature
# listed before it exceeds this threshold (only the upper triangle is read).
CORRELATION_THRESHOLD = 0.7
cor_matrix = lf_df.corr().abs()
upper = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
to_drop = [
    column for column in upper.columns
    if (upper[column] > CORRELATION_THRESHOLD).any()
]
lf_df_filtered = lf_df.drop(columns=to_drop)

print(f"Removed correlated features: {to_drop}")
print(f"Remaining features: {lf_df_filtered.columns.tolist()}")



100%|██████████| 61450/61450 [28:46<00:00, 35.58it/s]


Removed correlated features: ['gunning_fog', 'smog', 'ari', 'syllables', 'word_count', 'verbs', 'sentences', 'adjectives', 'adverbs']
Remaining features: ['special_chars', 'determiners', 'capital_letters', 'short_sent', 'long_sent', 'polarity', 'title_similarity', 'subjectivity', 'adj_adv_rate', 'words_per_sentence', 'articles']


KeyError: "['gunning_fog', 'adjectives'] not in index"

In [10]:
# --- Manual LFS Grouping (Table V) ---
# Each linguistic feature set (LFS) is a hand-picked subset of the columns
# that survived the correlation filter.
lfs1_columns = ['special_chars', 'short_sent', 'long_sent',  'polarity', 'title_similarity', 'subjectivity', 'adj_adv_rate', 'articles']
lfs2_columns = ['determiners', 'short_sent', 'long_sent',  'polarity', 'title_similarity', 'subjectivity']
lfs3_columns = ['special_chars', 'capital_letters', 'polarity', 'title_similarity', 'subjectivity', 'words_per_sentence']

LFS1 = lf_df_filtered[lfs1_columns]
LFS2 = lf_df_filtered[lfs2_columns]
LFS3 = lf_df_filtered[lfs3_columns]

# --- Normalize ---
# One scaler object is re-fitted for each set in turn (LFS1, LFS2, LFS3), so
# after this cell it holds the statistics of LFS3.
scaler = StandardScaler()
df_lfs1, df_lfs2, df_lfs3 = (
    pd.DataFrame(scaler.fit_transform(feature_set), columns=feature_set.columns)
    for feature_set in (LFS1, LFS2, LFS3)
)

# --- Preview ---
print("\nSample LFS1:")
print(df_lfs1.head())


Sample LFS1:
   special_chars  short_sent  long_sent  polarity  title_similarity  \
0            0.0   -0.215846  -0.542159  0.570270          0.748534   
1            0.0   -0.215846  -0.542159 -0.721587          0.491445   
2            0.0    3.360243   0.762522 -0.039679          0.450214   
3            0.0   -0.215846  -0.542159 -0.798724         -0.577765   
4            0.0   -0.215846   0.762522 -0.381383          0.405477   

   subjectivity  adj_adv_rate  articles  
0      1.692724      1.390719       0.0  
1     -0.592174      0.994764       0.0  
2      1.163485     -0.457139       0.0  
3      0.128123     -0.489694       0.0  
4      0.754988     -1.141160       0.0  


In [13]:
# Chunk 4: Word Embedding + Generate P1 and P2 + Combine with LFS

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack
import pandas as pd

# --- 4.1: Prepare train/test split ---
X, y = df['processed'], df['Label'].astype(int)
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- 4.2: CountVectorizer (CV) branch: vectorise, fit, predict ---
cv_vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_cv = cv_vectorizer.fit_transform(X_train_text)
X_test_cv = cv_vectorizer.transform(X_test_text)
lr_cv = LogisticRegression(max_iter=600)
lr_cv.fit(X_train_cv, y_train)
y_pred_cv = lr_cv.predict(X_test_cv)

# --- 4.3: TF-IDF branch: vectorise, fit, predict ---
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)
lr_tfidf = LogisticRegression(max_iter=600)
lr_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = lr_tfidf.predict(X_test_tfidf)

# --- 4.5: Evaluate accuracy of both baselines on the held-out split ---
acc_cv = accuracy_score(y_test, y_pred_cv)
acc_tfidf = accuracy_score(y_test, y_pred_tfidf)

print(f"CountVectorizer Accuracy (P2): {acc_cv:.4f}")
print(f"TF-IDF Accuracy (P1): {acc_tfidf:.4f}")

# --- 4.6: Decide better embedding ---
# CV is chosen only when it is strictly more accurate; ties go to TF-IDF.
if acc_cv > acc_tfidf:
    best_embedding = 'cv'
else:
    best_embedding = 'tfidf'
print(f"Better Word Embedding: {best_embedding.upper()}")

# Save P1 and P2 (test-set predictions of the TF-IDF and CV baselines)
P1, P2 = y_pred_tfidf, y_pred_cv


# --- 4.7: Combine the better text representation with LFS1, LFS2, LFS3 ---

def _split_and_scale(features, train_rows, test_rows):
    """Slice an unscaled LFS frame by row position and standardise both parts.

    The scaler is fitted on the training rows only and then applied to the
    test rows, so no test-set statistics reach the model inputs.

    Args:
        features: Unscaled feature DataFrame, one row per article.
        train_rows: Integer positions of the training rows.
        test_rows: Integer positions of the test rows.

    Returns:
        (train_frame, test_frame): the standardised DataFrames.
    """
    train_part = features.iloc[train_rows]
    test_part = features.iloc[test_rows]
    split_scaler = StandardScaler().fit(train_part)
    train_frame = pd.DataFrame(
        split_scaler.transform(train_part),
        columns=features.columns, index=train_part.index,
    )
    test_frame = pd.DataFrame(
        split_scaler.transform(test_part),
        columns=features.columns, index=test_part.index,
    )
    return train_frame, test_frame


def _fit_and_score(train_text, test_text, lfs_train, lfs_test, train_labels, test_labels):
    """Append LFS columns to the text matrix, fit Logistic Regression, score it.

    Returns:
        (X_train_comb, X_test_comb, clf, y_pred, accuracy)
    """
    X_train_comb = hstack([train_text, lfs_train])
    X_test_comb = hstack([test_text, lfs_test])
    clf = LogisticRegression(max_iter=600)
    clf.fit(X_train_comb, train_labels)
    y_pred = clf.predict(X_test_comb)
    return X_train_comb, X_test_comb, clf, y_pred, accuracy_score(test_labels, y_pred)


# Take the LFS rows that belong to the text split made in 4.1 instead of
# re-running train_test_split with the same seed: look up where each
# train/test article sits in `df` and slice the feature frames by position.
assert len(LFS1) == len(LFS2) == len(LFS3) == len(df), \
    "LFS frames must hold exactly one row per article in df"
train_idx = df.index.get_indexer(X_train_text.index)
test_idx = df.index.get_indexer(X_test_text.index)
assert (train_idx >= 0).all() and (test_idx >= 0).all(), \
    "every split row must be present in df"

# Standardise each LFS using training-split statistics only.
df_lfs1_train, df_lfs1_test = _split_and_scale(LFS1, train_idx, test_idx)
df_lfs2_train, df_lfs2_test = _split_and_scale(LFS2, train_idx, test_idx)
df_lfs3_train, df_lfs3_test = _split_and_scale(LFS3, train_idx, test_idx)
# Convert to numpy
LFS1_train = df_lfs1_train.values
LFS1_test = df_lfs1_test.values
LFS2_train = df_lfs2_train.values
LFS2_test = df_lfs2_test.values
LFS3_train = df_lfs3_train.values
LFS3_test = df_lfs3_test.values

# Choose the best embedding
if best_embedding == 'cv':
    X_train_best = X_train_cv
    X_test_best = X_test_cv
else:
    X_train_best = X_train_tfidf
    X_test_best = X_test_tfidf

# Concatenate, train, predict and evaluate once per feature set
X_train_comb1, X_test_comb1, clf1, y_pred_comb1, acc_comb1 = _fit_and_score(
    X_train_best, X_test_best, LFS1_train, LFS1_test, y_train, y_test)
X_train_comb2, X_test_comb2, clf2, y_pred_comb2, acc_comb2 = _fit_and_score(
    X_train_best, X_test_best, LFS2_train, LFS2_test, y_train, y_test)
X_train_comb3, X_test_comb3, clf3, y_pred_comb3, acc_comb3 = _fit_and_score(
    X_train_best, X_test_best, LFS3_train, LFS3_test, y_train, y_test)

# --- 4.8: Store all accuracies ---
accuracy_results = {
    "TF-IDF only (P1)": acc_tfidf,
    "CountVectorizer only (P2)": acc_cv,
    f"{best_embedding.upper()} + LFS1": acc_comb1,
    f"{best_embedding.upper()} + LFS2": acc_comb2,
    f"{best_embedding.upper()} + LFS3": acc_comb3
}

print("\n--- Accuracy Summary ---")
for k, v in accuracy_results.items():
    print(f"{k}: {v:.4f}")

CountVectorizer Accuracy (P2): 0.9171
TF-IDF Accuracy (P1): 0.9262
Better Word Embedding: TFIDF

--- Accuracy Summary ---
TF-IDF only (P1): 0.9262
CountVectorizer only (P2): 0.9171
TFIDF + LFS1: 0.9290
TFIDF + LFS2: 0.9289
TFIDF + LFS3: 0.9279
