In [1]:
import pandas as pd

# Both CSV files share the same dialect: ';' separated, '"' quoted.
# The saved row-number column ('Unnamed: 0') carries no information, so it is
# dropped right after loading (without in-place mutation, so the cell is
# safe to re-run).
df = pd.read_csv("../data/train.csv", sep=";", quotechar='"', engine='python')
df = df.drop(columns='Unnamed: 0')
df_test = pd.read_csv("../data/test.csv", sep=";", quotechar='"', engine='python')
df_test = df_test.drop(columns='Unnamed: 0')

# Stack the two files into one frame; the index is rebuilt from 0.
df = pd.concat([df, df_test], ignore_index=True)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32470 entries, 0 to 32469
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   32470 non-null  object
 1   text    32470 non-null  object
 2   label   32470 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 761.1+ KB
None


Unnamed: 0,title,text,label
0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1
1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1
2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0
3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1
4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0


In [3]:
# Count null entries per column of the freshly loaded frame.
print(df.isna().sum())

title    0
text     0
label    0
dtype: int64


In [5]:
import re
import string

# Clean text function
def clean_text(text):
    """Normalise a raw string for the downstream word-level features.

    Non-string input yields an empty string. Otherwise the text is
    lower-cased and a fixed sequence of regex substitutions is applied, in
    order: drop ``[...]`` spans, drop URLs, drop ``<...>`` tags, drop ASCII
    punctuation (``string.punctuation`` only, so characters such as the
    typographic apostrophe survive), turn newlines into spaces and collapse
    whitespace runs. Leading/trailing whitespace is stripped at the end.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs; the order matters because later steps
    # operate on the output of earlier ones.
    substitutions = (
        (r'\[.*?\]', ''),                                # bracketed spans
        (r'https?://\S+|www\.\S+', ''),                  # URLs
        (r'<.*?>', ''),                                  # HTML-like tags
        (f"[{re.escape(string.punctuation)}]", ''),      # ASCII punctuation
        (r'\n', ' '),                                    # newlines
        (r'\s+', ' '),                                   # whitespace runs
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


# Cleaned copies of both text columns, plus their space-joined concatenation.
df['clean_title'] = df['title'].map(clean_text)
df['clean_text'] = df['text'].map(clean_text)
df['combined'] = df['clean_title'].str.cat(df['clean_text'], sep=' ')

print("Dataset shape:", df.shape)
df.head()

Dataset shape: (32470, 6)


Unnamed: 0,title,text,label,clean_title,clean_text,combined
0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1,palestinians switch off christmas lights in be...,ramallah west bank reuters palestinians switch...,palestinians switch off christmas lights in be...
1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1,china says trump call with taiwan president wo...,beijing reuters us presidentelect donald trump...,china says trump call with taiwan president wo...
2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0,fail the trump organization’s credit score wil...,while the controversy over trump s personal ta...,fail the trump organization’s credit score wil...
3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1,zimbabwe military chiefs china trip was normal...,beijing reuters a trip to beijing last week by...,zimbabwe military chiefs china trip was normal...
4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0,the most uncourageous president ever receives ...,there has never been a more uncourageous perso...,the most uncourageous president ever receives ...


In [6]:
def top_word_freq_ratio(text):
    """Return the share of tokens taken up by the most frequent token.

    Tokens are whitespace-separated. Returns 0 when the text has no tokens,
    otherwise ``max_count / token_count`` as a float.
    """
    tokens = text.split()
    if not tokens:
        return 0

    counts = {}
    peak = 0  # highest count seen so far
    for token in tokens:
        seen = counts.get(token, 0) + 1
        counts[token] = seen
        if seen > peak:
            peak = seen
    return peak / len(tokens)

# Dominant-word share for the cleaned title and the cleaned body.
df['title_max_word_ratio'] = df['clean_title'].map(top_word_freq_ratio)
df['text_max_word_ratio'] = df['clean_text'].map(top_word_freq_ratio)


In [7]:
import heapq

def top_k_word_lengths(text, k=5):
    """Sum of the ``k`` largest token lengths, divided by ``k``.

    Tokens are whitespace-separated. The divisor is always ``k``, even when
    the text has fewer than ``k`` tokens. Returns 0 when no lengths are
    selected (e.g. empty text).
    """
    longest = heapq.nlargest(k, map(len, text.split()))
    if not longest:
        return 0
    return sum(longest) / k

# Top-5 word-length feature for the cleaned title; k is forwarded by apply().
df['title_topk_word_len'] = df['clean_title'].apply(top_k_word_lengths, k=5)
df.head()

Unnamed: 0,title,text,label,clean_title,clean_text,combined,title_max_word_ratio,text_max_word_ratio,title_topk_word_len
0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1,palestinians switch off christmas lights in be...,ramallah west bank reuters palestinians switch...,palestinians switch off christmas lights in be...,0.2,0.071429,9.2
1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1,china says trump call with taiwan president wo...,beijing reuters us presidentelect donald trump...,china says trump call with taiwan president wo...,0.090909,0.068966,6.8
2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0,fail the trump organization’s credit score wil...,while the controversy over trump s personal ta...,fail the trump organization’s credit score wil...,0.1,0.053672,7.0
3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1,zimbabwe military chiefs china trip was normal...,beijing reuters a trip to beijing last week by...,zimbabwe military chiefs china trip was normal...,0.1,0.044811,7.0
4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0,the most uncourageous president ever receives ...,there has never been a more uncourageous perso...,the most uncourageous president ever receives ...,0.142857,0.06051,10.4


In [8]:
def text_topk_word_lengths(text, k=5):
    """Sum of the ``k`` largest token lengths in ``text``, divided by ``k``.

    Performs the same computation as ``top_k_word_lengths``: the divisor is
    ``k`` regardless of how many tokens exist, and 0 is returned when no
    lengths are selected.
    """
    sizes = (len(token) for token in text.split())
    total = 0
    picked_any = False
    for size in heapq.nlargest(k, sizes):
        total += size
        picked_any = True
    return total / k if picked_any else 0

def text_max_word_ratio(text):
    """Share of tokens taken up by the most frequent token (0 if no tokens).

    Performs the same computation as ``top_word_freq_ratio``.
    """
    tokens = text.split()
    if not tokens:
        return 0

    tally = {}
    for token in tokens:
        try:
            tally[token] += 1
        except KeyError:
            tally[token] = 1
    return max(tally.values()) / len(tokens)

# Body-text features. 'text_max_word_ratio' is recomputed here and
# overwrites the column of the same name created in an earlier cell.
df['text_topk_word_len'] = df['clean_text'].map(text_topk_word_lengths)
df['text_max_word_ratio'] = df['clean_text'].map(text_max_word_ratio)


In [9]:
# Re-check null counts now that the feature columns exist.
print(df.isna().sum())

title                   0
text                    0
label                   0
clean_title             0
clean_text              0
combined                0
title_max_word_ratio    0
text_max_word_ratio     0
title_topk_word_len     0
text_topk_word_len      0
dtype: int64


In [15]:
# Rows whose cleaned title AND cleaned body are both blank.
title_blank = df['clean_title'].str.strip() == ''
text_blank = df['clean_text'].str.strip() == ''
empty_rows = df.loc[title_blank & text_blank]

print(f"Number of fully empty rows after cleaning: {len(empty_rows)}")


Number of fully empty rows after cleaning: 3


In [17]:
# Remove the fully empty rows and renumber the index from 0.
df = df.drop(index=empty_rows.index).reset_index(drop=True)

In [19]:
def jaccard_similarity(s1, s2):
    """Jaccard similarity between the whitespace-token sets of two strings.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        ``|A & B| / |A | B|`` as a float, where A and B are the sets of
        tokens in ``s1`` and ``s2``. When neither string contains a token
        the union is empty; 0.0 is returned instead of raising
        ``ZeroDivisionError``.
    """
    a = set(s1.split())
    b = set(s2.split())
    union = a | b
    if not union:
        # Both inputs are empty/whitespace-only: similarity is undefined,
        # report "no overlap" rather than dividing by zero.
        return 0.0
    return len(a & b) / len(union)

# Token overlap between each row's cleaned title and cleaned body.
df['jaccard_title_text'] = [
    jaccard_similarity(title, body)
    for title, body in zip(df['clean_title'], df['clean_text'])
]
df.head()

Unnamed: 0,title,text,label,clean_title,clean_text,combined,title_max_word_ratio,text_max_word_ratio,title_topk_word_len,text_topk_word_len,jaccard_title_text
0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1,palestinians switch off christmas lights in be...,ramallah west bank reuters palestinians switch...,palestinians switch off christmas lights in be...,0.2,0.071429,9.2,14.8,0.055118
1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1,china says trump call with taiwan president wo...,beijing reuters us presidentelect donald trump...,china says trump call with taiwan president wo...,0.090909,0.068966,6.8,11.6,0.132075
2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0,fail the trump organization’s credit score wil...,while the controversy over trump s personal ta...,fail the trump organization’s credit score wil...,0.1,0.053672,7.0,14.6,0.021053
3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1,zimbabwe military chiefs china trip was normal...,beijing reuters a trip to beijing last week by...,zimbabwe military chiefs china trip was normal...,0.1,0.044811,7.0,12.8,0.033755
4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0,the most uncourageous president ever receives ...,there has never been a more uncourageous perso...,the most uncourageous president ever receives ...,0.142857,0.06051,10.4,14.4,0.034483


In [21]:
# Insertion Sort for small lists
def insertion_sort(arr):
    """Sort ``arr`` in place by element length, longest first, and return it.

    Elements of equal length keep their original relative order, because an
    element is only moved past strictly shorter ones.
    """
    for i in range(1, len(arr)):
        item = arr[i]
        item_len = len(item)
        slot = i
        # Shift strictly shorter neighbours one place right to open a slot.
        while slot > 0 and len(arr[slot - 1]) < item_len:
            arr[slot] = arr[slot - 1]
            slot -= 1
        arr[slot] = item
    return arr

# Feature: longest punctuation run using insertion sort
def longest_punct_run(text):
    """Length of the longest run of 2+ consecutive ``!``, ``?`` or ``.``.

    Returns 0 for non-string input or when no such run exists. The runs are
    ordered longest-first with ``insertion_sort`` and the first one's length
    is reported.
    """
    if isinstance(text, str):
        matches = re.findall(r"[!?\.]{2,}", text)  # single marks do not count
        if matches:
            return len(insertion_sort(matches)[0])
    return 0

# Computed on the raw title: the cleaned title has its punctuation removed.
df['title_punct_emphasis'] = df['title'].map(longest_punct_run)

In [23]:
def merge_sort(arr):
    """Return the elements of ``arr`` ordered by length, longest first.

    Lists with fewer than two elements are returned as-is (same object);
    longer lists are split in half, each half is sorted recursively and the
    two results are combined with ``merge``.
    """
    size = len(arr)
    if size < 2:
        return arr
    half = size // 2
    front = merge_sort(arr[:half])
    back = merge_sort(arr[half:])
    return merge(front, back)

def merge(left, right):
    """Merge two length-descending lists into one length-descending list.

    Args:
        left: List of sized items, already ordered longest first.
        right: List of sized items, already ordered longest first.

    Returns:
        A new list containing every item of both inputs, longest first.
        When the heads have equal length the item from ``right`` is taken
        first (this matches the original tie-breaking).

    The inputs are read by index and left unmodified. The previous version
    consumed them with ``pop(0)``, which emptied the caller's lists and made
    every step O(n), i.e. a quadratic merge.
    """
    result = []
    i = j = 0
    while i < len(left) and j < len(right):
        if len(left[i]) > len(right[j]):  # sort by word length descending
            result.append(left[i])
            i += 1
        else:
            result.append(right[j])
            j += 1
    # At most one of these slices is non-empty.
    result.extend(left[i:])
    result.extend(right[j:])
    return result

def longest_cap_word(text):
    """Length of the longest whitespace token for which ``str.isupper`` holds.

    Returns 0 for non-string input or when no token qualifies. Qualifying
    tokens are ordered longest-first with ``merge_sort`` and the length of
    the first one is returned.
    """
    if not isinstance(text, str):
        return 0
    shouted = list(filter(str.isupper, text.split()))
    if shouted:
        return len(merge_sort(shouted)[0])
    return 0


# Computed on the raw title: the cleaned title is lower-cased.
df['title_longest_cap_word'] = df['title'].map(longest_cap_word)


In [25]:
# Word sequences treated as clickbait markers; each entry is inserted into
# the word-level trie built further down.
clickbait_phrases = [
    "breaking",
    "shocking",
    "you won’t believe",
    "exclusive",
    "what happened next",
    "unbelievable",
    "reasons why",
    "top",
    "this will blow your mind",
    "secret",
    "surprising",
]


In [27]:
class TrieNode:
    """One node of a word-level trie."""

    def __init__(self):
        # Next word of a phrase -> child node.
        self.children = {}
        # True when the path from the root to this node spells a whole phrase.
        self.is_end = False


class ClickbaitTrie:
    """Word-level trie used to detect known phrases inside a text.

    Phrases and texts go through the same normalisation (lower-casing,
    apostrophe removal, whitespace tokenisation) so that an inserted phrase
    and an occurrence of it always produce the same tokens.
    """

    # Apostrophe variants stripped during normalisation. Without this a
    # phrase stored as "won’t" can never match text in which "won't" was
    # already reduced to "wont" by ASCII punctuation removal.
    _APOSTROPHES = str.maketrans('', '', "'’‘")

    def __init__(self):
        self.root = TrieNode()

    @classmethod
    def _tokenize(cls, text):
        """Lower-case ``text``, drop apostrophes and split on whitespace."""
        return text.lower().translate(cls._APOSTROPHES).split()

    def insert(self, phrase):
        """Add ``phrase`` (one or more words) to the trie.

        The phrase is normalised exactly like looked-up text; previously it
        was stored verbatim while lookups were lower-cased, so a phrase
        containing capitals could never match. A phrase with no tokens is
        ignored.
        """
        words = self._tokenize(phrase)
        if not words:
            return
        node = self.root
        for word in words:
            if word not in node.children:
                node.children[word] = TrieNode()
            node = node.children[word]
        node.is_end = True

    def starts_with_clickbait(self, text):
        """Return True if any inserted phrase occurs in ``text``.

        Despite the name, the phrase may start at ANY word position, not
        only at the beginning: every position is tried as a starting point
        and the trie is walked word by word from there.
        """
        words = self._tokenize(text)
        for start in range(len(words)):
            curr = self.root
            for word in words[start:]:
                if word not in curr.children:
                    break
                curr = curr.children[word]
                if curr.is_end:
                    return True
        return False


In [29]:
# Build the phrase trie once from the configured phrase list.
cb_trie = ClickbaitTrie()
for phrase in clickbait_phrases:
    cb_trie.insert(phrase)


def _clickbait_flag(title):
    """1 if the title contains a known clickbait phrase, else 0."""
    return int(cb_trie.starts_with_clickbait(title))


# Binary feature computed on the cleaned title.
df['clickbait_score'] = df['clean_title'].apply(_clickbait_flag)

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Hand-crafted features fed to the random forest, in column order.
feature_columns = [
    'clickbait_score',
    'title_topk_word_len', 'text_topk_word_len',
    'title_max_word_ratio', 'text_max_word_ratio',
    'jaccard_title_text',
    'title_punct_emphasis', 'title_longest_cap_word'
]
X_feats = df[feature_columns]
y = df['label']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_feats, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting chain.
clf = RandomForestClassifier(
    n_estimators=200, max_depth=10, random_state=42
).fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.84      0.90      2972
           1       0.88      0.97      0.92      3522

    accuracy                           0.91      6494
   macro avg       0.92      0.91      0.91      6494
weighted avg       0.91      0.91      0.91      6494



In [32]:
import joblib

# Persist the fitted forest; Flask will load this file.
rf_model_path = "../model/dsa_rf_model.pkl"
joblib.dump(clf, rf_model_path)
print("Model saved to dsa_rf_model.pkl")

Model saved to dsa_rf_model.pkl


In [35]:
import time
import joblib
from sklearn.metrics import accuracy_score, f1_score
import sys
import os
import psutil

# Benchmark of the feature-based random forest: refit time, per-sample
# prediction time, test metrics and serialized size.
# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, which matters for the short prediction
# interval measured below.

# Timing: Training
start_train = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start_train

# Timing: Inference (whole test set, averaged per sample)
start_infer = time.perf_counter()
y_pred = clf.predict(X_test)
infer_time = (time.perf_counter() - start_infer) / len(X_test)

# Accuracy & F1
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Size in MB (10**6 bytes) of the serialized model
joblib.dump(clf, "../model/model_temp1.pkl")
model_size = os.path.getsize("../model/model_temp1.pkl") / 1e6

# Output
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Training Time: {train_time:.2f} sec")
print(f"Inference Time per sample: {infer_time:.6f} sec")
print(f"Model Size: {model_size:.2f} MB")


Accuracy: 0.9102
F1 Score: 0.9211
Training Time: 2.05 sec
Inference Time per sample: 0.000009 sec
Model Size: 10.32 MB


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# TF-IDF baseline on the cleaned title+body text.
y = df['label']

# Split the raw documents FIRST, with a fixed seed like the other splits in
# this notebook, so the run is reproducible.
texts_train, texts_test, y_train, y_test = train_test_split(
    df['combined'], y, stratify=y, test_size=0.2, random_state=42
)

# Fit the vocabulary and IDF weights on the training documents only.
# Fitting on the full corpus before splitting (as done previously) lets
# test-set statistics leak into the features. The full-corpus matrix `X` is
# therefore no longer built.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

clf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
import time
import joblib
from sklearn.metrics import accuracy_score, f1_score
import sys
import os
import psutil

# Benchmark of the TF-IDF random forest: refit time, per-sample prediction
# time, test metrics and serialized size.
# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution for short intervals.

# Timing: Training
start_train = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start_train

# Timing: Inference (X_test is a sparse matrix, hence shape[0] for the row count)
start_infer = time.perf_counter()
y_pred = clf.predict(X_test)
infer_time = (time.perf_counter() - start_infer) / X_test.shape[0]

# Accuracy & F1
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Size in MB (10**6 bytes) of the serialized model
joblib.dump(clf, "../model/model_temp.pkl")
model_size = os.path.getsize("../model/model_temp.pkl") / 1e6

# Output
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Training Time: {train_time:.2f} sec")
print(f"Inference Time per sample: {infer_time:.6f} sec")
print(f"Model Size: {model_size:.2f} MB")


In [21]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch

# Prepare data: keep the cleaned title+body text (exposed as 'text') and the
# label, then make a stratified 80/20 split with a fixed seed.
bert_columns = ['combined', 'label']
df_bert = df[bert_columns].rename(columns={'combined': 'text', 'label': 'label'})
train_df, test_df = train_test_split(
    df_bert,
    test_size=0.2,
    stratify=df_bert['label'],
    random_state=42,
)

# Wrap each split as a `datasets.Dataset`, with the index rebuilt from 0.
train_dataset, test_dataset = (
    Dataset.from_pandas(part.reset_index(drop=True))
    for part in (train_df, test_df)
)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    """Run the module-level ``tokenizer`` over the batch's 'text' entries.

    Padding mode "max_length" and truncation are requested; the tokenizer's
    output is returned unchanged.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize both splits in batches, drop the raw "text" column and expose the
# remaining columns in "torch" format.
train_dataset = train_dataset.map(tokenize, batched=True).remove_columns("text")
test_dataset = test_dataset.map(tokenize, batched=True).remove_columns("text")
train_dataset.set_format("torch")
test_dataset.set_format("torch")

# Pretrained BERT with a 2-class classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


# NOTE(review): checkpoint saving is disabled here and output_dir is
# "./bert_checkpoints", while the evaluation cell further down loads
# "bert_output/checkpoint-500" — that checkpoint is not written by this
# configuration; confirm which run produced it.
training_args = TrainingArguments(
    output_dir="./bert_checkpoints",          # <--- safe writable path
    logging_dir="./bert_logs",                # <--- safe writable path
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_steps=10,
    do_eval=True,
    overwrite_output_dir=True,                # avoids any read-only write clashes
    save_strategy="no"                        # avoids saving checkpoints
)


# No compute_metrics callback is passed; the recorded evaluate() output below
# contains only loss and runtime/throughput figures, no accuracy or F1.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune, then evaluate on the held-out split.
trainer.train()
trainer.evaluate()


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 25973/25973 [01:50<00:00, 234.83 examples/s]
Map: 100%|██████████| 6494/6494 [00:29<00:00, 223.81 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.543
20,0.1993
30,0.1368
40,0.3045
50,0.2955
60,0.1441
70,0.2179
80,0.0213
90,0.0105
100,0.0821


{'eval_loss': 0.044218759983778,
 'eval_runtime': 324.8474,
 'eval_samples_per_second': 19.991,
 'eval_steps_per_second': 2.5,
 'epoch': 2.0}

In [24]:
import os
import time
import torch
import joblib
import pandas as pd
from datasets import Dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Reload the training CSV (this rebinds `df`) and build the model input by
# joining the raw title and raw body with a single space.
# NOTE(review): the BERT training cell fed the *cleaned* 'combined' column
# and split a frame built from train.csv + test.csv; here the text is raw and
# only "train.csv" (a different relative path than "../data/train.csv") is
# split, so this test split is not the training cell's held-out split.
# Confirm what data the loaded checkpoint was trained on before trusting the
# reported metrics.
df = pd.read_csv("train.csv", sep=";", quotechar='"', engine="python")
title_as_str = df['title'].astype(str)
body_as_str = df['text'].astype(str)
df['combined'] = title_as_str + " " + body_as_str

df_bert = df[['combined', 'label']].rename(columns={'combined': 'text', 'label': 'label'})
# Only the 20% evaluation part of the stratified split is kept.
_, test_df = train_test_split(
    df_bert,
    test_size=0.2,
    stratify=df_bert['label'],
    random_state=42,
)


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    """Tokenize ``batch['text']`` with the module-level ``tokenizer``.

    Requests "max_length" padding and truncation and returns the tokenizer's
    output as-is.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True}
    return tokenizer(batch['text'], **tokenizer_options)

# Tokenized evaluation set in "torch" format, without the raw text column.
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))
test_dataset = test_dataset.map(tokenize, batched=True).remove_columns("text")
test_dataset.set_format("torch")


# Single definition of the checkpoint directory: it is used for loading the
# weights, for measuring their size and for locating trainer_state.json.
model_path = "bert_output/checkpoint-500"
model = BertForSequenceClassification.from_pretrained(model_path)


# Prediction-only Trainer; no_cuda=True is passed to the training arguments.
training_args = TrainingArguments(
    output_dir="./tmp",
    per_device_eval_batch_size=8,
    no_cuda=True
)

trainer = Trainer(
    model=model,
    args=training_args
)


# perf_counter() is monotonic and high-resolution, unlike time.time().
start_time = time.perf_counter()
predictions = trainer.predict(test_dataset)
infer_total_time = time.perf_counter() - start_time
infer_time_per_sample = infer_total_time / len(test_dataset)


# Predicted class = arg-max over the last axis of the model outputs.
y_pred = predictions.predictions.argmax(-1)
y_true = predictions.label_ids
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)


# Model size: total of the weight files (.bin / .safetensors) in the
# checkpoint directory, in MB (10**6 bytes).
model_files = [os.path.join(model_path, f) for f in os.listdir(model_path) if f.endswith((".bin", ".safetensors"))]
model_size_mb = sum(os.path.getsize(f) for f in model_files) / 1e6

param_count = sum(p.numel() for p in model.parameters())


# Training time, if trainer_state.json exists next to the weights and its
# log history contains a "train_runtime" entry (the last such entry wins).
train_time_sec = None
trainer_state_path = os.path.join(model_path, "trainer_state.json")
if os.path.exists(trainer_state_path):
    import json
    with open(trainer_state_path, "r") as f:
        state = json.load(f)
    if "log_history" in state:
        for log in state["log_history"]:
            if "train_runtime" in log:
                train_time_sec = log["train_runtime"]


print(f"Accuracy:        {acc:.4f}")
print(f"F1 Score:        {f1:.4f}")
print(f"Inference Time:  {infer_time_per_sample:.4f} sec/sample")
# Explicit None check so that a recorded runtime of 0 is still printed.
if train_time_sec is not None:
    print(f"Training Time:   {train_time_sec:.2f} sec")
else:
    print("Training Time:   Not available")
print(f"Model Size:      {model_size_mb:.2f} MB")
print(f"Parameter Count: {param_count / 1e6:.1f}M")


Map: 100%|██████████| 4871/4871 [00:20<00:00, 241.19 examples/s]


Accuracy:        0.9795
F1 Score:        0.9809
Inference Time:  0.1232 sec/sample
Training Time:   Not available
Model Size:      437.96 MB
Parameter Count: 109.5M
