In [None]:
from google.colab import drive
import joblib
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Drive has to be mounted before anything under project_path can be read.
drive.mount('/content/drive')

project_path = "/content/drive/MyDrive/Fake News Project"
mbert_dir = project_path + "/mbert_bharat_model"

# Classical BharatFakeNewsKosh artifacts: TF-IDF vectorizer plus the two
# linear classifiers. These files are the ones written by the save cell at
# the end of this notebook.
# NOTE: joblib.load unpickles arbitrary objects -- only load files you trust.
tfidf_bfk = joblib.load(project_path + "/tfidf_bharat.pkl")
lr_bfk = joblib.load(project_path + "/lr_bharat.pkl")
svm_bfk = joblib.load(project_path + "/svm_bharat.pkl")

# Fine-tuned mBERT classifier and its tokenizer share one directory.
tokenizer = AutoTokenizer.from_pretrained(mbert_dir)
model_bfk = AutoModelForSequenceClassification.from_pretrained(mbert_dir)

print("Models loaded successfully!")

In [None]:
# ---------------------------------------------------------
# MOUNT GOOGLE DRIVE
# ---------------------------------------------------------
# Mounts Drive at /content/drive; every dataset path used later in this
# notebook points below that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# ---------------------------------------------------------
# IMPORT REQUIRED LIBRARIES
# ---------------------------------------------------------
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# ---------------------------------------------------------
# TEXT CLEANING FUNCTION (USED BY ALL DATASETS)
# ---------------------------------------------------------

def clean_text(text):
    """Normalize one raw statement for TF-IDF vectorization.

    Steps: lowercase, strip URLs, drop every character that is not an
    English letter, a Devanagari character or whitespace, then collapse
    runs of whitespace into single spaces.

    Args:
        text: The raw value from a dataset cell. Non-string values are
            converted with ``str``; missing values (``None`` or a float NaN)
            are treated as empty text.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    # Missing cells would otherwise be stringified into the bogus tokens
    # "none" / "nan" and end up in the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)              # remove URLs
    # Keep English + Hindi chars. Whitespace (\s) is preserved here so that
    # words separated by tabs/newlines are not glued together before the
    # whitespace-collapsing step below.
    text = re.sub(r"[^a-zA-Z\u0900-\u097F\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [None]:
liar_path = "/content/drive/MyDrive/Fake News Project/LIAR/train.tsv"

# The file is read with header=None, so the column names are supplied here,
# in file order.
column_names = [
    "id",
    "label",
    "statement",
    "subject",
    "speaker",
    "speaker_job",
    "state",
    "party",
    "barely_true_counts",
    "false_counts",
    "half_true_counts",
    "mostly_true_counts",
    "pants_on_fire_counts",
    "context",
]

df = pd.read_csv(liar_path, sep="\t", header=None, names=column_names)

# Quick look at the two columns the rest of the notebook relies on.
print(df[["label","statement"]].head())


In [None]:
# Add a normalized copy of every LIAR statement (via clean_text) for vectorization.
df["clean_text"] = df["statement"].apply(clean_text)

In [None]:
# TF-IDF features for LIAR with the vocabulary capped at 3000 terms.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split performed later in the notebook, so test-set vocabulary/IDF statistics
# leak into the features -- consider fitting on the training rows only.
tfidf_liar = TfidfVectorizer(max_features=3000)
X_liar = tfidf_liar.fit_transform(df["clean_text"])
y_liar = df["label"]

print("LIAR TF-IDF Shape:", X_liar.shape)

In [None]:
# ---------------------------------------------------------
# LOAD FAKEDDIT DATASET
# ---------------------------------------------------------
# Tab-separated file; the first row is used as the header (pandas default).
fakeddit_path = "/content/drive/MyDrive/Fake News Project/Fakeddit/multimodal_train.tsv"

fakeddit = pd.read_csv(fakeddit_path, sep="\t")

# Sanity check: row/column count and the first few records.
print("Fakeddit Loaded:", fakeddit.shape)
print(fakeddit.head())

In [None]:
# Cleaned text: run the dataset's "clean_title" column through the shared
# clean_text normalizer so all datasets are preprocessed the same way.
fakeddit["clean_text"] = fakeddit["clean_title"].apply(clean_text)

In [None]:
# Target for the Fakeddit task: the dataset's "6_way_label" column.
y_fakeddit = fakeddit["6_way_label"]
print("Unique Fakeddit 6-class labels:", y_fakeddit.unique())

In [None]:
# TF-IDF features for Fakeddit with the vocabulary capped at 3000 terms.
# NOTE(review): fitted on every row before the train/test split, so test rows
# influence the vocabulary/IDF weights.
tfidf_fakeddit = TfidfVectorizer(max_features=3000)
X_fakeddit = tfidf_fakeddit.fit_transform(fakeddit["clean_text"])

print("Fakeddit TF-IDF shape:", X_fakeddit.shape)

In [None]:
# ---------------------------------------------------------
# LOAD BHARATFAKENEWSKOSH
# ---------------------------------------------------------

# The dataset is stored as an Excel workbook on Drive.
bfk_path = "/content/drive/MyDrive/Fake News Project/BharatFakeNewsKosh/BharatFakeNewsKosh.xlsx"

bfk = pd.read_excel(bfk_path)

# Sanity check: row/column count; the trailing expression renders the first rows.
print("BharatFakeNewsKosh Loaded:", bfk.shape)
bfk.head()

In [None]:
# Inspect which languages and label values occur in the dataset.
print("\nLanguages:", bfk["Language"].unique())
print("\nLabels:", bfk["Label"].unique())

In [None]:
# Use English translation of the statement as the text for the classical
# (TF-IDF based) models, normalized with the shared clean_text helper.
bfk["clean_text"] = bfk["Eng_Trans_Statement"].apply(clean_text)

# Show raw vs. cleaned text side by side for a few rows.
print("\nCleaned Sample:")
print(bfk[["Eng_Trans_Statement", "clean_text"]].head())

In [None]:
# TF-IDF features for BharatFakeNewsKosh with the vocabulary capped at 3000 terms.
# This rebinds tfidf_bfk, replacing the vectorizer loaded from Drive in the
# first cell of the notebook.
# NOTE(review): the features come from the English translation column, even
# though the message below says "Multilingual"; the vectorizer is also fitted
# on all rows before the train/test split.
tfidf_bfk = TfidfVectorizer(max_features=3000)
X_bfk = tfidf_bfk.fit_transform(bfk["clean_text"])

y_bfk = bfk["Label"]

print("Multilingual TF-IDF Shape:", X_bfk.shape)

In [None]:
# ---------------------------------------------------------
# TRAIN/TEST SPLITS FOR ALL 3 DATASETS
# ---------------------------------------------------------
# Each dataset gets the same 80/20 hold-out split with a fixed random_state.

def _report_split(title, train_matrix, test_matrix):
    """Print the shapes of one train/test partition under a heading."""
    print(title)
    print("Train:", train_matrix.shape)
    print("Test:", test_matrix.shape)


# 1) LIAR Dataset (Truthfulness Classification)
X_train_liar, X_test_liar, y_train_liar, y_test_liar = train_test_split(
    X_liar,
    y_liar,
    test_size=0.2,
    random_state=42,
)
_report_split("LIAR Train/Test Split:", X_train_liar, X_test_liar)
print("\n")

# 2) Fakeddit Dataset (Fake News Type Classification)
X_train_fak, X_test_fak, y_train_fak, y_test_fak = train_test_split(
    X_fakeddit,
    y_fakeddit,
    test_size=0.2,
    random_state=42,
)
_report_split("Fakeddit Train/Test Split:", X_train_fak, X_test_fak)
print("\n")

# 3) BharatFakeNewsKosh (Multilingual Fake News Classification)
X_train_bfk, X_test_bfk, y_train_bfk, y_test_bfk = train_test_split(
    X_bfk,
    y_bfk,
    test_size=0.2,
    random_state=42,
)
_report_split("BharatFakeNewsKosh Train/Test Split:", X_train_bfk, X_test_bfk)

In [None]:
# List the distinct LIAR label values the classifiers below will predict.
print(df["label"].unique())


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic-regression baseline on the LIAR TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_liar, y_train_liar)

# Score the held-out rows.
y_pred_lr = lr_model.predict(X_test_liar)

# Overall accuracy followed by the per-class breakdown.
print("Accuracy:", accuracy_score(y_test_liar, y_pred_lr))
print("\nClassification Report:\n")
print(classification_report(y_test_liar, y_pred_lr))


In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Linear SVM baseline on the LIAR TF-IDF features, with a raised iteration cap.
# fit() returns the estimator itself, so construction and training are chained.
svm_model = LinearSVC(max_iter=5000).fit(X_train_liar, y_train_liar)

# Score the held-out rows.
y_pred_svm = svm_model.predict(X_test_liar)

# Overall accuracy followed by the per-class breakdown.
print("SVM Accuracy:", accuracy_score(y_test_liar, y_pred_svm))
print("\nClassification Report:\n")
print(classification_report(y_test_liar, y_pred_svm))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic-regression baseline for the Fakeddit 6-way task.
# The model is fitted exactly once: fitting on the Fakeddit matrix is the
# expensive step of this cell, so it must not be repeated.
lr_fakeddit = LogisticRegression(max_iter=1000)
lr_fakeddit.fit(X_train_fak, y_train_fak)

# Score the held-out rows.
y_pred_fakeddit = lr_fakeddit.predict(X_test_fak)

# Overall accuracy followed by the per-class breakdown.
print("Fakeddit Accuracy:", accuracy_score(y_test_fak, y_pred_fakeddit))
print(classification_report(y_test_fak, y_pred_fakeddit))



In [None]:
# Display names for the numeric Fakeddit "6_way_label" classes; the list
# position is the class id.
# NOTE(review): the id -> name assignment is taken as given here; confirm it
# against the Fakeddit dataset documentation.
fakeddit_label_map = dict(
    enumerate(
        ["True", "Satire", "Misleading", "False", "Clickbait", "Propaganda"]
    )
)

In [None]:
def predict_fake_type(text):
    """Classify one statement with the Fakeddit logistic-regression model.

    The text is normalized with clean_text, vectorized with the fitted
    Fakeddit TF-IDF vectorizer and passed to lr_fakeddit.

    Returns:
        The display name from fakeddit_label_map for the predicted class, or
        the raw predicted value when it has no entry in that map.
    """
    features = tfidf_fakeddit.transform([clean_text(text)])
    class_id = lr_fakeddit.predict(features)[0]
    if class_id in fakeddit_label_map:
        return fakeddit_label_map[class_id]
    return class_id

In [None]:
def predict_truthfulness(text):
    """Classify one statement with the LIAR logistic-regression model.

    The text is normalized with clean_text, vectorized with the fitted LIAR
    TF-IDF vectorizer and passed to lr_model.

    Returns:
        The predicted LIAR label for the single input statement.
    """
    features = tfidf_liar.transform([clean_text(text)])
    return lr_model.predict(features)[0]

In [None]:
# Interactive demo: read one statement and run it through both classical models.
# Note that this blocks on keyboard input, so the notebook cannot run unattended.
user_input = input("Enter a news statement: ")
print("Truthfulness:", predict_truthfulness(user_input))
print("Type:", predict_fake_type(user_input))


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic-regression baseline for BharatFakeNewsKosh.
# The model is fitted exactly once; lr_bfk is the object persisted to Drive
# by the save cell at the end of the notebook.
lr_bfk = LogisticRegression(max_iter=1000)
lr_bfk.fit(X_train_bfk, y_train_bfk)

# Score the held-out rows.
y_pred_lr_bfk = lr_bfk.predict(X_test_bfk)

# Overall accuracy followed by the per-class breakdown.
print("Logistic Regression Accuracy:", accuracy_score(y_test_bfk, y_pred_lr_bfk))
print(classification_report(y_test_bfk, y_pred_lr_bfk))

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM baseline for BharatFakeNewsKosh with default hyper-parameters.
# fit() returns the estimator itself, so construction and training are chained.
# accuracy_score / classification_report come from imports in earlier cells.
svm_bfk = LinearSVC().fit(X_train_bfk, y_train_bfk)

# Score the held-out rows.
y_pred_svm_bfk = svm_bfk.predict(X_test_bfk)

# Overall accuracy followed by the per-class breakdown.
print("SVM Accuracy:", accuracy_score(y_test_bfk, y_pred_svm_bfk))
print(classification_report(y_test_bfk, y_pred_svm_bfk))


In [None]:
# Work on a reproducible subsample for mBERT fine-tuning.
# Rows without text or without a label cannot be tokenized / encoded, so they
# are dropped before sampling, and the sample size is capped at the number of
# usable rows (DataFrame.sample raises when n exceeds the population).
bfk_labeled = bfk.dropna(subset=["Text", "Label"])
small_bfk = bfk_labeled.sample(
    n=min(6000, len(bfk_labeled)),
    random_state=42,
).reset_index(drop=True)

# Build the label <-> integer id lookups from the labels present in the sample.
labels = small_bfk["Label"].astype("category")
label2id = {l: i for i, l in enumerate(labels.cat.categories)}
id2label = {i: l for l, i in label2id.items()}

# cat.codes are exactly the enumerate() positions used for label2id above, but
# stored as a plain integer column instead of a pandas categorical.
small_bfk["label_id"] = labels.cat.codes.astype("int64")

In [None]:
from datasets import Dataset

# Convert the subsample to a Hugging Face Dataset using the column names the
# rest of the notebook expects ("text" and "label").
dataset_bfk = Dataset.from_pandas(
    small_bfk[["Text", "label_id"]].rename(columns={"Text": "text", "label_id": "label"})
)

# 80/20 hold-out split. The seed is fixed so the split -- and therefore the
# reported metrics -- are reproducible, matching the random_state=42 used for
# the other splits in this notebook.
dataset_bfk = dataset_bfk.train_test_split(test_size=0.2, seed=42)


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize(batch):
    """Tokenize a batch of examples for mBERT.

    Args:
        batch: A mapping with a "text" entry holding the raw strings.

    Returns:
        The tokenizer output, with every sequence truncated or padded to
        exactly 128 tokens.
    """
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

# The dataset itself is mapped through a tokenize function in the training
# cell further down; nothing is applied here.


In [None]:
from transformers import AutoModelForSequenceClassification

# Pretrained multilingual BERT with a classification head sized to the label
# set; the id/label lookups are passed through to the model configuration.
model_bfk = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)


In [None]:
# Re-tokenize properly
def tokenize(batch):
    """Tokenize a batch's "text" entries, padded/truncated to 128 tokens."""
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# Add the token columns to both splits, then expose only what the model
# consumes, as torch tensors.
dataset_bfk = dataset_bfk.map(tokenize, batched=True)
dataset_bfk.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

# Training
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./mbbert_bharat",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100,
)

# Plain Trainer with the default loss; no compute_metrics is attached here.
trainer_bfk = Trainer(
    model=model_bfk,
    args=training_args,
    train_dataset=dataset_bfk["train"],
    eval_dataset=dataset_bfk["test"],
)

trainer_bfk.train()


In [None]:
# Evaluate the current trainer on its eval_dataset and show the returned metrics dict.
metrics = trainer_bfk.evaluate()
print(metrics)


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Turn a (logits, labels) evaluation pair into an accuracy metric.

    The predicted class of each example is the argmax over the last axis of
    the logits.

    Returns:
        A dict with the single key "accuracy".
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predicted_ids)}


In [None]:
import torch
from torch import nn
from transformers import Trainer

# Compute weights once using label_id
# Inverse-frequency weighting: rarer classes get larger weights. sort_index()
# orders the counts by label id so that position i holds the weight for id i.
label_counts = small_bfk["label_id"].value_counts().sort_index()
class_weights = torch.tensor(
    1.0 / label_counts.values,
        dtype=torch.float
        )
# Rescale so the weights sum to 1.
class_weights = class_weights / class_weights.sum()

class WeightedTrainer(Trainer):
    """Trainer variant whose loss is cross-entropy weighted by class_weights."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained; called with the full inputs.
            inputs: Batch mapping; its "labels" entry supplies the targets.
            return_outputs: When true, also return the model outputs.
            **kwargs: Accepted and ignored.

        Returns:
            ``loss`` or, if return_outputs is true, ``(loss, outputs)``.
        """
        outputs = model(**inputs)

        # The weight tensor is moved to the model's device on every call.
        weighted_ce = nn.CrossEntropyLoss(weight=class_weights.to(model.device))
        loss = weighted_ce(outputs.get("logits"), inputs.get("labels"))

        if return_outputs:
            return loss, outputs
        return loss




In [None]:
# Rebuild the trainer with the class-weighted loss and an accuracy metric,
# reusing the training arguments and dataset splits from the earlier trainer.
# NOTE(review): model_bfk has already been fine-tuned by the unweighted
# Trainer cell above, so this run continues from those weights rather than
# from the pretrained checkpoint -- confirm that is intended.
trainer_bfk = WeightedTrainer(
    model=model_bfk,
    args=training_args,
    train_dataset=dataset_bfk["train"],
    eval_dataset=dataset_bfk["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the class-weighted trainer.
trainer_bfk.train()

In [None]:
# Evaluate on the held-out split; the returned dict is rendered as the cell output.
trainer_bfk.evaluate()

In [None]:
from google.colab import drive
import os
import joblib

# Drive has to be mounted before anything can be written below project_path.
drive.mount('/content/drive')

project_path = "/content/drive/MyDrive/Fake News Project"
os.makedirs(project_path, exist_ok=True)

# Save classical ML models: both classifiers plus the TF-IDF vectorizer they
# were trained with.
for fitted_object, file_suffix in (
    (lr_bfk, "/lr_bharat.pkl"),
    (svm_bfk, "/svm_bharat.pkl"),
    (tfidf_bfk, "/tfidf_bharat.pkl"),
):
    joblib.dump(fitted_object, project_path + file_suffix)

# Save mBERT model and its tokenizer side by side in one directory so both
# can be restored from the same path.
mbert_dir = project_path + "/mbert_bharat_model"
trainer_bfk.save_model(mbert_dir)
tokenizer.save_pretrained(mbert_dir)

print("All models saved successfully!")

In [None]:
import os
# List the project folder on Drive to check which files are present.
os.listdir("/content/drive/MyDrive/Fake News Project")