In [45]:
from google.colab import drive
drive.mount('/content/drive')

import joblib
from transformers import AutoModelForSequenceClassification, AutoTokenizer

project_path = "/content/drive/MyDrive/Fake News Project"

# Load classical models
lr_bfk = joblib.load(project_path + "/lr_bharat.pkl")
svm_bfk = joblib.load(project_path + "/svm_bharat.pkl")
tfidf_bfk = joblib.load(project_path + "/tfidf_bharat.pkl")

# Load mBERT
model_bfk = AutoModelForSequenceClassification.from_pretrained(project_path + "/mbert_bharat_model")
tokenizer = AutoTokenizer.from_pretrained(project_path + "/mbert_bharat_model")

print("Models loaded successfully!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Models loaded successfully!


In [43]:
# ---------------------------------------------------------
# MOUNT GOOGLE DRIVE
# ---------------------------------------------------------
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
# ---------------------------------------------------------
# IMPORT REQUIRED LIBRARIES
# ---------------------------------------------------------
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# ---------------------------------------------------------
# TEXT CLEANING FUNCTION (USED BY ALL DATASETS)
# ---------------------------------------------------------

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)              # remove URLs
    text = re.sub(r"[^a-zA-Z\u0900-\u097F ]", "", text)  # keep English + Hindi chars
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [None]:
liar_path = "/content/drive/MyDrive/Fake News Project/LIAR/train.tsv"

column_names = [
    "id","label","statement","subject","speaker","speaker_job","state","party",
        "barely_true_counts","false_counts","half_true_counts","mostly_true_counts",
            "pants_on_fire_counts","context"
            ]
df = pd.read_csv(liar_path, sep="\t", header=None, names=column_names)
print(df[["label","statement"]].head())


         label                                          statement
0        false  Says the Annies List political group supports ...
1    half-true  When did the decline of coal start? It started...
2  mostly-true  Hillary Clinton agrees with John McCain "by vo...
3        false  Health care reform legislation is likely to ma...
4    half-true  The economic turnaround started at the end of ...


In [None]:
df["clean_text"] = df["statement"].apply(clean_text)

In [None]:
tfidf_liar = TfidfVectorizer(max_features=3000)
X_liar = tfidf_liar.fit_transform(df["clean_text"])
y_liar = df["label"]

print("LIAR TF-IDF Shape:", X_liar.shape)

LIAR TF-IDF Shape: (10240, 3000)


In [None]:
# ---------------------------------------------------------
# LOAD FAKEDDIT DATASET
# ---------------------------------------------------------
fakeddit_path = "/content/drive/MyDrive/Fake News Project/Fakeddit/multimodal_train.tsv"

fakeddit = pd.read_csv(fakeddit_path, sep="\t")

print("Fakeddit Loaded:", fakeddit.shape)
print(fakeddit.head())

Fakeddit Loaded: (564000, 16)
           author                                        clean_title  \
0     Alexithymia  my walgreens offbrand mucinex was engraved wit...   
1        VIDCAs17                this concerned sink with a tiny hat   
2  prometheus1123      hackers leak emails from uae ambassador to us   
3             NaN                           puppy taking in the view   
4       3rikR3ith               i found a face in my sheet music too   

    created_utc         domain  hasImage      id  \
0  1.551641e+09    i.imgur.com      True  awxhir   
1  1.534727e+09      i.redd.it      True  98pbid   
2  1.496511e+09  aljazeera.com      True  6f2cy5   
3  1.471341e+09    i.imgur.com      True  4xypkv   
4  1.525318e+09      i.redd.it      True  8gnet9   

                                           image_url linked_submission_id  \
0  https://external-preview.redd.it/WylDbZrnbvZdB...                  NaN   
1  https://preview.redd.it/wsfx0gp0f5h11.jpg?widt...                  

In [None]:
# Cleaned text
fakeddit["clean_text"] = fakeddit["clean_title"].apply(clean_text)

In [None]:
y_fakeddit = fakeddit["6_way_label"]
print("Unique Fakeddit 6-class labels:", y_fakeddit.unique())

Unique Fakeddit 6-class labels: [0 2 4 5 1 3]


In [None]:
tfidf_fakeddit = TfidfVectorizer(max_features=3000)
X_fakeddit = tfidf_fakeddit.fit_transform(fakeddit["clean_text"])

print("Fakeddit TF-IDF shape:", X_fakeddit.shape)

Fakeddit TF-IDF shape: (564000, 3000)


In [None]:
# ---------------------------------------------------------
# LOAD BHARATFAKENEWSKOSH
# ---------------------------------------------------------

bfk_path = "/content/drive/MyDrive/Fake News Project/BharatFakeNewsKosh/BharatFakeNewsKosh.xlsx"

bfk = pd.read_excel(bfk_path)

print("BharatFakeNewsKosh Loaded:", bfk.shape)
bfk.head()

BharatFakeNewsKosh Loaded: (26232, 19)


Unnamed: 0,id,Author_Name,Fact_Check_Source,Source_Type,Statement,Eng_Trans_Statement,News Body,Eng_Trans_News_Body,Media_Link,Publish_Date,Fact_Check_Link,News_Category,Language,Region,Platform,Text,Video,Image,Label
0,BFNK_1,Shinjinee Majumder,Alt News,IFCN,फ़ैक्ट-चेक: तेलंगाना में एक रिपोर्टर ने गृह मंत...,Fact-check: A reporter in Telangana stopped sp...,सोशल मीडिया पर एक वीडियो वायरल है जिसमें एक पत...,A video is viral on social media in which a jo...,https://i0.wp.com/www.altnews.in/Hindi/wp-cont...,9th July 2022,https://www.altnews.in/Hindi/fact-check-was-am...,Politics,Hindi,Telangana,Twitter,no,yes,no,False
1,BFNK_2,Kalim Ahmed,Alt News,IFCN,PM मोदी को UAE का सर्वोच्च नागरिक सम्मान मिलने...,Share by stating the old video of PM Modi's hi...,प्रधानमंत्री नरेंद्र मोदी को सोने की चेन से सम...,A video of Prime Minister Narendra Modi being ...,https://i0.wp.com/www.altnews.in/Hindi/wp-cont...,9th July 2022,https://www.altnews.in/Hindi/old-video-of-pm-m...,Politics,Hindi,National,Twitter,no,yes,no,False
2,BFNK_3,Abhishek Kumar,Alt News,IFCN,वायरल तस्वीर में सुप्रीम कोर्ट के जज सूर्यकांत...,Supreme Court judges Suryakant and JB Pardiwal...,बीते दिनों नूपुर शर्मा ने टीवी डिबेट में पैगम्...,"Recently, Nupur Sharma made an objectionable c...",https://i0.wp.com/www.altnews.in/Hindi/wp-cont...,7th July 2022,https://www.altnews.in/Hindi/false-claim-with-...,Politics,Hindi,National,Twitter,no,no,yes,False
3,BFNK_4,Abhishek Kumar,Alt News,IFCN,मीडिया ने दी ग़लत ख़बर: कटनी में मुस्लिम सरपंच क...,Media gave wrong news: After the victory of Mu...,एक वीडियो सोशल मीडिया पर वायरल है. इसे शेयर कर...,A video is viral on social media. While sharin...,https://i0.wp.com/www.altnews.in/Hindi/wp-cont...,5th July 2022,https://www.altnews.in/Hindi/media-misreport-p...,Politics,Hindi,Madhya Pradesh,Twitter,no,no,yes,False
4,BFNK_5,Kinjal,Alt News,IFCN,महिला ने राहुल गांधी को कश्मीर मुद्दे पर मोदी ...,The woman lashed out at Rahul Gandhi to oppose...,सोशल मीडिया पर राहुल गांधी का एक वीडियो वायरल ...,A video of Rahul Gandhi has gone viral on soci...,https://i0.wp.com/www.altnews.in/Hindi/wp-cont...,4th July 2022,https://www.altnews.in/Hindi/2019-video-of-wom...,Politics,Hindi,Kashmir,Twitter,no,yes,no,True


In [None]:
print("\nLanguages:", bfk["Language"].unique())
print("\nLabels:", bfk["Label"].unique())


Languages: ['Hindi' 'English' 'Bangla' 'Gujarati' 'Malayalam' 'Telugu' 'Assamese'
 'Odia' 'Tamil']

Labels: [False  True]


In [None]:
# Use English translation of the statement
bfk["clean_text"] = bfk["Eng_Trans_Statement"].apply(clean_text)

print("\nCleaned Sample:")
print(bfk[["Eng_Trans_Statement", "clean_text"]].head())


Cleaned Sample:
                                 Eng_Trans_Statement  \
0  Fact-check: A reporter in Telangana stopped sp...   
1  Share by stating the old video of PM Modi's hi...   
2  Supreme Court judges Suryakant and JB Pardiwal...   
3  Media gave wrong news: After the victory of Mu...   
4  The woman lashed out at Rahul Gandhi to oppose...   

                                          clean_text  
0  factcheck a reporter in telangana stopped spea...  
1  share by stating the old video of pm modis hig...  
2  supreme court judges suryakant and jb pardiwal...  
3  media gave wrong news after the victory of mus...  
4  the woman lashed out at rahul gandhi to oppose...  


In [None]:
tfidf_bfk = TfidfVectorizer(max_features=3000)
X_bfk = tfidf_bfk.fit_transform(bfk["clean_text"])

y_bfk = bfk["Label"]

print("Multilingual TF-IDF Shape:", X_bfk.shape)

Multilingual TF-IDF Shape: (26232, 3000)


In [None]:
# ---------------------------------------------------------
# TRAIN/TEST SPLITS FOR ALL 3 DATASETS
# ---------------------------------------------------------

# 1️⃣ LIAR Dataset (Truthfulness Classification)
X_train_liar, X_test_liar, y_train_liar, y_test_liar = train_test_split(
    X_liar, y_liar, test_size=0.2, random_state=42
)
print("LIAR Train/Test Split:")
print("Train:", X_train_liar.shape)
print("Test:", X_test_liar.shape)
print("\n")

# 2️⃣ Fakeddit Dataset (Fake News Type Classification)
X_train_fak, X_test_fak, y_train_fak, y_test_fak = train_test_split(
    X_fakeddit, y_fakeddit, test_size=0.2, random_state=42
)
print("Fakeddit Train/Test Split:")
print("Train:", X_train_fak.shape)
print("Test:", X_test_fak.shape)
print("\n")

# 3️⃣ BharatFakeNewsKosh (Multilingual Fake News Classification)
X_train_bfk, X_test_bfk, y_train_bfk, y_test_bfk = train_test_split(
    X_bfk, y_bfk, test_size=0.2, random_state=42
)
print("BharatFakeNewsKosh Train/Test Split:")
print("Train:", X_train_bfk.shape)
print("Test:", X_test_bfk.shape)

LIAR Train/Test Split:
Train: (8192, 3000)
Test: (2048, 3000)


Fakeddit Train/Test Split:
Train: (451200, 3000)
Test: (112800, 3000)


BharatFakeNewsKosh Train/Test Split:
Train: (20985, 3000)
Test: (5247, 3000)


In [None]:
print(df["label"].unique())


['false' 'half-true' 'mostly-true' 'true' 'barely-true' 'pants-fire']


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Create the model
lr_model = LogisticRegression(max_iter=1000)

# Train the model
lr_model.fit(X_train_liar, y_train_liar)

# Predict on test data
y_pred_lr = lr_model.predict(X_test_liar)

# Show results
print("Accuracy:", accuracy_score(y_test_liar, y_pred_lr))
print("\nClassification Report:\n")
print(classification_report(y_test_liar, y_pred_lr))


Accuracy: 0.236328125

Classification Report:

              precision    recall  f1-score   support

 barely-true       0.21      0.17      0.18       339
       false       0.23      0.30      0.26       401
   half-true       0.24      0.28      0.26       438
 mostly-true       0.25      0.28      0.26       382
  pants-fire       0.30      0.08      0.13       148
        true       0.25      0.19      0.21       340

    accuracy                           0.24      2048
   macro avg       0.25      0.22      0.22      2048
weighted avg       0.24      0.24      0.23      2048



In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Create SVM model
svm_model = LinearSVC(max_iter=5000)

# Train the model
svm_model.fit(X_train_liar, y_train_liar)

# Predict on test data
y_pred_svm = svm_model.predict(X_test_liar)

# Show results
print("SVM Accuracy:", accuracy_score(y_test_liar, y_pred_svm))
print("\nClassification Report:\n")
print(classification_report(y_test_liar, y_pred_svm))

SVM Accuracy: 0.22021484375

Classification Report:

              precision    recall  f1-score   support

 barely-true       0.18      0.17      0.18       339
       false       0.24      0.25      0.25       401
   half-true       0.24      0.25      0.24       438
 mostly-true       0.23      0.25      0.24       382
  pants-fire       0.20      0.18      0.19       148
        true       0.20      0.18      0.19       340

    accuracy                           0.22      2048
   macro avg       0.22      0.21      0.21      2048
weighted avg       0.22      0.22      0.22      2048



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

lr_fakeddit = LogisticRegression(max_iter=1000)
lr_fakeddit.fit(X_train_fak, y_train_fak)

y_pred_fakeddit = lr_fakeddit.predict(X_test_fak)

print("Fakeddit Accuracy:", accuracy_score(y_test_fak, y_pred_fakeddit))
print(classification_report(y_test_fak, y_pred_fakeddit))
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report



lr_fakeddit = LogisticRegression(max_iter=1000)

lr_fakeddit.fit(X_train_fak, y_train_fak)



y_pred_fakeddit = lr_fakeddit.predict(X_test_fak)



print("Fakeddit Accuracy:", accuracy_score(y_test_fak, y_pred_fakeddit))

print(classification_report(y_test_fak, y_pred_fakeddit))



Fakeddit Accuracy: 0.6779698581560284
              precision    recall  f1-score   support

           0       0.70      0.84      0.76     44222
           1       0.49      0.13      0.21      6738
           2       0.61      0.44      0.51     21627
           3       0.49      0.12      0.19      2292
           4       0.69      0.80      0.74     33685
           5       0.72      0.47      0.57      4236

    accuracy                           0.68    112800
   macro avg       0.62      0.47      0.50    112800
weighted avg       0.66      0.68      0.65    112800

Fakeddit Accuracy: 0.6779698581560284
              precision    recall  f1-score   support

           0       0.70      0.84      0.76     44222
           1       0.49      0.13      0.21      6738
           2       0.61      0.44      0.51     21627
           3       0.49      0.12      0.19      2292
           4       0.69      0.80      0.74     33685
           5       0.72      0.47      0.57      4236

 

In [None]:
fakeddit_label_map = {
      0: "True",
      1: "Satire",
      2: "Misleading",
      3: "False",
      4: "Clickbait",
      5: "Propaganda"
    }

In [None]:
def predict_fake_type(text):
      cleaned = clean_text(text)
      vector = tfidf_fakeddit.transform([cleaned])
      pred_num = lr_fakeddit.predict(vector)[0]
      return fakeddit_label_map.get(pred_num, pred_num)

In [None]:
def predict_truthfulness(text):
      cleaned = clean_text(text)
      vector = tfidf_liar.transform([cleaned])
      prediction = lr_model.predict(vector)
      return prediction[0]

In [None]:
user_input = input("Enter a news statement: ")
print("Truthfulness:", predict_truthfulness(user_input))
print("Type:", predict_fake_type(user_input))


Enter a news statement: Apple is red
Truthfulness: true
Type: True


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

lr_bfk = LogisticRegression(max_iter=1000)
lr_bfk.fit(X_train_bfk, y_train_bfk)

y_pred_lr_bfk = lr_bfk.predict(X_test_bfk)

print("Logistic Regression Accuracy:", accuracy_score(y_test_bfk, y_pred_lr_bfk))
print(classification_report(y_test_bfk, y_pred_lr_bfk))
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report



lr_bfk = LogisticRegression(max_iter=1000)

lr_bfk.fit(X_train_bfk, y_train_bfk)



y_pred_lr_bfk = lr_bfk.predict(X_test_bfk)



print("Logistic Regression Accuracy:", accuracy_score(y_test_bfk, y_pred_lr_bfk))

print(classification_report(y_test_bfk, y_pred_lr_bfk))

Logistic Regression Accuracy: 0.5761387459500668
              precision    recall  f1-score   support

       False       0.44      0.15      0.23      2137
        True       0.60      0.87      0.71      3110

    accuracy                           0.58      5247
   macro avg       0.52      0.51      0.47      5247
weighted avg       0.53      0.58      0.51      5247

Logistic Regression Accuracy: 0.5761387459500668
              precision    recall  f1-score   support

       False       0.44      0.15      0.23      2137
        True       0.60      0.87      0.71      3110

    accuracy                           0.58      5247
   macro avg       0.52      0.51      0.47      5247
weighted avg       0.53      0.58      0.51      5247



In [None]:
from sklearn.svm import LinearSVC

svm_bfk = LinearSVC()
svm_bfk.fit(X_train_bfk, y_train_bfk)

y_pred_svm_bfk = svm_bfk.predict(X_test_bfk)

print("SVM Accuracy:", accuracy_score(y_test_bfk, y_pred_svm_bfk))
print(classification_report(y_test_bfk, y_pred_svm_bfk))


SVM Accuracy: 0.5477415666094911
              precision    recall  f1-score   support

       False       0.41      0.26      0.32      2137
        True       0.59      0.74      0.66      3110

    accuracy                           0.55      5247
   macro avg       0.50      0.50      0.49      5247
weighted avg       0.52      0.55      0.52      5247



In [None]:
small_bfk = bfk.sample(n=6000, random_state=42).reset_index(drop=True)

labels = small_bfk["Label"].astype("category")
label2id = {l:i for i,l in enumerate(labels.cat.categories)}
id2label = {i:l for l,i in label2id.items()}

small_bfk["label_id"] = labels.map(label2id)

In [None]:
from datasets import Dataset

dataset_bfk = Dataset.from_pandas(
    small_bfk[["Text", "label_id"]].rename(columns={"Text": "text", "label_id": "label"})
    )
dataset_bfk = dataset_bfk.train_test_split(test_size=0.2)


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

    dataset_bfk = dataset_bfk.map(tokenize, batched=True)
    dataset_bfk.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForSequenceClassification

model_bfk = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
        num_labels=len(label2id),
            id2label=id2label,
                label2id=label2id
                )


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-multilingual-cased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [None]:
# Re-tokenize properly
def tokenize(batch):
    return tokenizer(
            batch["text"],
                    padding="max_length",
                            truncation=True,
                                    max_length=128
                                        )

dataset_bfk = dataset_bfk.map(tokenize, batched=True)

dataset_bfk.set_format(
                                            type="torch",
                                                columns=["input_ids", "attention_mask", "label"]
                                                )

                                                # Training
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
                                                    output_dir="./mbbert_bharat",
                                                        num_train_epochs=4,
                                                            per_device_train_batch_size=8,
                                                    learning_rate=2e-5,
                                                        weight_decay=0.01,

                                                                logging_steps=100
                                                                )
trainer_bfk = Trainer(
                                                                    model=model_bfk,
                                                                        args=training_args,
                                                                            train_dataset=dataset_bfk["train"],
                                                                                eval_dataset=dataset_bfk["test"]
                                                                                )

trainer_bfk.train()


Map:   0%|          | 0/4800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Step,Training Loss
100,0.691352
200,0.688961
300,0.679566
400,0.667953
500,0.681669
600,0.67545
700,0.688784
800,0.669819
900,0.676459
1000,0.679741


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=2400, training_loss=0.6784607028961182, metrics={'train_runtime': 753.1603, 'train_samples_per_second': 25.493, 'train_steps_per_second': 3.187, 'total_flos': 1262933065728000.0, 'train_loss': 0.6784607028961182, 'epoch': 4.0})

In [None]:
metrics = trainer_bfk.evaluate()
print(metrics)


{'eval_loss': 0.6744440197944641, 'eval_runtime': 8.8912, 'eval_samples_per_second': 134.965, 'eval_steps_per_second': 16.871, 'epoch': 4.0}


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}


In [None]:
import torch
from torch import nn
from transformers import Trainer

# Compute weights once using label_id
label_counts = small_bfk["label_id"].value_counts().sort_index()
class_weights = torch.tensor(
    1.0 / label_counts.values,
        dtype=torch.float
        )
class_weights = class_weights / class_weights.sum()

class WeightedTrainer(Trainer):
            def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
                    labels = inputs.get("labels")
                    outputs = model(**inputs)
                    logits = outputs.get("logits")

                    loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(model.device))
                    loss = loss_fct(logits, labels)

                    return (loss, outputs) if return_outputs else loss




In [None]:
trainer_bfk = WeightedTrainer(
      model=model_bfk,
          args=training_args,
              train_dataset=dataset_bfk["train"],
                  eval_dataset=dataset_bfk["test"],
                      compute_metrics=compute_metrics
                      )

In [None]:
trainer_bfk.train()

Step,Training Loss
100,0.696536
200,0.698807
300,0.696574
400,0.692898
500,0.699659
600,0.692455
700,0.694061
800,0.693045
900,0.693271
1000,0.695156


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=2400, training_loss=0.6947615559895833, metrics={'train_runtime': 712.9768, 'train_samples_per_second': 26.929, 'train_steps_per_second': 3.366, 'total_flos': 1262933065728000.0, 'train_loss': 0.6947615559895833, 'epoch': 4.0})

In [None]:
trainer_bfk.evaluate()

{'eval_loss': 0.6928268074989319,
 'eval_accuracy': 0.5966666666666667,
 'eval_runtime': 9.1011,
 'eval_samples_per_second': 131.852,
 'eval_steps_per_second': 16.482,
 'epoch': 4.0}

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
import joblib

project_path = "/content/drive/MyDrive/Fake News Project"
os.makedirs(project_path, exist_ok=True)

# Save classical ML models
joblib.dump(lr_bfk, project_path + "/lr_bharat.pkl")
joblib.dump(svm_bfk, project_path + "/svm_bharat.pkl")
joblib.dump(tfidf_bfk, project_path + "/tfidf_bharat.pkl")

# Save mBERT model
trainer_bfk.save_model(project_path + "/mbert_bharat_model")
tokenizer.save_pretrained(project_path + "/mbert_bharat_model")

print("All models saved successfully!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

All models saved successfully!


In [None]:
import os
os.listdir("/content/drive/MyDrive/Fake News Project")

['LIAR',
 'Fakeddit',
 'BharatFakeNewsKosh',
 'fake_news_progress.ipynb',
 'lr_bharat.pkl',
 'svm_bharat.pkl',
 'tfidf_bharat.pkl',
 'mbert_bharat_model']