# DATASET 1

In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset

!pip install transformers
!pip install accelerate -U
!pip install sklearn

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 requires a pad token

fake_news_path = r"C:\Users\adity\Desktop\NLP\April 11\DATASET 1\fake.csv"
true_news_path = r"C:\Users\adity\Desktop\NLP\April 11\DATASET 1\true.csv"
fake_news = pd.read_csv(fake_news_path)
true_news = pd.read_csv(true_news_path)

fake_news['label'] = 0
true_news['label'] = 1
combined_news = pd.concat([fake_news, true_news], ignore_index=True)

combined_news = combined_news.sample(frac=0.03).reset_index(drop=True)

train_news, eval_news = train_test_split(combined_news, test_size=0.2)  # 20% for evaluation

class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def prepare_dataset(news):
    encodings = tokenizer(news['text'].tolist(), return_tensors='pt', max_length=512, truncation=True, padding='max_length', add_special_tokens=True)
    labels = news['label'].tolist()
    return NewsDataset(encodings, labels)

train_dataset = prepare_dataset(train_news)
eval_dataset = prepare_dataset(eval_news)

model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
model.config.pad_token_id = model.config.eos_token_id

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()
trainer.evaluate()


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


Collecting accelerate
  Obtaining dependency information for accelerate from https://files.pythonhosted.org/packages/1b/e8/2fc7af3fa77ddac89a9c9b390d2d31d1db0612247ba2274009946959604e/accelerate-0.29.2-py3-none-any.whl.metadata
  Downloading accelerate-0.29.2-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
   ---------------------------------------- 0.0/297.4 kB ? eta -:--:--
   - -------------------------------------- 10.2/297.4 kB ? eta -:--:--
   - -------------------------------------- 10.2/297.4 kB ? eta -:--:--
   -------- ------------------------------ 61.4/297.4 kB 544.7 kB/s eta 0:00:01
   -------- ------------------------------ 61.4/297.4 kB 544.7 kB/s eta 0:00:01
   ---------------------------- --------- 225.3/297.4 kB 981.9 kB/s eta 0:00:01
   ---------------------------------------- 297.4/297.4 kB 1.2 MB/s eta 0:00:00
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accel

  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [15 lines of output]
  The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  rather than 'sklearn' for pip commands.
  
  Here is how to fix this error in the main use cases:
  - use 'pip install scikit-learn' rather than 'pip install sklearn'
  - replace 'sklearn' by 'scikit-learn' in your pip requirements files
    (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  - if the 'sklearn' package is used by one of your dependencies,
    it would be great if you take some time to track which package uses
    'sklearn' instead of 'scikit-learn' and report it to their issue tracker
  - as a last resort, set the environment variable
    SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
  
  More information is available at
  https://github.com/scikit-learn/sklearn-pypi-package
  [end of output]
  
  note: This error originates from a subpr

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,1.0687
20,0.4602
30,0.3772
40,0.1785
50,0.0183
60,0.0032
70,0.0005
80,0.0
90,0.0
100,0.0015


{'eval_loss': 0.13819581270217896,
 'eval_runtime': 391.896,
 'eval_samples_per_second': 0.689,
 'eval_steps_per_second': 0.087,
 'epoch': 1.0}

# EVALUATION

In [2]:
from torch.utils.data import DataLoader

eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)  # Adjust batch_size as needed

model.eval() 
predictions, true_labels = [], []

with torch.no_grad():
    for batch in eval_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        pred_labels = torch.argmax(logits, dim=-1)
        predictions.extend(pred_labels.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [3]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

predictions = np.array(predictions)
true_labels = np.array(true_labels)

accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')

print(f"Accuracy: {accuracy :.2f}")
print(f"Precision: {precision :.2f}")
print(f"Recall: {recall :.2f}")
print(f"F1 Score: {f1 :.2f}")



Accuracy: 0.98
Precision: 0.97
Recall: 0.99
F1 Score: 0.98


In [4]:
import numpy as np
from sklearn.metrics import classification_report
import pandas as pd

predictions = np.array(predictions)
true_labels = np.array(true_labels)

report = classification_report(true_labels, predictions, target_names=['Fake', 'True'], output_dict=True)

report_df = pd.DataFrame(report).transpose()

report_df = report_df.round(2).fillna('')

print(report_df)

              precision  recall  f1-score  support
Fake               0.99    0.97      0.98   132.00
True               0.97    0.99      0.98   138.00
accuracy           0.98    0.98      0.98     0.98
macro avg          0.98    0.98      0.98   270.00
weighted avg       0.98    0.98      0.98   270.00
