In [1]:
! pip install -U accelerate
! pip install -U transformers

Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.2


In [8]:
!pip install -U accelerate

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import pandas as pd

In [3]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # or another variant like "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# The GPT-2 tokenizer defines no padding token, but batching examples of
# different lengths requires one: without it the data collator fails with
# "Asking to pad but the tokenizer does not have a padding token".
# Reusing the end-of-text token avoids growing the vocabulary / embeddings.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [6]:
!unzip Fake.csv.zip

Archive:  Fake.csv.zip
  inflating: Fake.csv                


In [4]:
class EssaysDataset(Dataset):
    """Map-style dataset that turns rows of a CSV file into token-id tensors.

    The CSV must contain ``subject``, ``title`` and ``text`` columns. Each row
    is rendered into a single prompt of the form::

        [Subject]: <subject>
        [Title]: <title>
        [Text]: <text>

    and encoded with the supplied tokenizer, truncated to ``block_size`` tokens.

    Args:
        file_path: Path of the CSV file, passed to ``pandas.read_csv``.
        tokenizer: Object exposing ``encode(text, max_length=..., truncation=...)``
            that returns a list of integer token ids.
        block_size: Maximum number of tokens kept per example.
    """

    def __init__(self, file_path, tokenizer, block_size):
        self.data = pd.read_csv(file_path)
        self.tokenizer = tokenizer
        self.block_size = block_size

    def __len__(self):
        """Return the number of rows (examples) in the CSV."""
        return len(self.data)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string, mapping missing fields to ``""``.

        pandas reads empty CSV cells as NaN; formatting those directly would
        inject the literal word "nan" into the training text.
        """
        return "" if pd.isna(value) else str(value)

    def __getitem__(self, idx):
        """Build and tokenize the prompt for row ``idx``.

        Returns:
            dict with a single key ``"input_ids"`` holding a 1-D ``torch.long``
            tensor of at most ``block_size`` token ids.

        Raises:
            KeyError: if one of the required columns is absent from the CSV.
        """
        # Fetch the row once instead of re-slicing the frame per column.
        row = self.data.iloc[idx]
        text = self._as_text(row["text"])
        title = self._as_text(row["title"])
        subject = self._as_text(row["subject"])

        input_text = f"[Subject]: {subject}\n[Title]: {title}\n[Text]: {text}"

        # Tokenize and truncate the input text
        input_ids = self.tokenizer.encode(
            input_text, max_length=self.block_size, truncation=True
        )

        return {"input_ids": torch.tensor(input_ids, dtype=torch.long)}

In [6]:
# Input CSV and the output location defined for this notebook.
train_data_path = "Fake.csv"
output_dir = "fine-tuned/model"

# Wrap the CSV in the custom dataset; every example is cut to 128 tokens.
dataset = EssaysDataset(train_data_path, tokenizer, block_size=128)

# Collator for causal language modelling: mlm=False turns off masked-LM mode.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [11]:
# Show which accelerate release the running kernel actually imported.
import accelerate

accelerate.__version__

'0.25.0'

In [None]:
# Training configuration. Checkpoints are written under "fake_news_model"
# every 10,000 steps, and save_total_limit caps how many are kept.
training_args = TrainingArguments(
    output_dir="fake_news_model",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,  # lower this if the GPU runs out of memory
    num_train_epochs=3,  # tune as needed
    save_total_limit=2,
    save_steps=10_000,
)

# Wire the model, the dataset and the collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Run the fine-tuning loop.
trainer.train()

# Persist the fine-tuned weights and the tokenizer into the same directory.
model.save_pretrained("fine_tuned_fake_news_model")
tokenizer.save_pretrained("fine_tuned_fake_news_model")