## Installation

In [None]:
!pip install transformers torch datasets peft

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.6.2-py3-none-any.whl (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accel

In [None]:
!pip install wandb -qU

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.1 MB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.2/249.2 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from there and
# the merged model is written back there later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
from tqdm import tqdm
import wandb
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, AdamW, GPT2Tokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

## Datasets and tokenization

In [None]:
# Location of the tab-separated headline file on the mounted Drive.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/Fake News Generation/NewsCategoryDataset.csv'
# Fixed seed so the train/test split is the same on every run.
SPLIT_SEED = 42

# With a single `data_files` entry, every row ends up in the 'train' split.
dataset = load_dataset('csv', delimiter='\t', data_files=DATA_PATH)

# Hold out 1% of the rows for evaluation. Seeding the shuffle keeps the
# validation losses comparable between runs.
dataset = dataset['train'].train_test_split(test_size=0.01, seed=SPLIT_SEED)

In [None]:
# Look at one training row to check the column names and the text format.
dataset['train'][1]

{'Unnamed: 0': 34466,
 'headline': "trump's approach to the opioid epidemic: neglect treatment, ignore the experts",
 'category': 'POLITICS'}

## Baseline

In [None]:
# Checkpoint name used to load both the tokenizer and the model in this cell.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Use the end-of-sequence token for padding so that `padding='max_length'`
# in tokenize_function has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """
    Tokenize a batch of rows as "CATEGORY: headline" strings for causal-LM training.

    Args:
        examples: batch dict with parallel 'category' and 'headline' lists,
            as passed by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output (`input_ids` and `attention_mask`, padded or
        truncated to 50 tokens) with an added `labels` entry.

    Labels are a copy of the input ids in which padding positions are set to
    -100 so they do not contribute to the loss. The first padding token of
    each row is kept as a target: the pad token is the EOS token, so it is
    the only end-of-headline signal available to the model.
    """
    text = [f"{category}: {headline}" for category, headline in zip(examples['category'], examples['headline'])]
    result = tokenizer(text, truncation=True, padding='max_length', max_length=50)

    labels = []
    for input_ids, attention_mask in zip(result['input_ids'], result['attention_mask']):
        # Count of real tokens; padding is appended on the right.
        real_len = sum(attention_mask)
        row = list(input_ids)
        # Index real_len holds the first pad (the EOS target) and is kept;
        # everything after it is masked out of the loss.
        row[real_len + 1:] = [-100] * len(row[real_len + 1:])
        labels.append(row)

    result['labels'] = labels
    return result

# Tokenize both splits in batches; this adds the input_ids, attention_mask
# and labels columns next to the original ones.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Return these three columns of the train split as PyTorch tensors.
tokenized_datasets['train'].set_format("pt", ['input_ids','attention_mask', 'labels'])

# Load the pretrained checkpoint and move the model to the GPU.
model = GPT2LMHeadModel.from_pretrained(model_name).to("cuda")

Map:   0%|          | 0/207431 [00:00<?, ? examples/s]

Map:   0%|          | 0/2096 [00:00<?, ? examples/s]

In [None]:
# Check that the tokenized columns sit alongside the original ones.
print(tokenized_datasets['train'])

Dataset({
    features: ['Unnamed: 0', 'headline', 'category', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 207431
})


In [None]:
# Print the module tree; the layer names shown here (c_attn, c_proj) are the
# ones referenced by the LoRA configuration below.
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


## PEFT + LoRA

In [None]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters require gradients.

    Prints the trainable element count, the total element count and the
    trainable share as a percentage with two decimals.
    """
    # One pass over the parameters: (element count, requires_grad) per tensor.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable / total
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {share:.2f}")

In [None]:
# LoRA adapter settings (hyperparameters taken from the article).
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=4,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1,
    # The targeted GPT-2 layers are Conv1D modules, which store their weight
    # as (fan_in, fan_out); PEFT expects this flag to be set for them.
    fan_in_fan_out=True,
)
# Wrap the base model with the adapter and report how small the trainable part is.
lora_model = get_peft_model(model, config)
print_trainable_parameters(lora_model)

trainable params: 405504 || all params: 124845312 || trainable%: 0.32


In [None]:
# Inspect one tokenized training example (ids, attention mask and labels).
tokenized_datasets['train'][0]

{'input_ids': tensor([45359,    43,  6322, 10892,    25,   257,  9095,   338,  1570,   286,
          4173, 30188,   832,  5205, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]),
 'labels': tensor([45359,    43,  6322, 10892,    25,   257,  9095,   338,  1570,   286,
          4173, 30188,   832,  5205, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50

In [None]:
# Authenticate with Weights & Biases; paste your own API key when prompted.
wandb.login()
#use your wandb token

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Run counter: names the wandb run and the folder the merged weights are saved to.
run = 2
# Learning rate and epoch count recorded in the wandb run config.
lr = 2e-4
epochs = 1

In [None]:
# Hyperparameters and run metadata tracked with the run.
run_metadata = dict(
    learning_rate=lr,
    architecture="GPT2LMHeadModel",
    dataset="big",
    epochs=epochs,
)

# Start a run in the "FakeNewsLora" project, named after the run counter.
wandb.init(project="FakeNewsLora", name=f"experiment_{run}", config=run_metadata)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

## Training

In [None]:
# Prefix of the directory the Trainer writes its checkpoints to.
model_name = "LORA_test"

training_args = TrainingArguments(
    output_dir=f"{model_name}-first",
    # Use the same `lr` and `epochs` values that are logged to wandb, so the
    # recorded run config cannot drift from what is actually trained.
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=64,
    # Evaluate on the held-out split every 300 steps.
    eval_steps=300,
    evaluation_strategy="steps",
    # Columns are trimmed by hand elsewhere in the notebook, so the Trainer
    # is told not to prune them itself.
    remove_unused_columns=False,
    report_to="wandb",
)

In [None]:
# Drop the raw columns so only the model inputs remain (the Trainer is set
# up with remove_unused_columns=False and will not prune them itself).
# Only columns that are still present are removed, so re-running this cell
# does not fail on already-removed columns.
raw_columns = ["Unnamed: 0", "headline", "category"]
present = [name for name in raw_columns if name in tokenized_datasets['train'].column_names]
if present:
    tokenized_datasets = tokenized_datasets.remove_columns(present)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 207431
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2096
    })
})

In [None]:
# Train the LoRA-wrapped model on the train split and evaluate on the
# held-out test split, using the schedule defined in training_args.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test']
)

# Runs the full training loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss
300,No log,1.305532
600,1.547400,1.260358
900,1.547400,1.239687
1200,1.330700,1.227133
1500,1.304200,1.218743
1800,1.304200,1.212906
2100,1.287200,1.208794
2400,1.287200,1.204785
2700,1.280200,1.203195
3000,1.274600,1.200993


TrainOutput(global_step=3242, training_loss=1.3326766092346305, metrics={'train_runtime': 2737.0996, 'train_samples_per_second': 75.785, 'train_steps_per_second': 1.184, 'total_flos': 5318209570867200.0, 'train_loss': 1.3326766092346305, 'epoch': 1.0})

## Save + load PEFT model

In [None]:
# Save the trained PEFT model to the "lora-weights" directory (relative to the working directory).
# NOTE(review): save_adapter / save_config do not appear among the parameters
# documented for PEFT's save_pretrained — confirm they have any effect.
lora_model.save_pretrained("lora-weights", save_adapter=True, save_config=True)

In [None]:
# Reload the saved adapter on top of a fresh GPT-2 base model.
# PeftModel.from_pretrained takes the base model first and the adapter
# directory second; "lora-weights" is the directory the adapter was saved to.
base_model = GPT2LMHeadModel.from_pretrained("gpt2").to("cuda")
model_to_merge = PeftModel.from_pretrained(base_model, "lora-weights")
# Fold the LoRA weights into the base weights and drop the PEFT wrappers.
merged_model = model_to_merge.merge_and_unload()
# Store the merged model on Drive, in a folder named after the run counter.
merged_model.save_pretrained(f"/content/drive/MyDrive/Colab Notebooks/Fake News Generation/{run}_lora_weights")

In [None]:
# Load the merged checkpoint directly from Drive and move it to the GPU.
# Loading the stock "gpt2" checkpoint first is unnecessary: from_pretrained
# is a classmethod and builds the model from the given directory.
model2 = GPT2LMHeadModel.from_pretrained(f"/content/drive/MyDrive/Colab Notebooks/Fake News Generation/{run}_lora_weights").to('cuda')


## Inference

In [None]:
def generate_headline(model, prompt):
    """
    Generate one headline continuation for a category prompt such as "SPORTS:".

    Args:
        model: causal language model exposing `generate` and `device`.
        prompt: text the generated headline should continue.

    Returns:
        The decoded continuation, without the prompt tokens and without
        special tokens.
    """
    # Place the inputs on the model's own device instead of assuming a GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]
    output = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly: the pad id equals the EOS id, so it
        # cannot be inferred reliably from the ids alone.
        attention_mask=inputs["attention_mask"],
        max_length=80,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Keep only the newly generated tokens of the first (only) sequence.
    headline = tokenizer.decode(output[0, prompt_len:], skip_special_tokens=True)
    return headline

# Print five sampled headlines for the sports category.
prompt = "SPORTS:"
for attempt in range(5):
    sampled = generate_headline(model2, prompt)
    print(sampled)

 jimmy mccartney, jimmie kardashian, and jennifer o'hara are on the verge of breaking the record for most women's basketball wins
 james floyd is the greatest football player of all time
 u.s. women's national soccer team wins gold medal in silver medal game
 u.s. women's basketball team celebrates its 50th anniversary
 u.s. women's national soccer team will play in columbia
