In [2]:
!pip install transformers==4.38.2
!pip install accelerate -U
!pip install sentencepiece
!pip install rouge
!pip install wandb onnx -Uq



In [4]:
!pip uninstall -y transformers
!pip install transformers --upgrade --no-cache-dir

Found existing installation: transformers 4.38.2
Uninstalling transformers-4.38.2:
  Successfully uninstalled transformers-4.38.2
Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m174.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m240.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers


In [2]:
!git clone https://github.com/aarushi211/TOS-Summarization.git

Cloning into 'TOS-Summarization'...
remote: Enumerating objects: 291, done.[K
remote: Counting objects: 100% (90/90), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 291 (delta 38), reused 38 (delta 16), pack-reused 201 (from 1)[K
Receiving objects: 100% (291/291), 845.92 KiB | 2.50 MiB/s, done.
Resolving deltas: 100% (138/138), done.


In [1]:
from transformers import LEDForConditionalGeneration, LEDTokenizer, Trainer, TrainingArguments,pipeline,PretrainedConfig

In [2]:
# from transformers import LEDForConditionalGeneration, LEDTokenizer, Trainer, TrainingArguments,pipeline,PretrainedConfig
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.model_selection import train_test_split
from rouge import Rouge
import pandas as pd
import os
import wandb
import random
import numpy as np
import accelerate

os.environ["WANDB_PROJECT"]="major-one"
os.environ["WANDB_LOG_MODEL"]="checkpoint"
os.environ["WANDB_WATCH"]="all"



# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
random.seed(hash("setting random seeds") % 2**32 - 1)
np.random.seed(hash("improves reproducibility") % 2**32 - 1)
torch.manual_seed(hash("by removing stochasticity") % 2**32 - 1)
torch.cuda.manual_seed_all(hash("so runs are repeatable") % 2**32 - 1)

# Device configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Dataset location
filename = "./TOS-Summarization/Dataset/all_v1_transpose.csv"

In [None]:
!pip install wandb -U



In [3]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maarushi-jain211[0m ([33mfaltu-team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
df = pd.read_csv(filename)
df = df[['original_text','reference_summary']]
df.rename(columns = {'original_text':'source', 'reference_summary':'target'}, inplace = True)
len(df)

446

In [5]:
X = df['source']
y = df['target']

In [6]:
df.head()

Unnamed: 0,source,target
0,welcome to the pokémon go video game services ...,hi.
1,by using our services you are agreeing to thes...,by playing this game you agree to these terms....
2,if you want to use certain features of the ser...,you have to use google pokemon trainer club or...
3,during game play please be aware of your surro...,don t die or hurt others and if you do it s no...
4,subject to your compliance with these terms ni...,don t copy modify resell distribute or reverse...


In [7]:
class LEDDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels['input_ids'])

In [8]:
def prepare_data(model_name,
                 train_texts, train_labels,
                 test_texts, test_labels):
  """
  Prepare input data for model fine-tuning
  """

  tokenizer = AutoTokenizer.from_pretrained(model_name)
  prepare_test = False if test_texts is None or test_labels is None else True

  def tokenize_data(texts, labels):

    encodings = tokenizer(texts, truncation=True, padding=True, max_length = 600)
    decodings = tokenizer(labels, truncation=True, padding=True, max_length = 256)
    dataset_tokenized = LEDDataset(encodings, decodings)
    return dataset_tokenized

  train_dataset = tokenize_data(train_texts, train_labels)
  test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

  return train_dataset, test_dataset, tokenizer

In [12]:
def prepare_fine_tuning(model_name, tokenizer, train_dataset, test_dataset, freeze_encoder=False, output_dir='./results'):
  """
  Prepare configurations and base model for fine-tuning
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = LEDForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if test_dataset is not None:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      num_train_epochs=2,              # total number of training epochs
      per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
      per_device_eval_batch_size=1,    # batch size for evaluation, can increase if memory allows
      save_steps=500,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      eval_strategy='steps',     # evaluation strategy to adopt during training
      eval_steps=100,                  # number of update steps before evaluation
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=100,
      report_to="wandb",
      run_name = "longformer"
    )

    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      eval_dataset=test_dataset,           # evaluation dataset
      tokenizer=tokenizer
    )

  else:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      num_train_epochs=2,              # total number of training epochs
      per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
      save_steps=500,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=100,
    )

    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      tokenizer=tokenizer
    )

  return trainer

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

train_texts, train_labels = list(X_train), list(y_train)
test_texts, test_labels = list(X_test), list(y_test)

In [14]:
model_name = 'allenai/led-base-16384'

train_dataset,test_dataset, tokenizer = prepare_data(model_name, train_texts, train_labels,test_texts,test_labels)
trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset,test_dataset)

trainer.train()

trainer.evaluate(test_dataset)

wandb.finish()

  trainer = Trainer(


Input ids are automatically padded from 600 to 1024 to be a multiple of `config.attention_window`: 1024


Step,Training Loss,Validation Loss
100,10.0921,5.407535
200,3.4532,1.222938
300,0.9867,0.524449
400,0.6885,0.490826
500,0.5866,0.473507
600,0.5783,0.424223
700,0.3999,0.398012


Input ids are automatically padded from 539 to 1024 to be a multiple of `config.attention_window`: 1024
[34m[1mwandb[0m: Adding directory to artifact (./results/checkpoint-500)... Done. 76.4s
[34m[1mwandb[0m: Adding directory to artifact (./results/checkpoint-712)... Done. 22.2s


0,1
eval/loss,█▂▁▁▁▁▁▁
eval/runtime,▁█▆▅▆▅▆▅
eval/samples_per_second,█▁▃▃▂▃▂▃
eval/steps_per_second,█▁▃▃▂▃▂▃
train/epoch,▁▁▂▂▃▃▄▄▆▆▇▇████
train/global_step,▁▁▂▂▃▃▄▄▆▆▇▇████
train/grad_norm,█▆▅▂▁▂▁
train/learning_rate,▂▄▅▇█▅▁
train/loss,█▃▁▁▁▁▁

0,1
eval/loss,0.3975
eval/runtime,11.3271
eval/samples_per_second,7.946
eval/steps_per_second,7.946
total_flos,281622926131200.0
train/epoch,2.0
train/global_step,712.0
train/grad_norm,2.35757
train/learning_rate,0.0
train/loss,0.3999


In [15]:
import os
if not os.path.exists('./ouput_model/'):
    os.makedirs('./ouput_model/')
trainer.model.save_pretrained("./ouput_model/")

Inference

In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [17]:
config = PretrainedConfig.from_json_file('./ouput_model/config.json')

In [18]:
model = LEDForConditionalGeneration.from_pretrained("./ouput_model/").to(device)

In [19]:
def summarize(text):
  input_tokenized = tokenizer.encode(text, return_tensors='pt',max_length=1024,truncation=True).to(device)
  summary_ids = model.generate(input_tokenized,
                                  num_beams=9,
                                  no_repeat_ngram_size=3,
                                  length_penalty=2.0,
                                  min_length=50,
                                  max_length=150,
                                  early_stopping=True)
  summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0]

  return summary

In [20]:
y_pred = X_test.apply(lambda x: summarize(x))

Input ids are automatically padded from 80 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 26 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 133 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 37 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 52 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 41 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 117 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 62 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 199 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 33 to 1024 to be a multiple of

In [21]:
summary = pd.concat([y_test.to_frame(name="reference_summary"), y_pred.to_frame(name="generated_summary")], axis=1)

In [22]:
rouge = Rouge()

In [23]:
rouge.get_scores(summary['generated_summary'], summary['reference_summary'],avg=True)

{'rouge-1': {'r': 0.5334781875713248,
  'p': 0.2132543266527927,
  'f': 0.2897391765684265},
 'rouge-2': {'r': 0.28461445898637894,
  'p': 0.09521665734860855,
  'f': 0.1349559741024794},
 'rouge-l': {'r': 0.49919100906355796,
  'p': 0.19860182961287098,
  'f': 0.27051192993895784}}

Saving Model to Huggingface

In [24]:
!pip install -q huggingface_hub
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
The token `write-token` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authen

In [25]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model.save_pretrained("longformer-TOS")
tokenizer.save_pretrained("longformer-TOS")

('longformer-TOS/tokenizer_config.json',
 'longformer-TOS/special_tokens_map.json',
 'longformer-TOS/vocab.json',
 'longformer-TOS/merges.txt',
 'longformer-TOS/added_tokens.json',
 'longformer-TOS/tokenizer.json')

In [27]:
from huggingface_hub import create_repo, upload_folder

hf_username = "aarushi-211"
model_name = "TOS-Longformer"
repo_id = f"{hf_username}/{model_name}"

create_repo(repo_id=repo_id, exist_ok=True)

upload_folder(
    folder_path="longformer-TOS",
    repo_id=repo_id,
    repo_type="model"
)

Uploading...:   0%|          | 0.00/648M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aarushi-211/TOS-Longformer/commit/366916ee40eac97632ca17b3942e00ae6f37c5c3', commit_message='Upload folder using huggingface_hub', commit_description='', oid='366916ee40eac97632ca17b3942e00ae6f37c5c3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/aarushi-211/TOS-Longformer', endpoint='https://huggingface.co', repo_type='model', repo_id='aarushi-211/TOS-Longformer'), pr_revision=None, pr_num=None)