# Training an Article Title Generation Model with Bart

## Install libraries and download the dataset

Load kaggle.json file.

In [27]:
!pip install datasets==2.21.0 transformers rouge-score nltk

Collecting datasets==2.21.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.0.0
    Uninstalling datasets-3.0.0:
      Successfully uninstalled datasets-3.0.0
Successfully installed datasets-2.21.0


In [2]:
!kaggle

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.10/dist-packages/kaggle/__init__.py", line 7, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py", line 407, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.config/kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/


In [3]:
!cp kaggle.json ~/.kaggle/kaggle.json

cp: cannot stat 'kaggle.json': No such file or directory


In [4]:
!kaggle datasets download -d fabiochiusano/medium-articles

Dataset URL: https://www.kaggle.com/datasets/fabiochiusano/medium-articles
License(s): CC0-1.0
Downloading medium-articles.zip to /content
 99% 367M/369M [00:21<00:00, 23.8MB/s]
100% 369M/369M [00:21<00:00, 18.1MB/s]


In [5]:
!pip install accelerate -U



## Load the dataset

In [10]:
import transformers
from datasets import load_dataset, load_metric, Dataset

In [2]:
medium_datasets = load_dataset("csv", data_files="medium-articles.zip")

In [3]:
medium_datasets

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags'],
        num_rows: 192368
    })
})

## Dataset train/validation/test split

In [11]:
!git clone https://github.com/anukvma/group18_final_project.git

Cloning into 'group18_final_project'...
remote: Enumerating objects: 208, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 208 (delta 19), reused 8 (delta 4), pack-reused 170 (from 1)[K
Receiving objects: 100% (208/208), 3.68 MiB | 19.14 MiB/s, done.
Resolving deltas: 100% (104/104), done.


In [4]:
import os
import json
import pandas as pd

# Define the folder containing the text files
folder_path = '/content/group18_final_project/aiml_question_answers/AIML_QnA_Content/Group18_AIML_QA.csv'
df = pd.read_csv("/content/group18_final_project/aiml_question_answers/AIML_QnA_Content/Group18_AIML_QA.csv", names=['id','question','answer','unit'],encoding='unicode_escape',header=0)
df1 = pd.read_csv("/content/group18_final_project/aiml_question_answers/sampled_qa_data.csv", names=['id','question','answer','unit'],encoding='unicode_escape',header=0)
df = pd.concat([df, df1])


In [13]:
df.head()

Unnamed: 0,id,question,answer,unit
0,1.0,What is a linear classifier?,A linear classifier is a model that makes pred...,1.0
1,2.0,How does a linear classifier make predictions?,A linear classifier predicts by calculating th...,1.0
2,3.0,What is the objective function in a linear cla...,The objective function often used is the loss ...,1.0
3,4.0,What is gradient descent?,Gradient descent is an optimization algorithm ...,1.0
4,5.0,How does learning rate affect gradient descent?,The learning rate controls the step size in gr...,1.0


In [5]:
df.dropna(axis=0, inplace=True)

In [6]:
df.isna().sum()

Unnamed: 0,0
id,0
question,0
answer,0
unit,0


In [21]:
df = df.sample(frac=1).reset_index(drop=True)

In [22]:
train_dataset: Dataset = Dataset.from_pandas(df[:800])
validation_dataset: Dataset = Dataset.from_pandas(df[800:900])
test_dataset: Dataset = Dataset.from_pandas(df[900:])

In [None]:
train_dataset

Dataset({
    features: ['id', 'question', 'answer', 'unit', '__index_level_0__'],
    num_rows: 700
})

In [23]:
# keep only a subsample of the datasets
medium_datasets["train"] = train_dataset
medium_datasets["validation"] = validation_dataset
medium_datasets["test"] = test_dataset


In [18]:
medium_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'unit', '__index_level_0__'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['id', 'question', 'answer', 'unit', '__index_level_0__'],
        num_rows: 100
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'unit', '__index_level_0__'],
        num_rows: 127
    })
})

## Data preprocessing

In [11]:
import nltk
nltk.download('punkt')
import string
from transformers import AutoTokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
model_checkpoint = "facebook/bart-large-xsum"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [24]:
prefix = "Answer the AIML question: "

max_input_length = 512
max_target_length = 512

def clean_text(text):
  sentences = nltk.sent_tokenize(text.strip())
  sentences_cleaned = [s for sent in sentences for s in sent.split("\n")]
  sentences_cleaned_no_titles = [sent for sent in sentences_cleaned
                                 if len(sent) > 0 and
                                 sent[-1] in string.punctuation]
  text_cleaned = "\n".join(sentences_cleaned_no_titles)
  return text_cleaned

def preprocess_data(examples):
  texts_cleaned = [clean_text(text) for text in examples["question"]]
  inputs = [prefix + text for text in texts_cleaned]
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

  # Setup the tokenizer for targets
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(examples["answer"], max_length=max_target_length,
                       truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [25]:
tokenized_datasets = medium_datasets.map(preprocess_data, batched=True)
tokenized_datasets

Map:   0%|          | 0/800 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/127 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'unit', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['id', 'question', 'answer', 'unit', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'unit', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 127
    })
})

## Fine-tune T5

In [15]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
!rm -r {model_dir}

rm: cannot remove '{model_dir}': No such file or directory


In [27]:
batch_size = 8
model_name = "bart-aiml-question-answer"
model_dir = f"/content/{model_name}"
args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=6,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)



In [17]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [18]:
import numpy as np

metric = load_metric("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

  metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

The repository for rouge contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rouge.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [28]:
# Function that returns an untrained model to be trained
def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [29]:
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,3.0146,2.57083,33.1375,14.4156,27.3744,28.1903,31.85
200,2.0877,2.501255,35.0152,15.1453,28.7417,29.8227,34.11
300,1.5074,2.540349,35.207,14.8075,28.7006,30.7159,41.57
400,1.0894,2.717226,34.0065,14.434,27.868,29.71,41.61
500,0.7766,2.945216,36.2289,16.2099,29.7204,31.3758,40.04
600,0.5774,3.062018,35.7867,16.2062,29.8686,31.7197,41.52


Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
There were

TrainOutput(global_step=600, training_loss=1.5088501103719076, metrics={'train_runtime': 686.3418, 'train_samples_per_second': 6.994, 'train_steps_per_second': 0.874, 'total_flos': 321103952216064.0, 'train_loss': 1.5088501103719076, 'epoch': 6.0})

In [30]:
trainer.save_model()

Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


## Load the model from GDrive

In [31]:
model_name = "bart-aiml-question-answer"
model_dir = f"/content/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

max_input_length = 512

In [32]:
text = """
What is a linear classifier?
"""

inputs = ["Answer this AIML question: " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=1, max_length=512)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_title)
# Session State and Callbacks in Streamlit

A linear classifier is a model used to classify data points along a line.


In [33]:
text = """
What is Q-Learning, and how does it work?
"""

inputs = ["Answer this AIML question: " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=1, max_length=512)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_title)
# Conversational AI: The Future of Customer Service

Q-Learning is a Python library that uses self-attention mechanisms to parallelize the training and testing of neural networks.


## Upload the model to the Hugging Space Hub

https://huggingface.co/docs/transformers/model_sharing

In [34]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [36]:
model.push_to_hub(repo_id="anukvma/bart-aiml-question-answer-v2")
tokenizer.push_to_hub(repo_id="anukvma/bart-aiml-question-answer-v2")

Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/anukvma/bart-aiml-question-answer-v2/commit/6f558d7190755cf1329eb892a7d9776fed91fcc1', commit_message='Upload tokenizer', commit_description='', oid='6f558d7190755cf1329eb892a7d9776fed91fcc1', pr_url=None, pr_revision=None, pr_num=None)

## Load the model from the Hugging Face Hub

In [37]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
nltk.download('punkt')
max_input_length=512
tokenizer = AutoTokenizer.from_pretrained("anukvma/bart-aiml-question-answer-v2")
model = AutoModelForSeq2SeqLM.from_pretrained("anukvma/bart-aiml-question-answer-v2")

text = """
What is a linear classifier?
"""

inputs = ["Answer this AIML question: " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=1, max_length=512)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_title)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

A linear classifier is a model used for classification of data points along a line.


## Evaluate the model on the test set

In [38]:
import torch

# get test split
test_tokenized_dataset = tokenized_datasets["test"]

# pad texts to the same length
def preprocess_test(examples):
  inputs = [prefix + text for text in examples["question"]]
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True,
                           padding="max_length")
  return model_inputs

test_tokenized_dataset = test_tokenized_dataset.map(preprocess_test, batched=True)

# prepare dataloader
test_tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
dataloader = torch.utils.data.DataLoader(test_tokenized_dataset, batch_size=8)

# generate text for each batch
all_predictions = []
for i,batch in enumerate(dataloader):
  predictions = model.generate(**batch)
  all_predictions.append(predictions)

# flatten predictions
all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

# tokenize and pad titles
all_titles = tokenizer(test_tokenized_dataset["answer"], max_length=100,
                       truncation=True, padding="max_length")["input_ids"]

# compute metrics
predictions_labels = [all_predictions_flattened, all_titles]
compute_metrics(predictions_labels)

Map:   0%|          | 0/127 [00:00<?, ? examples/s]

{'rouge1': 38.3593,
 'rouge2': 17.502,
 'rougeL': 31.3383,
 'rougeLsum': 33.4465,
 'gen_len': 40.5906}