In [2]:
import os
import pandas as pd
import re
import random
import spacy
from sklearn.model_selection import train_test_split
import numpy as np

from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch import cuda, tensor
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.optim as optim
from huggingface_hub import HfFolder, Repository, create_repo

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import BertTokenizer, BertModel
from bert_score import BERTScorer

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from transformers import Trainer, TrainingArguments

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cwd = '/content/drive/MyDrive/LLMs'
cwd

'/content/drive/MyDrive/LLMs'

In [4]:
filename = f'{cwd}/subreddits_train_data.parquet'
raw_df = pd.read_parquet(filename)
raw_df.head()

Unnamed: 0,PostText
0,Text: Nobody can read all the questions and an...
1,Text: ( Please Be Aware: We expect everyone to...
2,Text: Mods will remove questions which we deem...
3,Text: We will remove answers which don't inclu...
4,Text: These removals will be without notice.


In [5]:
raw_df['PostText'].values[0]

"Text: Nobody can read all the questions and answers that are posted here, so in this thread we invite you to share anything you'd like to highlight from the last week - an interesting discussion, an informative answer, an insightful question that was overlooked, or anything else."

In [5]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
nlp = spacy.load('en_core_web_lg')

In [7]:
def shuffle_text(text):
    keywords = []
    for item in nlp(text):
        if not item.is_stop:
            keywords.append(item.text)
    keywords_para = ' '.join(keywords)

    return keywords_para

def extract_keywords(text):
    try:
        split_text = text.split("Text: ")
        text = split_text[1]
        keywords_para = shuffle_text(text)

        return {
        "text": text,
        "keywords": keywords_para
        }
    except:
        return {'text': '', 'keywords': ''}

In [9]:
keywords = []
texts = []
for post in raw_df['PostText'].values:
    result = extract_keywords(post)
    keywords.append(result['keywords'])
    texts.append(result['text'])

In [10]:
raw_df['PostText'].values[0]

"Text: Nobody can read all the questions and answers that are posted here, so in this thread we invite you to share anything you'd like to highlight from the last week - an interesting discussion, an informative answer, an insightful question that was overlooked, or anything else."

In [11]:
df = pd.DataFrame({'text': texts, 'keywords': keywords})
df.head()

Unnamed: 0,text,keywords
0,Nobody can read all the questions and answers ...,"read questions answers posted , thread invite ..."
1,( Please Be Aware: We expect everyone to read ...,( Aware : expect read rules guidelines thread .
2,Mods will remove questions which we deem to be...,Mods remove questions deem involved theme place .
3,We will remove answers which don't include a s...,remove answers include source .
4,These removals will be without notice.,removals notice .


In [12]:
df.shape

(77757, 2)

In [13]:
df.to_parquet(f'{cwd}/subreddits_train_data_keywords.parquet')

In [8]:
df = pd.read_parquet(f'{cwd}/subreddits_train_data_keywords.parquet')
df.head()

Unnamed: 0,text,keywords
0,Nobody can read all the questions and answers ...,"read questions answers posted , thread invite ..."
1,( Please Be Aware: We expect everyone to read ...,( Aware : expect read rules guidelines thread .
2,Mods will remove questions which we deem to be...,Mods remove questions deem involved theme place .
3,We will remove answers which don't include a s...,remove answers include source .
4,These removals will be without notice.,removals notice .


In [9]:
df['text'].values[2]

'Mods will remove questions which we deem to be too involved for the theme in place here.'

In [10]:
df['keywords'].values[2]

'Mods remove questions deem involved theme place .'

In [11]:
X = df['keywords']
y = df['text']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [13]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [14]:
X_train.values[0]

'translation language book translated english like , read ?'

In [15]:
y_train.values[0]

'But if there is a translation in my language and the book itself is also already translated to english like the last two, which should I read?'

In [16]:
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [17]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model = model.to(device)
max_source_length = 128
max_target_length = 128
task_prefix = "generate: "
input_sequences = [task_prefix + text for text in X_train.values]
encoding = tokenizer(
 input_sequences,
 padding="longest",
 max_length=max_source_length,
 truncation=True,
 return_tensors="pt",
)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [18]:
input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
target_encoding = tokenizer(
 list(y_train.values),
 padding="longest",
 max_length=max_target_length,
 truncation=True,
 return_tensors="pt",
)

In [19]:
labels = target_encoding.input_ids.to(device)
labels[labels == tokenizer.pad_token_id] = -100
train_dataset = TensorDataset(input_ids, attention_mask, labels)

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

accumulation_steps = 4
num_epochs = 4
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(num_epochs):
    print(f'Started with epoch: {epoch}')
    for i, (input_ids_batch, attention_mask_batch, labels_batch) in enumerate(train_loader):
        input_ids_batch = input_ids_batch.to(device)
        attention_mask_batch = attention_mask_batch.to(device)
        labels_batch = labels_batch.to(device)

        outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch, labels=labels_batch)
        loss = outputs.loss

        loss = loss/accumulation_steps

        loss.backward()

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

Started with epoch: 0


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Started with epoch: 1
Started with epoch: 2
Started with epoch: 3


In [20]:
loss.item()

0.22307278215885162

In [21]:
i = 6
shuffled_text = X_test.values[i]
shuffled_text

'prescription strength iron deal symptom .'

In [22]:
y_test.values[i]

'And I already have prescription strength iron to deal with that symptom.'

In [23]:
shuffled_text = f"generate: {shuffled_text}"
input_ids = tokenizer(shuffled_text, return_tensors="pt").input_ids.to(device)
output = model.generate(input_ids, max_length=400)
tokenizer.decode(output[0], skip_special_tokens=True)

'My prescription is to take the iron and deal with this symptom.'

In [24]:
answers_fine_tuned = []
for ind in range(10):
    test_text = X_test.values[ind]
    test_text = f"generate: {test_text}"
    input_ids = tokenizer(test_text, return_tensors="pt").input_ids.to(device)
    output = model.generate(input_ids, max_length=50)
    answers_fine_tuned.append(tokenizer.decode(output[0], skip_special_tokens=True))

    print('Ground truth:', y_test.values[ind])
    print('Prediction (fine tuned model):', answers_fine_tuned[ind])
    print('Keywords:', X_test.values[ind])
    print('\n')
    print('***'*20)

Ground truth: No one cares.
Prediction (fine tuned model): Who cares.
Keywords: cares .


************************************************************
Ground truth: liberty also makes sense to me- human beings naturally think and make decisions independently, that can’t be taken from you.
Prediction (fine tuned model): liberty makes sense to me- human beings naturally to think of these decisions independently, and not how it is taken.
Keywords: liberty makes sense me- human beings naturally think decisions independently , taken .


************************************************************
Ground truth: Some well known specimens she worked on include Jane the juvenile Tyrannosaurus rex and Dakota the Edmontosaurus sp.
Prediction (fine tuned model): Some of the other specimens I've worked on include Jane's juvenile Tyrannosaurus rex and Dakota's Edmontosaurus sp.
Keywords: known specimens worked include Jane juvenile Tyrannosaurus rex Dakota Edmontosaurus sp .


**********************

In [None]:
!huggingface-cli login

In [26]:
token = HfFolder.get_token()
if token is None:
  raise ValueError("You must be logged into the Hugging Face CLI")

In [27]:
repo_name = "**"
username = '**'

repo_path = os.path.join(username, repo_name)

# create_repo(repo_path, token=HfFolder.get_token(), exist_ok=True)

In [28]:
repo = Repository(repo_path, clone_from=f"{repo_path}", use_auth_token=True)
model.save_pretrained(repo_path)
tokenizer.save_pretrained(repo_path)
repo.push_to_hub(commit_message="sentence level, n_epochs = 4")

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/saurabhkumar3400/SubredditSummarizer into local empty directory.


Download file model.safetensors:   0%|          | 8.00k/231M [00:00<?, ?B/s]

Download file spiece.model:   2%|1         | 14.6k/773k [00:00<?, ?B/s]

Clean file spiece.model:   0%|          | 1.00k/773k [00:00<?, ?B/s]

Clean file model.safetensors:   0%|          | 1.00k/231M [00:00<?, ?B/s]

Upload file model.safetensors:   0%|          | 1.00/231M [00:00<?, ?B/s]

To https://huggingface.co/saurabhkumar3400/SubredditSummarizer
   fe81866..3fbf046  main -> main

   fe81866..3fbf046  main -> main



'https://huggingface.co/saurabhkumar3400/SubredditSummarizer/commit/3fbf046271791564426bd0bc7a8045e56051c128'

In [29]:
model_repo = os.path.join(username, repo_name)
dwnld_tokenizer = T5Tokenizer.from_pretrained(model_repo)
dwnld_model = T5ForConditionalGeneration.from_pretrained(model_repo)
dwnld_model = dwnld_model.to(device)

In [31]:
X_test.shape

(15552,)

In [30]:
answers_fine_tuned = []
batch_size = 100
for test_text in X_test.values[:batch_size]:
    test_text = f"generate: {test_text}"
    input_ids = dwnld_tokenizer(test_text, return_tensors="pt").input_ids.to(device)
    output = dwnld_model.generate(input_ids, max_length=50, num_return_sequences=1)
    answers_fine_tuned.append(dwnld_tokenizer.decode(output[0], skip_special_tokens=True))

In [32]:
rouge_metric = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

In [33]:
def print_rouge(rouge_n):

  fine_tuned_rouge1_prec = [rouge_metric.score(y_test.values[ind], answers_fine_tuned[ind])[rouge_n].precision for ind in range(batch_size)]

  fine_tuned_rouge1_recall = [rouge_metric.score(y_test.values[ind], answers_fine_tuned[ind])[rouge_n].recall for ind in range(batch_size)]

  fine_tuned_rouge1_fscore = [rouge_metric.score(y_test.values[ind], answers_fine_tuned[ind])[rouge_n].fmeasure for ind in range(batch_size)]

  print('Fine-tuned model (precision):', np.mean(fine_tuned_rouge1_prec))
  print('\n')
  print('Fine-tuned model (recall):', np.mean(fine_tuned_rouge1_recall))
  print('\n')
  print('Fine-tuned model (fscore):', np.mean(fine_tuned_rouge1_fscore))

In [34]:
print_rouge('rouge1')

Fine-tuned model (precision): 0.6992120975794813


Fine-tuned model (recall): 0.6373942473036804


Fine-tuned model (fscore): 0.6550842682990017


In [35]:
print_rouge('rougeL')

Fine-tuned model (precision): 0.6729445933935149


Fine-tuned model (recall): 0.614904025656417


Fine-tuned model (fscore): 0.6311746002255095


In [36]:
weights = (0.25, 0.25, 0, 0)  # Weights for uni-gram, bi-gram, tri-gram, and 4-gram

fine_tuned_bleu = [sentence_bleu([y_test.values[ind].split()], answers_fine_tuned[ind].split(), weights=weights, smoothing_function=SmoothingFunction().method1) for ind in range(batch_size)]
print('Fine-tuned model:', np.mean(fine_tuned_bleu))

Fine-tuned model: 0.5552340753407367


In [37]:
# BERTScore calculation
bert_scorer = BERTScorer(model_type='bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [38]:
bert_scores = [bert_scorer.score([answers_fine_tuned[ind]], [y_test.values[ind]]) for ind in range(batch_size)]
precision = [bert_scores[ind][0] for ind in range(batch_size)]
recall = [bert_scores[ind][1] for ind in range(batch_size)]
f1 = [bert_scores[ind][2] for ind in range(batch_size)]
print('Fine-tuned model (precision):', np.mean(precision))
print('Fine-tuned model (recall):', np.mean(recall))
print('Fine-tuned model (fscore):', np.mean(f1))



Fine-tuned model (precision): 0.7736678
Fine-tuned model (recall): 0.7440519
Fine-tuned model (fscore): 0.75637937


In [None]:
space = {
 'learning_rate': hp.loguniform('learning_rate', -2, -1),
 'per_device_train_batch_size': hp.choice( 'per_device_train_batch_size', [8, 16, 32, 64]),
 'num_train_epochs': hp.choice('num_train_epochs', [1, 2, 3]),
 'weight_decay': hp.uniform('weight_decay', 0.0, 0.3),
}

In [None]:
train_data = pd.DataFrame({'text': X_train.values, 'labels': y_train.values})
train_data.head()

Unnamed: 0,text,labels
0,I was recently arguing with someone about brut...,What counts as a “sufficient” reason?
1,Forgive the terminology (been a while since I ...,Are people with AB+ blood (potentially) subjec...
2,"No dialogue or one liner, just shot him in the...",[DC] How would Joker react if someone just sho...
3,Back story: I (28 AMAB) Have been heavily ques...,Faceapp gender swap
4,Am I the only one noticing this? Whenever it’s...,Why is it that there are a lot of 10/10 black ...


In [None]:
test_data = pd.DataFrame({'text': X_test.values, 'labels': y_test.values})
test_data.head()

Unnamed: 0,text,labels
0,\n\nI'm about halfway through conscious mind a...,Question on chalmers conscious mind
1,"Hey all, if you have any idea what's going on,...",Fungal Balanitis -> Urethritis + Inflammed/Raw...
2,Basically the title. I have a small Dremel-lik...,A Dremel-like tool equipped with an 18V motor ...
3,"With all the wealth in this nation, why can’t ...",Homelessness
4,So I recently got into making cold foams to pu...,Need cold foam advice


In [None]:
class RedditDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.labels['input_ids'])

In [None]:
train_encodings = dwnld_tokenizer(list(X_train.values), truncation=True, padding=True)
test_encodings = dwnld_tokenizer(list(X_test.values), truncation=True, padding=True)

train_label_encodings = dwnld_tokenizer(list(y_train.values), truncation=True, padding=True)
test_label_encodings = dwnld_tokenizer(list(y_test.values), truncation=True, padding=True)

In [None]:
train_dataset = RedditDataset(train_encodings, train_label_encodings)
test_dataset = RedditDataset(test_encodings, test_label_encodings)

In [None]:
def objective(hyperparams):
    model_repo = os.path.join(username, repo_name)
    dwnld_tokenizer = T5Tokenizer.from_pretrained(model_repo)
    dwnld_model = T5ForConditionalGeneration.from_pretrained(model_repo)
    dwnld_model = dwnld_model.to(device)

    training_args = TrainingArguments(
        output_dir=model_repo,
        learning_rate=hyperparams['learning_rate'],
        per_device_train_batch_size=hyperparams['per_device_train_batch_size'],
        num_train_epochs=hyperparams['num_train_epochs'],
        weight_decay=hyperparams['weight_decay'],
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        report_to=None
    )

    trainer = Trainer(
        model=dwnld_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    trainer.train()
    eval_results = trainer.evaluate()
    return {'loss': eval_results["eval_loss"], 'status': STATUS_OK}

In [None]:
pip install wandb

In [None]:
wandb login

In [None]:
best = fmin(
    fn=objective,
    space=space,
    max_evals=3,
    trials=Trials()
)



  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?]

Epoch,Training Loss,Validation Loss
1,2.0327,1.810018
2,1.8436,1.795385
3,1.6576,1.676672


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


 33%|███▎      | 1/3 [05:16<10:32, 316.18s/trial, best loss: 1.6766717433929443]

Epoch,Training Loss,Validation Loss
1,No log,1.976572
2,No log,1.687433
3,No log,1.655196


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


 67%|██████▋   | 2/3 [09:57<04:55, 295.92s/trial, best loss: 1.6551960706710815]

Epoch,Training Loss,Validation Loss
1,2.0471,1.840095


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


100%|██████████| 3/3 [11:49<00:00, 236.64s/trial, best loss: 1.6551960706710815]
