In [None]:
pip install rouge-score bert-score transformers hyperopt spacy-ngram

In [None]:
import copy
import os
import random
import re

import numpy as np
import pandas as pd
import spacy
import torch
import torch.optim as optim
from bert_score import BERTScorer
from huggingface_hub import HfFolder, Repository, create_repo
from hyperopt import hp, fmin, tpe, space_eval, STATUS_OK, Trials
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sklearn.model_selection import train_test_split
from torch import cuda, tensor
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments

# Data Processing

## Import raw data

In [None]:
# Mount Google Drive at /content/drive; the data directory used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Base directory on the mounted Drive; every parquet path below is built from it.
cwd = '/content/drive/MyDrive/LLMs'
cwd

'/content/drive/MyDrive/LLMs'

In [None]:
# Load the raw posts; the preprocessing below reads the `PostText` column.
filename = f'{cwd}/subreddits_train_data_2.parquet'
raw_df = pd.read_parquet(filename)
raw_df.head()

Unnamed: 0,PostText
0,Text: Another home on the eastern plains of Co...
1,Text: Colorado stopped using state Medicaid fu...
2,Text: Here’s one of many that are now abandoned-
3,Text: More Blossoms & Bandos.
4,Text: This house has a very interesting history.


In [None]:
raw_df['PostText'].values[0]

'Text: Another home on the eastern plains of Colorado left to time- '

## Pre-processing

In [None]:
# Download the spaCy model that the next cell loads.
!python -m spacy download en_core_web_lg

In [None]:
# Shared spaCy pipeline; `remove_stopwords` reads it as a module-level global.
nlp = spacy.load('en_core_web_lg')

In [None]:
def remove_stopwords(text):
    """Return `text` with stop-word tokens dropped.

    Args:
        text: string to filter.

    Returns:
        The surviving token texts joined by single spaces ('' when every token
        is a stop word or `text` is empty).
    """
    # Only tokenisation and the lexical `is_stop` flag are needed here, so run the
    # tokenizer alone (`make_doc`) instead of the whole pipeline; the tokens are the
    # same, but the tagger/parser/NER passes are skipped for every post.
    return ' '.join(token.text for token in nlp.make_doc(text) if not token.is_stop)

def extract_keywords(text):
    """Strip the "Text: " marker from a raw post and derive its keyword string.

    Args:
        text: raw post expected to contain the marker "Text: ".

    Returns:
        dict with 'text' (everything after the first marker) and 'keywords'
        (that text with stop words removed). Both values are '' when `text` is
        not a string, has no marker, or keyword extraction fails.
    """
    empty = {'text': '', 'keywords': ''}

    if not isinstance(text, str):
        return empty

    # partition() splits on the first marker only, so a post that happens to
    # contain "Text: " again later is no longer cut short.
    _, marker, body = text.partition("Text: ")
    if not marker:
        return empty

    try:
        keywords_para = remove_stopwords(body)
    except Exception:
        # Best effort: one unparseable post must not abort the whole run. Unlike a
        # bare `except:`, this still lets KeyboardInterrupt stop a long loop.
        return empty

    return {
        "text": body,
        "keywords": keywords_para
    }

In [None]:
# Run the extractor once per post, then split the results into two parallel lists.
extracted = [extract_keywords(post) for post in raw_df['PostText'].values]
keywords = [row['keywords'] for row in extracted]
texts = [row['text'] for row in extracted]

In [None]:
raw_df['PostText'].values[0]

'Text: Another home on the eastern plains of Colorado left to time- '

In [None]:
# One row per post: the cleaned text and its stop-word-free keyword string.
df = pd.DataFrame({'text': texts, 'keywords': keywords})
df.head()

Unnamed: 0,text,keywords
0,Another home on the eastern plains of Colorado...,home eastern plains Colorado left time-
1,Colorado stopped using state Medicaid funds on...,Colorado stopped state Medicaid funds resident...
2,Here’s one of many that are now abandoned-,abandoned-
3,More Blossoms & Bandos.,Blossoms & Bandos .
4,This house has a very interesting history.,house interesting history .


In [None]:
df.shape

(85762, 2)

In [None]:
# Persist the text/keywords frame built from subreddits_train_data_2.
df.to_parquet(f'{cwd}/subreddits_train_data_2_keywords.parquet')

In [None]:
# Reload two text/keywords frames from Drive; `df` is rebound here and no longer
# refers to the frame built above.
# NOTE(review): subreddits_train_data_keywords.parquet is not written by any cell
# visible in this notebook — presumably it comes from an earlier run of the same
# preprocessing on another raw file; confirm, otherwise Restart & Run All fails here.
df = pd.read_parquet(f'{cwd}/subreddits_train_data_keywords.parquet')
df2 = pd.read_parquet(f'{cwd}/subreddits_train_data_2_keywords.parquet')
df.head()

Unnamed: 0,text,keywords
0,Nobody can read all the questions and answers ...,"read questions answers posted , thread invite ..."
1,( Please Be Aware: We expect everyone to read ...,( Aware : expect read rules guidelines thread .
2,Mods will remove questions which we deem to be...,Mods remove questions deem involved theme place .
3,We will remove answers which don't include a s...,remove answers include source .
4,These removals will be without notice.,removals notice .


In [None]:
df2.tail()

Unnamed: 0,text,keywords
85757,Would love tips on YouTubers that are trustwor...,love tips YouTubers trustworthy .
85758,,
85759,"Since yall are expecting a big black Monday, W...","y expecting big black Monday , y sell spot use..."
85760,Edit : I timed the market perfectly with my p...,Edit : timed market perfectly post .
85761,"If you opened a short at the time I posted, yo...","opened short time posted , rich"


In [None]:
# Stack both frames, then keep only rows where text and keywords are both non-empty.
df_merged = pd.concat([df, df2], ignore_index=True, sort=False)
has_content = (df_merged['text'] != '') & (df_merged['keywords'] != '')
df_merged = df_merged[has_content]
df_merged.head()

Unnamed: 0,text,keywords
0,Nobody can read all the questions and answers ...,"read questions answers posted , thread invite ..."
1,( Please Be Aware: We expect everyone to read ...,( Aware : expect read rules guidelines thread .
2,Mods will remove questions which we deem to be...,Mods remove questions deem involved theme place .
3,We will remove answers which don't include a s...,remove answers include source .
4,These removals will be without notice.,removals notice .


In [None]:
df_merged.tail()

Unnamed: 0,text,keywords
163513,I have used youtube to learn the basics and ho...,"youtube learn basics works , comes objective c..."
163514,Would love tips on YouTubers that are trustwor...,love tips YouTubers trustworthy .
163516,"Since yall are expecting a big black Monday, W...","y expecting big black Monday , y sell spot use..."
163517,Edit : I timed the market perfectly with my p...,Edit : timed market perfectly post .
163518,"If you opened a short at the time I posted, yo...","opened short time posted , rich"


In [None]:
df['text'].values[2]

'Mods will remove questions which we deem to be too involved for the theme in place here.'

In [None]:
# Keyword string for row 2 of `df`.
# NOTE(review): the saved output of this cell is bigram-style text that does not
# match the `df.head()` output shown earlier — it looks stale; re-run to refresh.
df['keywords'].values[2]

'mod remove, remove question, question deem, deem involved, involved theme, theme place, '

In [None]:
# Persist the merged, filtered frame; the next cell reloads it from this file.
df_merged.to_parquet(f'{cwd}/subreddits_train_data_merged.parquet')

In [None]:
# Reload the merged dataset; from here on `df` is the full text/keywords frame.
df = pd.read_parquet(f'{cwd}/subreddits_train_data_merged.parquet')
df.head()

Unnamed: 0,text,keywords
0,Nobody can read all the questions and answers ...,"read questions answers posted , thread invite ..."
1,( Please Be Aware: We expect everyone to read ...,( Aware : expect read rules guidelines thread .
2,Mods will remove questions which we deem to be...,Mods remove questions deem involved theme place .
3,We will remove answers which don't include a s...,remove answers include source .
4,These removals will be without notice.,removals notice .


In [None]:
df.shape

(158650, 2)

In [None]:
# Model input is the keyword string; the target is the original sentence.
X = df['keywords']
y = df['text']

In [None]:
# 80/20 split with a fixed seed so the same rows land in each side on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Tokenizer of the base t5-small checkpoint, used for all training-time encoding.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
X_train.values[2]

'  EDIT BIG thankyou provided suggestions frugal paper towels .'

In [None]:
y_train.values[2]

' EDIT A BIG thankyou to everyone who has provided some suggestions about being more frugal with my paper towels.'

# Model Building

## Model Training

In [None]:
# Use the GPU when one is available, otherwise fall back to CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
# Interactive Hugging Face login; the next cell checks that a token was stored.
!huggingface-cli login

In [None]:
# Fail early if no Hugging Face token is stored locally.
token = HfFolder.get_token()
if token is None:
  raise ValueError("You must be logged into the Hugging Face CLI")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Hugging Face parameters.
repo_name = "TextGeneratorLargeData"
username = 'saurabhkumar3400'

# A Hub repo id is always "<user>/<name>" with a forward slash, so build it
# explicitly instead of with os.path.join (whose separator depends on the OS).
repo_path = f"{username}/{repo_name}"

# create_repo(repo_path, token=HfFolder.get_token(), exist_ok=True)

In [None]:
# Load the tokenizer and model stored under `<username>/<repo_name>` and move the
# model to `device`.
# NOTE(review): this runs before the training cells and the same load is repeated
# after the push cell further down — presumably a shortcut for skipping training;
# confirm it is still wanted here.
model_repo = os.path.join(username, repo_name)
dwnld_tokenizer = T5Tokenizer.from_pretrained(model_repo)
dwnld_model = T5ForConditionalGeneration.from_pretrained(model_repo)
dwnld_model = dwnld_model.to(device)

In [None]:
# Start from the base t5-small weights and tokenise the training inputs.
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model = model.to(device)
# Length caps (in tokens) applied to inputs and targets respectively.
max_source_length = 128
max_target_length = 128
# Every input is prefixed with this task string before tokenisation.
task_prefix = "generate: "
input_sequences = [task_prefix + text for text in X_train.values]
# Pad to the longest sequence in the set, truncating at max_source_length.
encoding = tokenizer(
 input_sequences,
 padding="longest",
 max_length=max_source_length,
 truncation=True,
 return_tensors="pt",
)

# Token ids and attention mask for the whole training set, moved to `device`.
input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

In [None]:
# Tokenise the target sentences, padded to the longest and capped at max_target_length.
target_encoding = tokenizer(
 list(y_train.values),
 padding="longest",
 max_length=max_target_length,
 truncation=True,
 return_tensors="pt",
)

# Target token ids on `device`; padding positions are replaced by -100, the label
# value PyTorch's cross-entropy ignores by default, so padding does not add to the loss.
labels = target_encoding.input_ids.to(device)
labels[labels == tokenizer.pad_token_id] = -100

In [None]:
# Tokenise the validation inputs (first 100 held-out rows).
# They get the same task prefix as the training inputs; without it the model is
# evaluated on prompts it was never trained on and the validation loss is skewed.
test_input_sequences = [task_prefix + text for text in X_test.values[:100]]
test_data_encoding = tokenizer(
 test_input_sequences,
 padding="longest",
 max_length=max_source_length,
 truncation=True,
 return_tensors="pt",
)

# Validation token ids and attention mask, moved to `device`.
test_input_ids, test_attention_mask = test_data_encoding.input_ids, test_data_encoding.attention_mask
test_input_ids = test_input_ids.to(device)
test_attention_mask = test_attention_mask.to(device)

In [None]:
# Tokenise the validation targets (first 100 held-out rows).
test_target_encoding = tokenizer(
 list(y_test.values[:100]),
 padding="longest",
 max_length=max_target_length,
 truncation=True,
 return_tensors="pt",
)

test_labels = test_target_encoding.input_ids.to(device)
# Mask padding exactly as for the training labels; otherwise every pad position
# is scored as a wrong prediction and the validation loss is hugely inflated.
test_labels[test_labels == tokenizer.pad_token_id] = -100

In [None]:
# Early-stopping bookkeeping.
best_loss = float('inf')
best_model_weights = None
patience = 2
# Defined up front so the "no improvement" branch can never hit an undefined name
# (e.g. when the very first validation loss is NaN).
patience_counter = patience

train_dataset = TensorDataset(input_ids, attention_mask, labels)

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Gradients are accumulated over this many mini-batches before each optimizer
# step, i.e. the effective batch size is batch_size * accumulation_steps.
accumulation_steps = 4
num_epochs = 1
# NOTE(review): lr=0.01 is well above the 1e-4..3e-4 range usually suggested for
# fine-tuning T5 — confirm this value is intentional.
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Validation labels with padding masked to -100 so pad positions are not scored.
# This is a no-op when the labels are already masked.
eval_labels = test_labels.masked_fill(test_labels == tokenizer.pad_token_id, -100)

for epoch in range(num_epochs):
    print(f'Started with epoch: {epoch}')
    # The evaluation below switches to eval mode; switch back (re-enabling
    # dropout) at the start of every epoch.
    model.train()
    optimizer.zero_grad()
    pending_grads = False
    for i, (input_ids_batch, attention_mask_batch, labels_batch) in enumerate(train_loader):
        input_ids_batch = input_ids_batch.to(device)
        attention_mask_batch = attention_mask_batch.to(device)
        labels_batch = labels_batch.to(device)

        outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch, labels=labels_batch)
        # Scale so the accumulated gradient is the mean over the accumulation window.
        loss = outputs.loss / accumulation_steps

        loss.backward()
        pending_grads = True

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            pending_grads = False

    # Apply gradients left over when the number of batches is not a multiple of
    # accumulation_steps; otherwise the last few batches are silently discarded.
    if pending_grads:
        optimizer.step()
        optimizer.zero_grad()

    # Model evaluation
    model.eval()
    with torch.no_grad():
        test_outputs = model(input_ids=test_input_ids, attention_mask=test_attention_mask, labels=eval_labels)
        # Keep a plain float so best_loss does not hold on to a device tensor.
        test_loss = test_outputs.loss.item()

    print(f'Epoch no: {epoch}, test loss = {test_loss}, best loss = {best_loss}')
    # Early stopping
    if test_loss < best_loss:
        best_loss = test_loss
        best_model_weights = copy.deepcopy(model.state_dict())
        patience_counter = patience
    else:
        patience_counter -= 1
        if patience_counter == 0:
            break

# Load the best model weights
if best_model_weights is None:
    raise RuntimeError("Validation loss never improved (NaN/inf loss?); there are no best weights to restore.")
model.load_state_dict(best_model_weights)

Started with epoch: 0


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch no: 0, test loss = 17.076032638549805, best loss = inf


<All keys matched successfully>

In [None]:
# Pick one held-out keyword string to try generation on.
i = 6
shuffled_text = X_test.values[i]
shuffled_text

'2 3 occasions try totalling 8 minutes .'

In [None]:
y_test.values[i]

'On 2 or 3 occasions he did try to top all totalling to be less than 8 minutes.'

In [None]:
# Build the prompt under fresh names: re-assigning `shuffled_text` would add the
# prefix again on every re-run, and re-using `input_ids` would overwrite the
# training tensor that the training cell reads.
prompt = f"generate: {shuffled_text}"
prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
output = model.generate(prompt_ids, max_length=400)
tokenizer.decode(output[0], skip_special_tokens=True)

'I have 2 more occasions to try totalling 8 minutes.'

In [None]:
# Qualitative check: generate for ten held-out rows and print them next to the
# reference sentence and the keyword input.
answers_fine_tuned = []
start = 20
for ind in range(10):
    test_text = X_test.values[start+ind]
    test_text = f"generate: {test_text}"
    # Named `prompt_ids` so the global `input_ids` training tensor is not overwritten.
    prompt_ids = tokenizer(test_text, return_tensors="pt").input_ids.to(device)
    output = model.generate(prompt_ids, max_length=50)
    answers_fine_tuned.append(tokenizer.decode(output[0], skip_special_tokens=True))

    print('Ground truth:', y_test.values[start+ind])
    print('Prediction (fine tuned model):', answers_fine_tuned[ind])
    print('Keywords:', X_test.values[start+ind])
    print('\n')
    print('***'*20)

Ground truth: when i thought it wasnt and i wanted privacy we stopped and we didn't do it anymore.
Prediction (fine tuned model): I thought he didnt wanted to keep my privacy stopped anymore.
Keywords: thought nt wanted privacy stopped anymore .


************************************************************
Ground truth:  Although I am thinking of buying the QCR version.
Prediction (fine tuned model): I was thinking of buying the QCR version.
Keywords:   thinking buying QCR version .


************************************************************
Ground truth: It’s like it’s waking you up and putting you back to sleep at the same time.
Prediction (fine tuned model): It’s like waking up and putting myself to sleep all the time.
Keywords: like waking putting sleep time .


************************************************************
Ground truth: I recommend if you haven't been hit with a violation yet remove any bots you have from them and message Discord on the ongoing situation.
Predic

## Store best model

In [None]:
# Interactive Hugging Face login before pushing the model.
!huggingface-cli login

In [None]:
# Fail early if no Hugging Face token is stored locally.
token = HfFolder.get_token()
if token is None:
  raise ValueError("You must be logged into the Hugging Face CLI")

In [None]:
repo_name = "TextGeneratorLargeData"
username = 'saurabhkumar3400'

# A Hub repo id is always "<user>/<name>" with a forward slash, so build it
# explicitly instead of with os.path.join (whose separator depends on the OS).
repo_path = f"{username}/{repo_name}"

# create_repo(repo_path, token=HfFolder.get_token(), exist_ok=True)

RepoUrl('https://huggingface.co/saurabhkumar3400/TextGeneratorLargeData', endpoint='https://huggingface.co', repo_type='model', repo_id='saurabhkumar3400/TextGeneratorLargeData')

In [None]:
# Clone the Hub repo into the local directory `repo_path`, write the model and
# tokenizer files into that clone, then push them as one commit.
# NOTE(review): `Repository` and `use_auth_token` are deprecated in recent
# huggingface_hub releases; `push_to_hub` / `HfApi.upload_folder` are presumably
# the replacement — confirm against the installed version.
repo = Repository(repo_path, clone_from=f"{repo_path}", use_auth_token=True)
model.save_pretrained(repo_path)
tokenizer.save_pretrained(repo_path)
repo.push_to_hub(commit_message="early stopping")

In [None]:
# Load the tokenizer and model stored under `<username>/<repo_name>`; the
# evaluation and search cells below use these `dwnld_*` objects.
model_repo = os.path.join(username, repo_name)
dwnld_tokenizer = T5Tokenizer.from_pretrained(model_repo)
dwnld_model = T5ForConditionalGeneration.from_pretrained(model_repo)
dwnld_model = dwnld_model.to(device)

In [None]:
X_test.shape

(31730,)

## Model Evaluation

In [None]:
# Generate predictions for the first `batch_size` held-out rows.
# NOTE(review): `batch_size` is also the name of the training mini-batch size;
# after this cell it holds the evaluation sample count (1000).
answers_fine_tuned = []
batch_size = 1000
for test_text in X_test.values[:batch_size]:
    test_text = f"generate: {test_text}"
    # Named `prompt_ids` so the global `input_ids` training tensor is not overwritten.
    prompt_ids = dwnld_tokenizer(test_text, return_tensors="pt").input_ids.to(device)
    output = dwnld_model.generate(prompt_ids, max_length=50, num_return_sequences=1)
    answers_fine_tuned.append(dwnld_tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:
# BERTScore calculation
bert_scorer = BERTScorer(model_type='bert-base-uncased')

In [None]:
# Score all candidate/reference pairs in one batched call instead of one call
# per pair; `score` returns per-pair precision, recall and F1 tensors.
candidates = answers_fine_tuned[:batch_size]
references = list(y_test.values[:len(candidates)])
precision, recall, f1 = bert_scorer.score(candidates, references)
print('Fine-tuned model (precision):', precision.mean().item())
print('Fine-tuned model (recall):', recall.mean().item())
print('Fine-tuned model (fscore):', f1.mean().item())



Fine-tuned model (precision): 0.7739882
Fine-tuned model (recall): 0.73742616
Fine-tuned model (fscore): 0.75359714


In [None]:
# ROUGE-1 and ROUGE-L scorer with stemming; `print_rouge` reads it as a global.
rouge_metric = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

In [None]:
def print_rouge(rouge_n, ground_truth, predictions):
  """Print the mean ROUGE precision, recall and F-measure of `predictions`.

  Args:
    rouge_n: key of the ROUGE variant to report (e.g. 'rouge1' or 'rougeL'); it
      must be one of the types the module-level `rouge_metric` was built with.
    ground_truth: pandas Series of reference strings (read through `.values`).
    predictions: sequence of generated strings; `predictions[i]` is compared
      with `ground_truth.values[i]`.
  """
  # Take the pair count from the arguments rather than the global `batch_size`,
  # and score each pair once instead of once per reported statistic.
  references = ground_truth.values
  n_pairs = min(len(predictions), len(references))
  scores = [rouge_metric.score(references[ind], predictions[ind])[rouge_n] for ind in range(n_pairs)]

  print('Fine-tuned model (precision):', np.mean([score.precision for score in scores]))
  print('Fine-tuned model (recall):', np.mean([score.recall for score in scores]))
  print('Fine-tuned model (fscore):', np.mean([score.fmeasure for score in scores]))

In [None]:
print_rouge('rouge1', y_test, answers_fine_tuned)

Fine-tuned model (precision): 0.7226341167510321
Fine-tuned model (recall): 0.628486005009104
Fine-tuned model (fscore): 0.659984189393305


In [None]:
print_rouge('rougeL', y_test, answers_fine_tuned)

Fine-tuned model (precision): 0.694009454910966
Fine-tuned model (recall): 0.6035258145793055
Fine-tuned model (fscore): 0.6337879937744871


In [None]:
# Generate predictions for the first `batch_size` training rows, to compare
# train-set metrics with the held-out ones.
answers_fine_tuned_train = []
for train_text in X_train.values[:batch_size]:
    train_text = f"generate: {train_text}"
    # Named `prompt_ids` so the global `input_ids` training tensor is not overwritten.
    prompt_ids = dwnld_tokenizer(train_text, return_tensors="pt").input_ids.to(device)
    output = dwnld_model.generate(prompt_ids, max_length=50, num_return_sequences=1)
    answers_fine_tuned_train.append(dwnld_tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:
# Score all candidate/reference pairs in one batched call instead of one call
# per pair; `score` returns per-pair precision, recall and F1 tensors.
candidates = answers_fine_tuned_train[:batch_size]
references = list(y_train.values[:len(candidates)])
precision, recall, f1 = bert_scorer.score(candidates, references)
print('Fine-tuned model (precision):', precision.mean().item())
print('Fine-tuned model (recall):', recall.mean().item())
print('Fine-tuned model (fscore):', f1.mean().item())



Fine-tuned model (precision): 0.7989736
Fine-tuned model (recall): 0.76531875
Fine-tuned model (fscore): 0.7803467




In [None]:
print_rouge('rouge1', y_train, answers_fine_tuned_train)

Fine-tuned model (precision): 0.7471457182409611
Fine-tuned model (recall): 0.656958450029505
Fine-tuned model (fscore): 0.6880347117891051


In [None]:
print_rouge('rougeL', y_train, answers_fine_tuned_train)

Fine-tuned model (precision): 0.7175770688845862
Fine-tuned model (recall): 0.6354185941376314
Fine-tuned model (fscore): 0.6636220572913438


In [None]:
# Qualitative check on ten training rows. The results go into their own list so
# the `answers_fine_tuned_train` predictions scored above are not replaced by
# this ten-item sample.
sample_predictions_train = []
start = 10
for ind in range(10):
    train_text = X_train.values[start+ind]
    train_text = f"generate: {train_text}"
    # Named `prompt_ids` so the global `input_ids` training tensor is not overwritten.
    prompt_ids = dwnld_tokenizer(train_text, return_tensors="pt").input_ids.to(device)
    output = dwnld_model.generate(prompt_ids, max_length=50)
    sample_predictions_train.append(dwnld_tokenizer.decode(output[0], skip_special_tokens=True))

    print('Ground truth:', y_train.values[start+ind])
    print('Prediction (fine tuned model):', sample_predictions_train[ind])
    print('Keywords:', X_train.values[start+ind])
    print('\n')
    print('***'*20)

Ground truth: I feel very embarrassed because I’m also a very athletic and in shape person.
Prediction (fine tuned model): I feel very embarrassed because I’m also a very athletic and in shape person.
Keywords: feel embarrassed athletic shape person .


************************************************************
Ground truth: Anyone know why?
Prediction (fine tuned model): Anyone know?
Keywords: know ?


************************************************************
Ground truth: Some other science thing?
Prediction (fine tuned model): Some other science thing?
Keywords: science thing ?


************************************************************
Ground truth: I moved to the UK couple of months ago and honestly I don't really know what I am doing with my life, it was my childhood dream to move to silicon valley Honeslty I have no idea what I am doing with my life, dysphoria is cripling me and only coping mechanisim I have is drowning myself in work and in alcohol.
Prediction (fine tun

In [None]:
# BLEU-2: equal weight on uni-grams and bi-grams. The weights must sum to 1;
# (0.25, 0.25, 0, 0) takes the 4th root of p1*p2 instead of the square root and
# therefore overstates the score.
weights = (0.5, 0.5)
smoothing = SmoothingFunction().method1  # built once, reused for every pair

fine_tuned_bleu = [
    sentence_bleu(
        [y_test.values[ind].split()],
        answers_fine_tuned[ind].split(),
        weights=weights,
        smoothing_function=smoothing,
    )
    for ind in range(len(answers_fine_tuned))
]
print('Fine-tuned model:', np.mean(fine_tuned_bleu))

Fine-tuned model: 0.51023058641632


# Fine tuning: hyperparameter search

In [None]:
# Hyperopt search space.
# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so the bounds
# are given as logs of the intended learning-rate range (1e-5 .. 1e-3). The former
# bounds (-2, -1) sampled learning rates of roughly 0.135 .. 0.368.
space = {
 'learning_rate': hp.loguniform('learning_rate', np.log(1e-5), np.log(1e-3)),
 'per_device_train_batch_size': hp.choice( 'per_device_train_batch_size', [8, 16, 32, 64]),
 'num_train_epochs': hp.choice('num_train_epochs', [1, 2, 3]),
 'weight_decay': hp.uniform('weight_decay', 0.0, 0.3),
}

In [None]:
# Keyword/sentence pairs of the training split as one frame, for inspection.
# NOTE(review): no later cell visible here reads `train_data`.
train_data = pd.DataFrame({'text': X_train.values, 'labels': y_train.values})
train_data.head()

In [None]:
# Keyword/sentence pairs of the held-out split as one frame, for inspection.
# NOTE(review): no later cell visible here reads `test_data`.
test_data = pd.DataFrame({'text': X_test.values, 'labels': y_test.values})
test_data.head()

In [None]:
class RedditDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenised inputs with tokenised targets.

    Args:
        encodings: mapping of field name -> per-example sequences for the inputs
            (every field is indexed by example position).
        labels: mapping whose 'input_ids' entry holds the per-example target ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is the number of target sequences.
        return len(self.labels['input_ids'])

    def __getitem__(self, idx):
        # One tensor per input field, plus the target ids under 'labels'.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return sample

In [None]:
def _mask_label_padding(label_encodings, pad_token_id):
    """Return {'input_ids': ...} with every padding id replaced by -100.

    -100 is the label value the loss ignores, so padded positions do not
    contribute to training or evaluation loss.
    """
    return {
        'input_ids': [
            [-100 if token_id == pad_token_id else token_id for token_id in sequence]
            for sequence in label_encodings['input_ids']
        ]
    }


# Encode the search data the same way as the original fine-tuning data: inputs
# carry the task prefix, and both sides are capped at the training-time lengths.
train_encodings = dwnld_tokenizer(
    [task_prefix + text for text in X_train.values],
    truncation=True, padding=True, max_length=max_source_length,
)
test_encodings = dwnld_tokenizer(
    [task_prefix + text for text in X_test.values],
    truncation=True, padding=True, max_length=max_source_length,
)

train_label_encodings = _mask_label_padding(
    dwnld_tokenizer(list(y_train.values), truncation=True, padding=True, max_length=max_target_length),
    dwnld_tokenizer.pad_token_id,
)
test_label_encodings = _mask_label_padding(
    dwnld_tokenizer(list(y_test.values), truncation=True, padding=True, max_length=max_target_length),
    dwnld_tokenizer.pad_token_id,
)

In [None]:
# Wrap the encodings for the Trainer. `train_dataset` is rebound here; it no
# longer refers to the TensorDataset used by the manual training loop.
train_dataset = RedditDataset(train_encodings, train_label_encodings)
test_dataset = RedditDataset(test_encodings, test_label_encodings)

In [None]:
def objective(hyperparams):
    """Hyperopt objective: fine-tune the stored checkpoint once and report its eval loss.

    Args:
        hyperparams: dict sampled from the search space; must provide
            'learning_rate', 'per_device_train_batch_size', 'num_train_epochs'
            and 'weight_decay'.

    Returns:
        dict with 'loss' (the Trainer's `eval_loss` on `test_dataset`) and
        'status' (`STATUS_OK`).
    """
    model_repo = os.path.join(username, repo_name)
    # Reload the checkpoint for every trial so each one starts from the same
    # weights. The tokenizer is not reloaded: the datasets are already tokenised.
    trial_model = T5ForConditionalGeneration.from_pretrained(model_repo)
    trial_model = trial_model.to(device)

    training_args = TrainingArguments(
        # NOTE(review): this is the same relative path the push cell uses for its
        # local clone, so checkpoints land inside it — confirm that is intended.
        output_dir=model_repo,
        learning_rate=hyperparams['learning_rate'],
        per_device_train_batch_size=hyperparams['per_device_train_batch_size'],
        num_train_epochs=hyperparams['num_train_epochs'],
        weight_decay=hyperparams['weight_decay'],
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        # The string "none" is what switches logging integrations off; None means
        # "use the default", which may enable every installed one (e.g. wandb).
        report_to="none",
    )

    trainer = Trainer(
        model=trial_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    trainer.train()
    eval_results = trainer.evaluate()
    return {'loss': eval_results["eval_loss"], 'status': STATUS_OK}

In [None]:
pip install wandb

In [None]:
wandb login

In [None]:
# Keep the Trials object so the per-trial losses can be inspected afterwards.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,  # stated explicitly instead of relying on the default
    max_evals=3,
    trials=trials,
)
# For hp.choice parameters `best` holds the *index* of the chosen option, not the
# option itself; space_eval maps the result back to real hyperparameter values.
best_params = space_eval(space, best)
best_params