### **IMPORTS**

---

In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install nltk
!pip install rouge_score

In [125]:
import gc
import random
import time

import evaluate
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AdamW,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    GenerationConfig,
    Trainer,
    TrainingArguments,
    get_linear_schedule_with_warmup,
    get_scheduler,
    pipeline,
)

# Imported after the transformers block on purpose: this is the AdamW binding the
# rest of the notebook sees (it shadows transformers.AdamW, exactly as before).
from torch.optim import AdamW

In [126]:
# sent_tokenize (used when post-processing predictions) needs the Punkt sentence
# models. Newer NLTK releases look them up as "punkt_tab" while older ones use
# "punkt", so fetch both. nltk.download() reports an unknown resource and returns
# False instead of raising, so this is safe on either version.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**INITIAL PERFORMANCE**

---



---



In [123]:
# Baseline check: ask the *pretrained* FLAN-T5 checkpoint to classify one
# hand-written review before any fine-tuning happens.
# The tokenizer and model are loaded here so the cell also works on a fresh kernel
# (Restart & Run All); the TRAINING section loads them again under the same names.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-large', device_map="auto")

prompt="""given a review classify its sentiment as 0 or 1

review: It is a bad sunny day
"""
# Move the encoded prompt to whichever device the model lives on instead of
# assuming CUDA device 0.
tokenized_prompt = tokenizer(prompt, return_tensors='pt').to(model.device)
# Greedy decoding (do_sample=False, a single beam). `temperature` only affects
# sampling, so it is deliberately not set here.
generation_config = GenerationConfig(
    max_new_tokens=5,
    num_beams=1,
    do_sample=False
)
output = model.generate(
    input_ids=tokenized_prompt.input_ids,
    attention_mask=tokenized_prompt.attention_mask,
    generation_config=generation_config,
    return_dict_in_generate=True,
    pad_token_id=model.config.pad_token_id,
    repetition_penalty=1.2
)
out = output.sequences
for i in out:
  print(tokenizer.decode(i, skip_special_tokens=True))

1


### **Data Generation**

---

In [35]:
# Load the review data; the cells below rely on its `review` and `sentiment` columns.
# NOTE(review): the df.head() output saved below shows extra "Unnamed: 0" columns,
# which suggests the frame was loaded from an already re-saved sample - confirm
# which file is the intended input.
df=pd.read_csv('IMDB Dataset.csv')

In [35]:
# df=pd.read_excel('sampled_sentiment.xlsx')

In [36]:
# Peek at the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,review,sentiment
0,16144,In my opinion this is the best Oliver Stone fl...,positive
1,47437,This is by far the worst non-English horror mo...,negative
2,18910,"with very little screen time and money, Dan Ka...",positive
3,8146,"Were I not with friends, and so cheap, I would...",negative
4,9435,For pure gothic vampire cheese nothing can com...,positive


In [37]:
# Distinct label values present before re-encoding.
df['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [38]:
# Keep only the reviews whose text is shorter than 200 characters.
is_short_review = df['review'].str.len().lt(200)
df = df[is_short_review]

In [39]:
# Number of reviews left after the length filter.
df.shape[0]

200

In [40]:
# Draw (up to) 200 reviews in random order. Capping n at len(df) avoids the
# ValueError DataFrame.sample raises when asked for more rows than exist, and the
# fixed random_state makes the selection reproducible across runs.
df = df.sample(n=min(200, len(df)), random_state=42)

In [41]:
# Re-encode the textual labels as the strings "1" (positive) and "0" (negative),
# which are the target texts the model is trained to generate.
df['sentiment'] = df['sentiment'].replace({'positive': "1", 'negative': "0"})

In [42]:
# Confirm that only the re-encoded label values remain.
df['sentiment'].unique()

array(['0', '1'], dtype=object)

In [43]:
# Instruction prefix placed in front of every review; add_prompt appends the
# review text directly after the trailing "review: ".
prompt="""given a review classify its sentiment as 0 or 1

review: """

In [44]:
def add_prompt(x):
  """Attach the instruction prompt to a single row.

  Args:
    x: row-like mapping with a 'review' entry (a DataFrame row when called
      through ``df.apply(..., axis=1)``).

  Returns:
    The same ``x``, now carrying a 'prompt' entry made of the module-level
    ``prompt`` followed by the review text.
  """
  x['prompt'] = prompt + x['review']
  return x

In [45]:
# Build the 'prompt' column row by row.
df = df.apply(add_prompt, axis=1)

In [46]:
# Peek at the first five rows, now including the 'prompt' column.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,review,sentiment,prompt
188,48903,"An unfunny, unworthy picture which is an undes...",0,given a review classify its sentiment as 0 or ...
97,14829,My first thoughts on this film were of using s...,0,given a review classify its sentiment as 0 or ...
126,27521,"Read the book, forget the movie!",0,given a review classify its sentiment as 0 or ...
120,8200,"If it wasn't for the terrific music, I would n...",0,given a review classify its sentiment as 0 or ...
106,34874,"Brilliant kung-fu scenes, loads of melodrama, ...",1,given a review classify its sentiment as 0 or ...


In [47]:
# Print the full text of the second prompt to check the template visually.
print(df['prompt'].iloc[1])

given a review classify its sentiment as 0 or 1

review: My first thoughts on this film were of using science fiction as a bad way to show naked women, althought not a brilliant story line it had quite a good ending


In [48]:
# Switch to the column names that preprocess_function reads ('input' / 'output').
df = df.rename(columns=dict(sentiment='output', prompt='input'))

In [49]:
# Persist the prepared prompt/label table; the next cell reads this file back.
df.to_excel('final_dataset.xlsx')

In [25]:
# Reload the prepared table written by the previous cell.
# index_col=0 reads the saved index back as the index instead of leaving it as an
# extra unnamed column. dtype keeps the labels as the strings "0"/"1":
# preprocess_function hands this column to the tokenizer as target *text*, so it
# must not be re-inferred as integers on load.
df = pd.read_excel('final_dataset.xlsx', index_col=0, dtype={'output': str})

In [50]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

### **TRAINING**

---



---



In [108]:
# Tokenizer for the google/flan-t5-large checkpoint that is fine-tuned below.
tokenizer=AutoTokenizer.from_pretrained("google/flan-t5-large")

In [52]:
# Hold out 10% of the rows for evaluation. The fixed random_state makes the split
# (and therefore the reported metrics) reproducible across runs.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

In [54]:
def preprocess_function(sample):
    """Tokenize a batch of prompts and their target labels for seq2seq training.

    Args:
        sample: batch mapping with an 'input' list (prompt texts) and an 'output'
            list (target texts).

    Returns:
        The tokenizer output for the prompts (padded/truncated to 60 tokens) with
        an added 'labels' entry: the target token ids (padded/truncated to 5
        tokens) in which every padding id is replaced by -100.
    """
    encoded_inputs = tokenizer(sample['input'], max_length=60, padding='max_length', truncation=True)
    encoded_targets = tokenizer(text_target=sample["output"], max_length=5, padding='max_length', truncation=True)

    # Swap padding ids for -100, the same value the data collator is configured
    # to use as label padding.
    pad_id = tokenizer.pad_token_id
    masked_targets = []
    for target_ids in encoded_targets["input_ids"]:
        masked_targets.append([-100 if token_id == pad_id else token_id for token_id in target_ids])

    encoded_inputs["labels"] = masked_targets
    return encoded_inputs

In [55]:
# Wrap both pandas splits as Hugging Face datasets.
train_dataset, test_dataset = (Dataset.from_pandas(split_frame) for split_frame in (df_train, df_test))

In [56]:
# Tokenize both splits in batches and drop the raw text/index columns so only the
# tokenizer outputs and labels remain from preprocess_function.
columns_to_drop = ["__index_level_0__", "input", "output"]
train_tokenized_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=columns_to_drop,
)
test_tokenized_dataset = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=columns_to_drop,
)

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

In [58]:
# Sanity check: the second training example should be padded to the fixed input length.
second_example = train_tokenized_dataset[1]
len(second_example['input_ids'])

60

In [59]:
# Load the pretrained seq2seq model that will be fine-tuned; device_map="auto"
# leaves device placement to the library.
model=AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-large',device_map="auto")

In [60]:
# Show GPU status and memory usage right after the model has been loaded.
!nvidia-smi

Sun May 28 03:13:56 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    27W /  70W |   4007MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [61]:
# Report the value returned by the model's get_memory_footprint().
footprint = model.get_memory_footprint()
print(footprint)

3132600320


In [62]:
# ROUGE scorer used by compute_metrics.
# NOTE(review): the targets are single "0"/"1" labels, so ROUGE-1 presumably acts
# as exact-match here and ROUGE-2 stays at 0 (see the training log); a plain
# accuracy metric may be clearer - confirm.
metric=evaluate.load('rouge')

In [63]:
def postprocess_text(preds, labels):
    """Normalise decoded texts into the one-sentence-per-line form passed to the metric.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        ``(preds, labels)`` where each string has been stripped of surrounding
        whitespace and had its sentences (as split by ``sent_tokenize``) joined
        with newlines.
    """
    def _one_sentence_per_line(texts):
        # Strip first, then put every detected sentence on its own line.
        return ["\n".join(sent_tokenize(text.strip())) for text in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

def compute_metrics(eval_preds):
    """Score generated predictions against the reference labels with ROUGE.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case its first
            element is used.

    Returns:
        dict: every score returned by ``metric.compute`` scaled to a percentage
        and rounded to 4 decimals, plus ``"gen_len"``, the mean number of
        non-padding tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a vocabulary id. It is written into the
    # labels by preprocess_function and can also show up in the predictions when
    # batches of different lengths are padded together, so replace it in both
    # arrays before decoding (a negative id cannot be decoded).
    preds = np.where(np.asarray(preds) != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    # Count only real tokens; after the replacement above, former -100 slots are
    # padding and no longer inflate the generated length.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [64]:
# -100 is the value preprocess_function writes in place of padding in the labels;
# the collator is told to pad labels with the same value.
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

In [65]:
# Fine-tuning configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # where checkpoints and logs are written
    output_dir="flan",
    logging_dir="flan_logs",
    report_to="tensorboard",
    # optimisation
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    fp16=False,
    # logging / evaluation / checkpoint cadence
    logging_strategy="steps",
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # evaluate with generated sequences so compute_metrics can decode them
    predict_with_generate=True,
)

# Trainer wiring: model, data splits, batching and the metric callback.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [66]:
# Run fine-tuning; evaluation and checkpointing are configured per epoch in training_args.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.0988,0.581937,90.0,0.0,90.0,90.0,2.45
2,0.1093,0.491127,90.0,0.0,90.0,90.0,2.45
3,0.0194,0.602103,90.0,0.0,90.0,90.0,2.45
4,0.0828,0.580385,90.0,0.0,90.0,90.0,2.45
5,0.001,0.587817,90.0,0.0,90.0,90.0,2.45


TrainOutput(global_step=900, training_loss=0.06120340941680802, metrics={'train_runtime': 841.1126, 'train_samples_per_second': 1.07, 'train_steps_per_second': 1.07, 'total_flos': 259286394470400.0, 'train_loss': 0.06120340941680802, 'epoch': 5.0})

### **PREDICTION**

---


In [67]:
# Ask PyTorch to release its cached GPU memory before another model is loaded below.
# NOTE(review): `model` and `trainer` still reference the trained weights at this
# point, so presumably little memory is actually freed - confirm.
torch.cuda.empty_cache()

In [76]:
from transformers import GenerationConfig

In [68]:
# Reload the fine-tuned weights from the checkpoint written at the final training
# step (the training output reports global_step=900).
# The keyword is `device_map`; it was previously misspelled `device_mpa`, so the
# intended automatic device placement was never requested.
# NOTE(review): training_args sets load_best_model_at_end=True, and checkpoint-900
# is the *last* checkpoint, not necessarily the best one - confirm which is intended.
model = AutoModelForSeq2SeqLM.from_pretrained('flan/checkpoint-900', device_map="auto")

In [121]:
# model=AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-large',device_map="auto")

In [118]:
# Classify one hand-written review with the currently loaded (fine-tuned) model.
prompt="""given a review classify its sentiment as 0 or 1

review: It is a bad sunny day
"""
# Move the encoded prompt to whichever device the model lives on instead of
# assuming CUDA device 0.
tokenized_prompt = tokenizer(prompt, return_tensors='pt').to(model.device)
# Greedy decoding (do_sample=False, a single beam). `temperature` only affects
# sampling, so it is deliberately not set here.
generation_config = GenerationConfig(
    max_new_tokens=5,
    num_beams=1,
    do_sample=False
)
output = model.generate(
    input_ids=tokenized_prompt.input_ids,
    attention_mask=tokenized_prompt.attention_mask,
    generation_config=generation_config,
    return_dict_in_generate=True,
    pad_token_id=model.config.pad_token_id,
    repetition_penalty=1.2
)
out = output.sequences
for i in out:
  print(tokenizer.decode(i, skip_special_tokens=True))