In [1]:
import torch
from transformers import Trainer, TrainingArguments
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import os
from datasets import load_dataset,load_metric
# from evaluate import load_metric
from sklearn.model_selection import train_test_split

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import emoji
emoji_list = emoji.EMOJI_DATA.keys()
emoji_descriptions = [emoji.demojize(e, delimiters=("<", ">")) for e in emoji_list]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Samsum dataset
dataset = load_dataset("samsum")

train_data = dataset["train"]
validation_data = dataset["validation"] 
test_data = dataset["test"]

Found cached dataset samsum (/Users/sanjanajd/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)
100%|██████████| 3/3 [00:00<00:00, 123.66it/s]


In [5]:
# def summarize(tokenizer,model,text):
#     inputs = tokenizer(f"Summarize dialogue >>\n {text}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)

#     # Generate summary
#     summary_ids = model.generate(inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
# #     print(len(summary_ids[0]))
#     # Decode the summary
#     summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    
#     return summary
def summarize(tokenizer,model,text):
#     print(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}")
    inputs = tokenizer(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)
    
    # Generate summary
    summary_ids = model.generate(inputs = inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
    # Decode the summary
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    
    return summary


generated_summaries = []
actual_summaries = []
generated_summary_orignal = []
dialogue_list = []
#SAVED_MODEL_PATH = '/Users/sanjanajd/Desktop/Sanjana/NLP/Project/Bart_base_emoji/bart_base_full_finetune_emoji_save'
SAVED_TOK_PATH = '/Users/sanjanajd/Desktop/Sanjana/NLP/Project/flant5-base-lora/tokenizer-emoji_t5'
SAVED_MODEL_TOK = AutoTokenizer.from_pretrained(SAVED_TOK_PATH)#.to(device)
#SAVED_MODEL = AutoModelForSeq2SeqLM.from_pretrained(SAVED_MODEL_PATH).to(device)



peft_model_id = "/Users/sanjanajd/Desktop/Sanjana/NLP/Project/flant5-base-lora/flan_t5_base_lora_finetune_emoji_save_adapter"
config = PeftConfig.from_pretrained(peft_model_id)#.to(device)
combined_model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path).to(device)
combined_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
combined_model = PeftModel.from_pretrained(combined_model, peft_model_id).to(device)
combined_model.resize_token_embeddings(len(SAVED_MODEL_TOK))


load_16_bit=True
model_name = "google/flan-t5-base"
if load_16_bit:
    orignal_model = AutoModelForSeq2SeqLM.from_pretrained(model_name,torch_dtype=torch.float16).to(device)
else:
    orignal_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

orignal_model_tok = AutoTokenizer.from_pretrained(model_name)

text = """Sarah: Good morning, everyone. Let's dive straight into today's agenda. 
First up, progress on the marketing campaign. Dan, could you give us an update?
Dan: Sure, Sarah. We've finalized the campaign strategy and are currently in the 
process of creating the content calendar. We're on track to launch by the end of next month.
Sarah: Great to hear. Any roadblocks or concerns from your end?
Dan: Not at the moment. We're working closely with the creative team to ensure everything 
aligns with our objectives.
Sarah: Excellent. Next item on the agenda, the upcoming product launch. James, how are we 
looking on that front?
James: Things are shaping up well. We've conducted beta testing and received positive feedback. 
The final adjustments are being made, and we're gearing up for a successful launch next week.
Sarah: Fantastic news. Let's make sure all departments are aligned for a smooth rollout."""

####
summarize(SAVED_MODEL_TOK,combined_model,text)

model_name = "facebook/bart-base"
orignal_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
orignal_model_tok = AutoTokenizer.from_pretrained(model_name)


In [6]:
print(summarize(SAVED_MODEL_TOK,combined_model,text))

['Sarah and Dan are discussing progress on the marketing campaign. Dan has finalized the campaign strategy and is in the process of creating the content calendar. The next item is the product launch. James has conducted beta testing and received positive feedback.']


In [12]:
text = """Sarah: Good morning, everyone. Let's dive straight into today's agenda. 
First up, progress on the marketing campaign. Dan, could you give us an update?
Dan: Sure, Sarah. We've finalized the campaign strategy and are currently in the 
process of creating the content calendar. We're on track to launch by the end of next month.
Sarah: Great to hear. Any roadblocks or concerns from your end?
Dan: Not at the moment. We're working closely with the creative team to ensure everything 
aligns with our objectives.
Sarah: Excellent. Next item on the agenda, the upcoming product launch. James, how are we 
looking on that front?
James: Things are shaping up well. We've conducted beta testing and received positive feedback. 
The final adjustments are being made, and we're gearing up for a successful launch next week.
Sarah: Fantastic news. Let's make sure all departments are aligned for a smooth rollout."""
print(summarize(SAVED_MODEL_TOK,SAVED_MODEL,text))
print(summarize(orignal_model_tok,orignal_model,text))

['Sarah and Dan are working closely with the creative team on the marketing campaign. They are on track to launch by the end of next month. James is conducting beta testing and received positive feedback. ']
["Summarize dialogue >> Sarah: Good morning, everyone. Let's dive straight into today's agenda. __________________________________________________________First up, progress on the marketing campaign. Dan, could you give us an update? __________________________________________Dan: Sure, Sarah. We've finalized the campaign strategy and are currently in the __________________________________________________process of creating the content calendar. We're on track to launch by the end of next month. __________Sarah: Great to hear."]


In [8]:
text = """
Emily: Good morning, team. Let's start by reviewing the progress on the client project. 
Tom, could you give us an update?
Tom: Certainly, Emily. We've completed the initial research phase and are now moving into the design stage. 
Feedback from the client has been positive so far, and we're confident in meeting the project deadline.
Emily: That's great to hear, Tom. Keep up the good work. Next on the agenda, budget allocation for the 
upcoming quarter. Lisa, what's the status on that?
Lisa: We've analyzed the financial data from the previous quarter and have identified areas for optimization. 
I'll be circulating a proposed budget plan by the end of the day for everyone's review.
Emily: Excellent. Let's ensure we allocate resources wisely to maximize our ROI. Moving on, I'd like 
to discuss the recent market trends. Mike, any insights to share?
Mike: Yes, Emily. Our market analysis indicates a shift in consumer preferences towards sustainable 
products. I suggest we explore opportunities to integrate eco-friendly initiatives into our product 
line to stay ahead of the curve.
Emily: Agreed, Mike. Let's prioritize sustainability in our future endeavors. Before we adjourn, 
does anyone have any urgent matters to address?"""
print(summarize(SAVED_MODEL_TOK,SAVED_MODEL,text))
print(summarize(orignal_model_tok,orignal_model,text))

["Emily, Tom and Mike are reviewing the progress on the client project. The client has been positive so far. Lisa will be analyzing the financial data from the previous quarter and will send a proposed budget plan by the end of the day to everyone's review. "]
["Summarize dialogue >> _______________________________________________ _______________________________________________________________Emily: Good morning, team. Let's start by reviewing the progress on the client project. _______________________________________________________Tom, could you give us an update? _____________________________________________________________________________________________Tom: Certainly, Emily. We've completed the initial research phase and are now moving into the design stage. ___________________________________________________________________________________________________________________________Feedback from the client has been positive so far, and we're confident in meeting the project deadline"

In [4]:
import time
start_time = time.time()
i=1
j=0
for example in test_data:
    if i%10==0:
#         print()
        j+=10
        print(f"samples summarized:{j}\ttime:{time.time()-start_time}")
#         print(f"",)
#     print(example['dialogue'])
    generated_summary = summarize(SAVED_MODEL_TOK,SAVED_MODEL,example['dialogue'])
    generated_summaries.append(generated_summary[0])
    generated_summary_o = summarize(orignal_model_tok,orignal_model,example['dialogue'])
    generated_summary_orignal.append(generated_summary_o[0])
    actual_summaries.append(example["summary"])
    dialogue_list.append(example['dialogue'])
    
    i+=1
    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

samples summarized:10	time:10.299947261810303
samples summarized:20	time:20.773779153823853
samples summarized:30	time:31.32620930671692
samples summarized:40	time:40.521833419799805
samples summarized:50	time:49.939515829086304
samples summarized:60	time:59.53868269920349
samples summarized:70	time:68.67994832992554
samples summarized:80	time:78.37402582168579
samples summarized:90	time:88.44464993476868
samples summarized:100	time:98.30354690551758
samples summarized:110	time:106.31132197380066
samples summarized:120	time:116.96791076660156
samples summarized:130	time:126.02568197250366
samples summarized:140	time:136.3091561794281
samples summarized:150	time:145.3128833770752
samples summarized:160	time:154.7295835018158
samples summarized:170	time:164.3201379776001
samples summarized:180	time:173.97152996063232
samples summarized:190	time:183.81937289237976
samples summarized:200	time:194.03785586357117
samples summarized:210	time:204.5013666152954
samples summarized:220	time:214.6

In [7]:
text = """Sarah: Hey Mark, how was your weekend?

Mark: It was great, thanks for asking. I went hiking with some friends. How about you?

Sarah: Mine was relaxing. I stayed home and binge-watched a new series on Netflix.

Mark: Nice! Which one?

Sarah: "The Crown." Have you seen it?

Mark: Yeah, I watched a couple of episodes. It's pretty good, but I couldn't get into it as much as I thought I would.

Sarah: Really? I find it fascinating, especially the portrayal of the royal family's dynamics.

Mark: Yeah, that part is interesting. I guess I prefer something with more action.

Sarah: Fair enough. Different strokes for different folks, right?

Mark: Exactly. So, any plans for next weekend?

Sarah: Not really, just catching up on some reading. How about you?

Mark: I might go camping if the weather's nice. It's been a while since I've been out in nature.

Sarah: That sounds like fun. Hopefully, the weather cooperates.

Mark: Yeah, fingers crossed."""
summarize(SAVED_MODEL_TOK,SAVED_MODEL,text),summarize(orignal_model_tok,orignal_model,text),

(['Mark spent the weekend hiking and watching "The Crown" on Netflix. He watched a couple of episodes and liked the portrayal of the royal family\'s dynamics. He might go camping next weekend if the weather is nice.'],
 ['Summarize dialogue >> Sarah: Hey Mark, how was your weekend? What was your favorite part of the weekend?Mark: It was great, thanks for asking. I went hiking with some friends. How about you? What were your favorite parts of the day?Sarah: Mine was relaxing. I stayed home and binge-watched a new series on Netflix. It\'s called "The Crown."Mark: Nice! Which one? What\'s your favorite thing about it?Sarah'])

In [9]:
import pandas as pd


temp_df = pd.DataFrame({'finetune_summary':generated_summaries,'original_summary':generated_summary_orignal,'human_summary':actual_summaries,'dialogue':dialogue_list})
temp_df.to_csv("results_bart_base_fullfinetune_10.csv")

In [14]:
import gradio as gr
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import emoji
from peft import PeftModel, PeftConfig

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Loading the dataset and tokenizer

SAVED_TOK_PATH = '/Users/sanjanajd/Desktop/Sanjana/NLP/Project/flant5-base-lora/tokenizer-emoji_t5'
tokenizer = AutoTokenizer.from_pretrained(SAVED_TOK_PATH)

# Configuring and loading the PEFT model
peft_model_id = "/Users/sanjanajd/Desktop/Sanjana/NLP/Project/flant5-base-lora/flan_t5_base_lora_finetune_emoji_save_adapter"
config = PeftConfig.from_pretrained(peft_model_id)
base_model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path).to(device)
model = PeftModel.from_pretrained(base_model, peft_model_id).to(device)
model.resize_token_embeddings(len(tokenizer))

def summarize(tokenizer, model, text):
    text = emoji.demojize(text, delimiters=('<', '>'))
    inputs = tokenizer(f"Summarize dialogue >>\n {text}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)
    summary_ids = model.generate(inputs=inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    return summary[0]

def summarize_dialogue(text):
    return summarize(tokenizer, model, text)


In [16]:
iface = gr.Interface(
    fn=summarize_dialogue,
    inputs=gr.Textbox(lines=10, placeholder="Enter dialogue here...", label="Input Dialogue"),
    title="Flan t5 fine tuned using LoRA on SAMsum dataset",
    outputs=gr.Textbox(label="Generated Summary")
)

iface.launch(share=True)

Running on local URL:  http://127.0.0.1:7871
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Running on public URL: https://1f55ac58e23d9301ce.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


