In [None]:
!pip install -q transformers bitsandbytes datasets
!pip install evaluate rouge-score
!huggingface-cli login

## Loading Model

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
# Checkpoint to run inference with; the stock chat model is kept for reference.
# base_model = "meta-llama/Llama-2-7b-chat-hf"
base_model = "abhi757/llama-2-7b-chat-paper-to-slides2"

# 4-bit NF4 quantisation, float16 compute, double quantisation disabled.
compute_dtype = torch.float16
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Load the quantised causal LM; the device map sends every module to device 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    quantization_config=quant_config,
)
# NOTE(review): use_cache=False looks carried over from a training script;
# it may slow down generation -- confirm it is wanted for inference.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Tokenizer of the same checkpoint; EOS doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): right padding is unusual for batched decoder-only generation --
# verify if the pipeline is ever run with batch_size > 1.
tokenizer.padding_side = "right"

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

## Prompt for Topic and Bullet point generation

In [14]:
# System half of the prompt. It opens a Llama-2 style "<s>[INST] <<SYS>>" block
# and closes the <<SYS>> section, but does not close the [INST] tag.
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics and generating summaries.
<</SYS>>
"""
# User half of the prompt. `{documents}` is the only placeholder and is filled
# with `prompt.format(documents=...)` in get_prediction. The model is asked for
# a short label (Task-1) and a list-of-sentences summary (Task-2), returned as
# JSON with the keys "label" and "summary".
# NOTE(review): this opens a second [INST] while only one [/INST] closes the
# prompt. Presumably this matches the format the checkpoint was fine-tuned on --
# confirm before "fixing" the tags or the wording of the instructions.
main_prompt = """
[INST]
I have a topic that contains the following documents:
{documents}

Based on the information about the topic above, you have two tasks.
Task-1: Please create a short label of this topic. Make sure you to only return the label and nothing more.
Task-2: Please create a short summmary of this topic describing the steps in the documents. Make sure that the you do not report more than six sentences in the list. Make sure to report the summary in a list of sentences. Make sure that each sentence does not exceed 10 words. Make sure to only return the list of sentences and nothing more.

Put this data into a JSON list with keys "label" and "summary".
[/INST] 
"""
# Full prompt template: system part followed by the user part.
prompt = system_prompt+main_prompt

# def add_prefix(example):
#     example["text"] = prompt.format(documents=example['documents'])
#     return example
# dataset = dataset.map(add_prefix)

In [4]:
from transformers import pipeline,logging
# Text-generation pipeline around the already loaded model and tokenizer.
# NOTE(review): max_length presumably bounds prompt + generated tokens, and
# temperature presumably only matters when sampling is enabled -- confirm
# against the checkpoint's generation config.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2048,
    temperature=0.1,
)

## Loading and Reformatting the Dataset

In [5]:
import xml.etree.ElementTree as ET
import glob, os
import json

def get_section_details(file):
  """Extract titled sections from a TEI paper XML file.

  Args:
    file: list of candidate paths (e.g. the result of ``glob.glob``); only the
      first entry is parsed. A single path (str / os.PathLike) is accepted too.

  Returns:
    A list of ``{'title': str, 'content': [str, ...]}`` dicts, one per TEI
    ``div`` element that has a non-empty first child (taken as the title) and
    at least one further child with text.

  Raises:
    FileNotFoundError: if ``file`` is empty (no matching XML was found).
  """
  if not file:
    raise FileNotFoundError("get_section_details: no paper XML file was given")
  path = file if isinstance(file, (str, os.PathLike)) else file[0]
  root = ET.parse(path).getroot()

  sections = []
  for div in root.findall('.//{http://www.tei-c.org/ns/1.0}div'):
    children = list(div)
    if not children:
      continue

    # The first child is treated as the section heading.
    title = "".join(children[0].itertext()).strip()

    content = []
    for child in children[1:]:
      # Nested divs are visited on their own by the './/' search above, so
      # including them here would duplicate their text in the parent section.
      if child.tag == div.tag:
        continue
      # itertext() keeps the text that follows inline children such as <ref>;
      # the plain .text attribute stops at the first child element.
      text = "".join(child.itertext()).strip()
      if text:
        content.append(text)

    if title and content:
      sections.append({'title': title, 'content': content})
  return sections

def get_slide_details(file):
  """Extract slides (title + text items) from a slide-deck XML file.

  Args:
    file: list of candidate paths (e.g. the result of ``glob.glob``); only the
      first entry is parsed. A single path (str / os.PathLike) is accepted too.

  Returns:
    A list of ``{'id': int, 'title': str, 'content': [str, ...]}`` dicts.
    ``id`` is the 1-based position of the ``div`` among the root's direct
    ``div`` children, so ids of skipped (content-less) slides are not reused.

  Raises:
    FileNotFoundError: if ``file`` is empty (no matching XML was found).
  """
  if not file:
    raise FileNotFoundError("get_slide_details: no slide XML file was given")
  path = file if isinstance(file, (str, os.PathLike)) else file[0]
  root = ET.parse(path).getroot()

  slides = []
  for slide_count, slide in enumerate(root.findall('div'), start=1):
    children = list(slide)
    if not children:
      continue

    # The first child is treated as the slide title.
    title = "".join(children[0].itertext()).strip()

    content = []
    for child in children[1:]:
      # itertext() also collects text nested inside inline markup; empty items
      # are dropped so downstream " ".join(...) calls never meet a None.
      text = "".join(child.itertext()).strip()
      if text:
        content.append(text)

    if content:
      slides.append({'id': slide_count, 'title': title, 'content': content})

  return slides

import numpy as np

def reformat_sections_for_llm(sections):
  """Prefix every paragraph with its section title, in place.

  Each entry of ``section['content']`` is overwritten with
  ``"title: <title>, content: <paragraph>"``. The same ``sections`` list
  (mutated) is returned.
  """
  template = "title: {title}, content: {content}"
  for section in sections:
    paragraphs = section['content']
    for position, paragraph in enumerate(paragraphs):
      paragraphs[position] = template.format(title=section['title'], content=paragraph)
  return sections

def get_section_partition_indices(token_lens,token_thresh=400):
  """Greedily choose the paragraph indices at which a section is cut.

  Paragraphs are accumulated until adding the next one would move the running
  token total further away from ``token_thresh``; the chunk is then closed at
  the current paragraph and the running total restarts from zero.

  Args:
    token_lens: per-paragraph token counts.
    token_thresh: target number of tokens per chunk.

  Returns:
    Ascending list of inclusive end indices, one per chunk. The last paragraph
    index is always the final entry; an empty ``token_lens`` yields ``[]``.
  """
  cumulative = np.cumsum(token_lens).tolist()
  if not cumulative:
    # Nothing to partition (the unguarded computation would return [-1]).
    return []

  partition_ids = []
  consumed = 0  # tokens already assigned to closed chunks
  for i in range(len(cumulative)-1):
    here = cumulative[i]-consumed
    ahead = cumulative[i+1]-consumed
    if abs(here-token_thresh) < abs(ahead-token_thresh):
      partition_ids.append(i)
      consumed = cumulative[i]
  partition_ids.append(len(cumulative)-1)
  return partition_ids
    
def partition_section(section_content,token_thresh = 400):
  """Split a section's paragraphs into consecutive chunks.

  Token counts come from the module-level ``tokenizer``; the cut points come
  from ``get_section_partition_indices``. Returns a list of paragraph lists
  (slices of ``section_content``).
  """
  paragraph_lengths = []
  for paragraph in section_content:
    paragraph_lengths.append(len(tokenizer.tokenize(paragraph)))
  boundaries = get_section_partition_indices(paragraph_lengths,token_thresh)

  chunks = []
  chunk_start = 0
  for last_index in boundaries:
    # Boundaries are inclusive, hence the +1 for the slice end.
    chunk_end = last_index+1
    chunks.append(section_content[chunk_start:chunk_end])
    chunk_start = chunk_end
  return chunks

def gen_documents_for_slides(sections,max_token_thresh=500,token_min_thresh=100):
  """Turn paper sections into the paragraph groups that each feed one slide.

  Args:
    sections: list of dicts whose ``'content'`` is a list of paragraph strings.
    max_token_thresh: sections with at most this many tokens are kept whole;
      longer ones are split with ``partition_section``.
    token_min_thresh: sections / chunks with this many tokens or fewer are
      dropped.

  Returns:
    A list of documents, each a list of paragraph strings.

  Token counts are taken with the module-level ``tokenizer``.
  """
  documents=[]
  for section in sections:
    num_tokens = len(tokenizer.tokenize("".join(section['content'])))
    # `<=` on the upper bound: a section with exactly max_token_thresh tokens
    # previously matched neither branch and was silently discarded.
    if token_min_thresh < num_tokens <= max_token_thresh:
      documents.append(section['content'])
    elif num_tokens > max_token_thresh:
      for docs in partition_section(section['content']):
        if len(tokenizer.tokenize("".join(docs))) > token_min_thresh:
          documents.append(docs)
  return documents


In [7]:
def get_prediction(pipe,slide_base_documents,prompt):
    """Generate one completion per document group.

    ``prompt`` is formatted once per entry of ``slide_base_documents`` (as the
    ``documents`` field), all prompts are sent to ``pipe`` in a single call, and
    the prompt text is removed from each first candidate's ``generated_text``.
    Returns the list of remaining completion strings.
    """
    filled_prompts = []
    for docs in slide_base_documents:
        filled_prompts.append(prompt.format(documents=docs))

    raw_outputs = pipe(filled_prompts)

    completions = []
    for position, candidates in enumerate(raw_outputs):
        full_text = candidates[0]['generated_text']
        completions.append(full_text.replace(filled_prompts[position],""))
    return completions

def convert_to_json(result):
    """Parse each generated string as JSON, skipping the ones that do not parse.

    Newlines are removed before parsing. For every entry that cannot be parsed
    a message with its position is printed and the entry is left out, so the
    returned list may be shorter than ``result``.
    """
    result_json=[]
    for i, res in enumerate(result):
        try:
            result_json.append(json.loads(res.replace('\n',"")))
        # Only parsing problems (invalid JSON, non-string entries) are treated
        # as "bad format"; KeyboardInterrupt and the like must still propagate.
        except (ValueError, TypeError, AttributeError):
            print("Slide No. ", i, 'not in correct format')
    return result_json

def evaluate_metric(prediction, target,metric):
    """
    Score all predicted slides against all target slides as two single texts.

    prediction : List of json objects with keys={'label','summary'}
    target: List of dictionaries with keys = {'id','title','content'}
    metric: object with a compute(predictions=..., references=...) method

    Returns whatever ``metric.compute`` returns.

    The first target slide is left out of the reference text.
    NOTE(review): presumably because it is the title slide -- confirm.

    Missing fields (absent keys or None values) contribute an empty string, a
    'summary' given as a plain string is used as-is, and prediction entries
    that are not dictionaries are ignored.
    """
    def _as_text(value):
        # Flatten one field to a single string.
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        if isinstance(value, (list, tuple)):
            return " ".join(_as_text(item) for item in value if item is not None)
        return str(value)

    # Concatenate all predicted slides into a single text
    slide_all_predicted_list=[]
    for res in prediction:
        if not isinstance(res, dict):
            continue
        slide_all_predicted_list.append(_as_text(res.get('label')))
        slide_all_predicted_list.append(_as_text(res.get('summary')))
    slide_all_predicted = " ".join(slide_all_predicted_list)

    # Concatenate all target slides (except the first) into a single text
    slide_all_target_list=[]
    for slide in target[1:]:
        slide_all_target_list.append(_as_text(slide.get('title')))
        slide_all_target_list.append(_as_text(slide.get('content')))
    slide_all_target = " ".join(slide_all_target_list)

    results = metric.compute(predictions=[slide_all_predicted], references=[slide_all_target])

    return results

def save_generated_slides(paper_index,predicted_slides,save_dir='/kaggle/working/fine-tune-prediction/'):
    """Write the predicted slides of one paper to ``<save_dir>/<paper_index>.json``.

    The file holds one JSON document per line (one line per slide). The target
    directory is created when it does not exist yet; an existing file is
    overwritten.

    Args:
        paper_index: identifier used as the file name stem.
        predicted_slides: iterable of JSON-serialisable objects.
        save_dir: output directory; defaults to the Kaggle working directory.
    """
    # Opening the file would fail with FileNotFoundError on a fresh working
    # directory, because nothing else creates the output folder.
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, str(paper_index)+'.json')
    with open(save_path, 'w') as file:
        for sample in predicted_slides:
            json.dump(sample, file)
            file.write('\n')

In [None]:
import evaluate
rouge = evaluate.load('rouge')
rouge_score_list = []   # one ROUGE result per processed paper
prediction_list=[]      # raw generated strings, one list per processed paper

# Papers 80..99 of the dataset are processed one after another.
for paper_index in range(80,100):
    dir_name = f'/kaggle/input/pdf2slides-dataset-100/dataset_0_99-20231206T111845Z-001/dataset_0_99/{paper_index}/'
    print(dir_name)

    # Locate the paper XML ('Paper*.xml') and the slide XML ('slide*.xml').
    prefix = 'Paper'
    file_paper = glob.glob(os.path.join(dir_name, f'{prefix}*.xml'))
    prefix = 'slide'
    file_slide = glob.glob(os.path.join(dir_name, f'{prefix}*.xml'))

    # Parse both files.
    sections = get_section_details(file_paper)
    slides = get_slide_details(file_slide)

    # Build the per-slide input documents from the paper sections.
    sections = reformat_sections_for_llm(sections)
    slide_base_documents = gen_documents_for_slides(sections)

    # Generate, keep the raw text, parse to JSON and store the parsed slides.
    predicted_slides = get_prediction(pipe,slide_base_documents,prompt)
    prediction_list.append(predicted_slides)
    predicted_slides_json = convert_to_json(predicted_slides)
    save_generated_slides(paper_index,predicted_slides_json)

    # Score the parsed predictions against the reference slides.
    rouge_score = evaluate_metric(predicted_slides_json, slides,rouge)
    rouge_score_list.append(rouge_score)
    print(rouge_score)
