In [1]:
!pip install -q transformers einops accelerate langchain bitsandbytes peft trl
!pip install InstructorEmbedding sentence-transformers

Installing collected packages: sentencepiece, InstructorEmbedding, sentence-transformers
Successfully installed InstructorEmbedding-1.0.1 sentence-transformers-2.2.2 sentencepiece-0.1.99


In [2]:
!huggingface-cli login

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
from InstructorEmbedding import INSTRUCTOR
model_embedding = INSTRUCTOR('hkunlp/instructor-large')

  from tqdm.autonotebook import trange


.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

base_model = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [5]:
import xml.etree.ElementTree as ET
import glob, os
import json

def get_section_details(file):
  tree = ET.parse(file[0])
  root = tree.getroot()

  sections = []

  for neighbor in root.findall('.//{http://www.tei-c.org/ns/1.0}div'):
    i=0
    section_temp = {}
    content=[]
    for head in neighbor:
      if (i==0):
        # print('Section Title: ', head.text,"\n")
        section_temp['title'] = head.text
        # print("i = ",i)
      else:
        content.append(head.text)
      i+=1
    section_temp['content'] = content
    if content!=[] and section_temp['title']!=None:
      sections.append(section_temp)
  return sections

def get_slide_details(file):
  tree = ET.parse(file[0])
  root = tree.getroot()

  slides = []
  slide_count=0

  for slide in root.findall('div'):
    slide_count+=1
    slide_temp = {}
    slide_temp['id'] = slide_count

    i=0
    content=[]
    for head in slide:
      if (i==0):
        # print('Section Title: ', head.text,"\n")
        slide_temp['title'] = head.text
      else:
        content.append(head.text)
      i+=1
    slide_temp['content'] = content
    if(content):
      slides.append(slide_temp)

  return slides

import numpy as np

def reformat_sections_for_llm(sections):
  para_format  = "title: {title}, content: {content}"
  for section in sections:
    for i in range(len(section['content'])):
      section['content'][i]=para_format.format(title=section['title'],content=section['content'][i])
  return sections

def get_section_partition_indices(token_lens,token_thresh=400):
  token_lens = np.cumsum(token_lens)
  partition_ids = []
  for i in range(len(token_lens)-1):
    if (abs(token_lens[i]-token_thresh) < abs(token_lens[i+1]-token_thresh)):
      partition_ids.append(i)
      token_lens-=token_lens[i]
  partition_ids.append(len(token_lens)-1)
  return partition_ids

def partition_section(section_content,token_thresh = 400):
  token_lens = [len(tokenizer.tokenize(para)) for para in section_content]
  partition_ids = get_section_partition_indices(token_lens,token_thresh)
  start_id = 0
  documents = []
  for id in partition_ids:
    documents.append(section_content[start_id:id+1])
    start_id = id+1
  return documents

def gen_documents_for_slides(sections):
  documents=[]
  max_token_thresh = 500
  token_min_thresh = 100
  for section in sections:
    num_tokens = len(tokenizer.tokenize("".join(section['content'])))
    if num_tokens<max_token_thresh and num_tokens > token_min_thresh:
      documents.append(section['content'])
      # Remember to append section title
    elif num_tokens>max_token_thresh:
      section_documents = partition_section(section['content'])
      for docs in section_documents:
        if len(tokenizer.tokenize("".join(docs)))>token_min_thresh:
          documents.append(docs)
  return documents



In [6]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def get_query(caption):
  query = []
  query_instruction = 'Represent the scientific texts for retrieving supporting documents:'
  query.append(query_instruction)
  query.append(caption)
  return query

def get_corpus(slides):
  corpus = []
  corpus_instruction = 'Represent the scientific document for retrieval'
  for slide in slides:
    corpus_temp = []
    corpus_temp.append(corpus_instruction)
    corpus_temp.append(" ".join(slide))
    corpus.append(corpus_temp)
  return corpus

In [12]:
dir_name = '/content/drive/MyDrive/paper2slides/97'
print(dir_name)
prefix = 'Paper'
file_paper = glob.glob(os.path.join(dir_name, prefix + '*.xml'))

prefix = 'slide'
file_slide = glob.glob(os.path.join(dir_name, prefix + '*.xml'))

sections = get_section_details(file_paper)
slides = get_slide_details(file_slide)

sections = reformat_sections_for_llm(sections)
slide_base_documents = gen_documents_for_slides(sections)

print('No. of slides: ', len(slide_base_documents))

/content/drive/MyDrive/paper2slides/97
No. of slides:  34


In [8]:
import json
fig_file_name = '/content/drive/MyDrive/paper2slides/97/fig_captions.json'

fig_captions = json.load(open(fig_file_name,'r'))
captions = list(fig_captions.values())


In [10]:
corpus = get_corpus(slide_base_documents)


In [13]:

for caption in captions:
  query = get_query(caption)
  query_embeddings = model_embedding.encode([query])
  corpus_embeddings = model_embedding.encode(corpus)
  similarities = cosine_similarity(query_embeddings,corpus_embeddings)
  retrieved_doc_id = np.argmax(similarities)
  sim = similarities[0][retrieved_doc_id]
  if (sim>0.9):
    print('Caption: ', caption)
    print('Slide no. ', retrieved_doc_id,' Similarity score: ',sim)

Caption:  Figure 1: A visual depiction of multiple memory protection domains within a single shared address space.
Slide no.  0  Similarity score:  0.90422034
Caption:  Figure 4: How an address indexes the multi-level permissions table (MLPT).
Slide no.  6  Similarity score:  0.90970373
Caption:  Figure 6: A MLPT entry consisting of a permissions vector. User segments are broken up into individual word permissions. Type (1)
Slide no.  6  Similarity score:  0.93691117
Caption:  Figure 8: The bit allocation for a mini-SST permission table entry.
Slide no.  7  Similarity score:  0.915568
Caption:  Figure 11: Using memory protection and segment translation to implement zero-copy networking. The network interface card DMAs packets into the kernel. The kernel exports the packets to an untrusted client by creating segments for the payload of the packets. Segment translation is used to present the illusion to the client that the packet payloads are contiguous in memory at 0x1000-0x12FF.
Slide 

In [15]:
gen_slide_path = '/content/drive/MyDrive/paper2slides/prediction/97.json'

slides =[]
with open(gen_slide_path, 'r') as file:
    for line in file:
        slides.append(json.loads(line))

In [31]:

slide_index = 7
print(slides[slide_index]['label'])
for line in slides[slide_index]['summary']:
  print(line)

Mini-SST Entries
Create efficient MLPT using mini-SST entries as primary type.
Mini-SST entries can contain overlapping address ranges.
When entry owned by one range is changed, other entries that overlap must be updated.
Entries overlapping modified user segment must be flushed from PLB.
Can design efficient MLPT using mini-SST entries.
Mini-SST format reserves top two bits for entry type tag.


dict_keys(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'])

array([0.3261302 , 0.08254741, 0.10961397, 0.10961397])