In [None]:
!pip install -q transformers einops accelerate langchain bitsandbytes peft trl
!pip install InstructorEmbedding sentence-transformers
!huggingface-cli login

In [None]:
from InstructorEmbedding import INSTRUCTOR
# Instantiate the INSTRUCTOR embedding model from the 'hkunlp/instructor-large'
# checkpoint. `model_embedding` is the encoder handed to the retrieval helpers
# (get_section_to_slide_map / retrieve_section_id) in later cells.
model_embedding = INSTRUCTOR('hkunlp/instructor-large')

  from tqdm.autonotebook import trange


.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


In [None]:
# Reading dataset from xml files
# Generating dictionaries of sections and slides

import xml.etree.ElementTree as ET
import glob, os

def get_section_details(file):
  '''
  Parse a TEI paper XML file into a list of section dictionaries.

  Parameters
  ----------
  file : str, os.PathLike, or sequence of paths
    Either a single path, or a sequence of paths such as the list returned
    by glob.glob. When a sequence is given only its first entry is parsed.

  Returns
  -------
  list of dict
    One {'title': str, 'content': list of str} entry per TEI <div> element
    (searched at any depth) that has a non-empty title and at least one
    non-empty paragraph. The first child of a <div> is taken as its title and
    every following child as one paragraph.

  Raises
  ------
  IndexError
    If `file` is an empty sequence (e.g. a glob pattern matched nothing).
  '''
  if isinstance(file, (str, os.PathLike)):
    path = file
  else:
    if len(file) == 0:
      raise IndexError('get_section_details: no XML file path was given')
    path = file[0]

  root = ET.parse(path).getroot()
  div_tag = '{http://www.tei-c.org/ns/1.0}div'

  sections = []
  for div in root.findall('.//' + div_tag):
    # Nested <div> children are visited on their own by findall('.//...'),
    # so they are left out here to avoid duplicating their text.
    children = [child for child in div if child.tag != div_tag]
    if not children:
      continue

    # Element.text only holds the text before the first inline child, which
    # truncates paragraphs containing inline markup; itertext() yields the
    # complete text of the element and its descendants.
    title = ''.join(children[0].itertext())
    paragraphs = (''.join(child.itertext()) for child in children[1:])
    # Empty paragraphs carry no information and would otherwise surface as
    # None/'' entries in the content list.
    content = [text for text in paragraphs if text]

    if title and content:
      sections.append({'title': title, 'content': content})
  return sections

def get_slide_details(file):
  '''
  Parse a slide-deck XML file into a list of slide dictionaries.

  Parameters
  ----------
  file : str, os.PathLike, or sequence of paths
    Either a single path, or a sequence of paths such as the list returned
    by glob.glob. When a sequence is given only its first entry is parsed.

  Returns
  -------
  list of dict
    One {'id': int, 'title': str, 'content': list of str} entry per <div>
    that is a direct child of the root and has at least one non-empty content
    element. 'id' is the 1-based position of the <div> among all root-level
    <div> elements, so ids of skipped slides are not reused. The first child
    of a <div> is taken as the title ('' when it has no text) and every
    following child as one content item.

  Raises
  ------
  IndexError
    If `file` is an empty sequence (e.g. a glob pattern matched nothing).
  '''
  if isinstance(file, (str, os.PathLike)):
    path = file
  else:
    if len(file) == 0:
      raise IndexError('get_slide_details: no XML file path was given')
    path = file[0]

  root = ET.parse(path).getroot()

  slides = []
  for slide_id, slide in enumerate(root.findall('div'), start=1):
    children = list(slide)
    if not children:
      continue

    # Element.text stops at the first inline child and is None for empty
    # elements; itertext() returns the full text and '' for empty elements,
    # so downstream string concatenation / ' '.join never sees None.
    title = ''.join(children[0].itertext())
    items = (''.join(child.itertext()) for child in children[1:])
    content = [text for text in items if text]

    if content:
      slides.append({'id': slide_id, 'title': title, 'content': content})

  return slides



In [None]:
# Information Retrieval for Dataset Generation

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import json

def gen_query_for_slide(slides,slide_index=0):
  '''
  Build the [instruction, text] query pair for one slide.

  The text is the slide title followed by all of the slide's content items
  joined with single spaces. `slides` is a list of dicts with 'title' and
  'content' keys; `slide_index` selects the slide (first slide by default).
  '''
  slide = slides[slide_index]
  joined_content = ' '.join(slide['content'])
  return [
    'Represent the scientific texts for retrieving supporting documents:',
    'title: ' + slide['title'] + ', content: ' + joined_content,
  ]

# Section as one document of the corpus
def gen_section_corpus(sections):
  '''
  Build a retrieval corpus with one [instruction, text] entry per section.

  Each text is the section title followed by all of the section's paragraphs
  joined with single spaces. `sections` is a list of dicts with 'title' and
  'content' keys.
  '''
  instruction = 'Represent the scientific document for retrieval'

  def as_entry(section):
    body = ' '.join(section['content'])
    return [instruction, 'title: ' + section['title'] + ', content: ' + body]

  return [as_entry(section) for section in sections]

def gen_paragraph_corpus(sections):
  '''
  Build a retrieval corpus with one [instruction, text] entry per paragraph.

  Every paragraph of every section becomes its own entry, prefixed with the
  title of the section it belongs to. Entries keep section order, then
  paragraph order. `sections` is a list of dicts with 'title' and 'content'
  keys.
  '''
  instruction = 'Represent the scientific document for retrieval'
  return [
    [instruction, 'title: ' + section['title'] + ', content: ' + paragraph]
    for section in sections
    for paragraph in section['content']
  ]


def retrieve_section_id(model,corpus,slides,slide_index,corpus_type='section',corpus_embeddings=None):
  '''
  Find the corpus entry most similar to one slide.

  Parameters
  ----------
  model : object with an `encode` method
    Embedding model; `encode` is called with a list of [instruction, text]
    pairs.
  corpus : list
    Corpus entries as produced by gen_section_corpus / gen_paragraph_corpus.
  slides : list of dict
    Slide dictionaries; the query is built by gen_query_for_slide.
  slide_index : int
    Index of the slide to use as the query.
  corpus_type : {'section', 'paragraph'}
    Selects what is returned alongside the best index (see Returns).
  corpus_embeddings : array-like, optional
    Pre-computed result of `model.encode(corpus)`. Pass it when calling this
    function for many slides against the same corpus to avoid re-encoding
    the whole corpus on every call. When None the corpus is encoded here.

  Returns
  -------
  (best_index, best_score) when corpus_type == 'section', where best_score is
  the cosine similarity of the best match.
  (best_index, scores) when corpus_type == 'paragraph', where scores is the
  1-D array of cosine similarities between the slide and every corpus entry.

  Raises
  ------
  ValueError
    If corpus_type is neither 'section' nor 'paragraph'.
  '''
  # Fail before doing any (expensive) encoding; previously an unknown value
  # fell through and returned None, which broke tuple unpacking at call sites.
  if corpus_type not in ('section', 'paragraph'):
    raise ValueError(
      "corpus_type must be 'section' or 'paragraph', got %r" % (corpus_type,))

  query = gen_query_for_slide(slides,slide_index)
  query_embeddings = model.encode([query])
  if corpus_embeddings is None:
    corpus_embeddings = model.encode(corpus)

  # A single query is encoded, so the similarity matrix has exactly one row.
  scores = cosine_similarity(query_embeddings,corpus_embeddings)[0]
  retrieved_doc_id = np.argmax(scores)

  if corpus_type=='section':
    return retrieved_doc_id, scores[retrieved_doc_id]
  return retrieved_doc_id, scores

def get_section_to_slide_map(model,sections,slides,sim_thresh=0.9):
  '''
  Map each paper section to the slides whose best match is that section.

  For every slide except the first (index 0 is skipped — presumably the
  title slide; TODO confirm), the most similar section is retrieved with
  retrieve_section_id. The slide is kept only when its similarity score is
  strictly greater than `sim_thresh`.

  Returns
  -------
  dict
    Keys are section indices in ascending order; each value is the list of
    slide indices (ascending) matched to that section. Sections with no
    matching slide do not appear.
  '''
  corpus = gen_section_corpus(sections)

  # Group slide indices by section index directly. The earlier formulation
  # compared a Python list against each key with `==`, which only behaves
  # elementwise when the key is a NumPy scalar and silently produces empty
  # groups for plain ints.
  slides_by_section = {}
  for slide_index in range(1,len(slides)):
    section_index, sim_score = retrieve_section_id(model,corpus,slides,slide_index)
    if sim_score > sim_thresh:
      slides_by_section.setdefault(section_index, []).append(slide_index)

  return {k: slides_by_section[k] for k in sorted(slides_by_section)}

def get_section_slide_mapping_text(sections,slides,map_dict):
  '''
  Expand a section-to-slides index map into JSON-serialisable records.

  `map_dict` maps a section index to the list of slide indices matched to
  that section. One record is produced per key, in the map's iteration order,
  with these fields:

    section_id       section index converted to a string
    section_title    title of the section
    section_content  content list of the section
    slide_titles     titles of the matched slides
    slide_contents   content lists of the matched slides

  The returned list is meant to be stored as one JSON file per paper.
  '''
  def build_record(section_id, slide_indices):
    section = sections[section_id]
    matched_slides = [slides[index] for index in slide_indices]
    return {
      'section_id': str(section_id),
      'section_title': section['title'],
      'section_content': section['content'],
      'slide_titles': [slide['title'] for slide in matched_slides],
      'slide_contents': [slide['content'] for slide in matched_slides],
    }

  return [
    build_record(section_id, slide_indices)
    for section_id, slide_indices in map_dict.items()
  ]


### Pre-process paper text based on sections

In [None]:



# Similarity a slide must exceed to be attached to its best-matching section.
sim_thresh = 0.91

# Build and store the section -> slides mapping for each paper directory.
# NOTE(review): the range starts at 18 — presumably earlier papers were
# processed in a previous run; confirm before re-running from scratch.
for paper_index in range(18,100):
  dir_name = '/content/drive/MyDrive/dataset_0_99/'+str(paper_index)+'/'
  print(dir_name)

  # Each directory is expected to hold a 'Paper*.xml' and a 'slide*.xml' file;
  # the parsers read the first match of each glob.
  file_paper = glob.glob(os.path.join(dir_name, 'Paper*.xml'))
  file_slide = glob.glob(os.path.join(dir_name, 'slide*.xml'))
  sections = get_section_details(file_paper)
  slides = get_slide_details(file_slide)

  # Match slides to sections, then expand the index map into text records.
  map_dict = get_section_to_slide_map(model_embedding,sections,slides,sim_thresh)
  section_dict_list = get_section_slide_mapping_text(sections,slides,map_dict)

  # One JSON file per paper, written next to its XML sources.
  with open(dir_name+'section_slide_map.json', 'w') as file:
    json.dump(section_dict_list, file)


## Generate the top-k paragraphs for each slide and save them to disk

In [None]:
# Pre-process paper text based on paragraphs
corpus_type = 'paragraph'
# Number of best-matching paragraphs stored per slide.
top_k = 3

# For every paper, store the top_k most similar paragraphs for each slide.
# NOTE(review): the range starts at 61 — presumably earlier papers were
# processed in a previous run; confirm before re-running from scratch.
for paper_index in range(61,100):
  paper_list=[]
  dir_name = '/content/drive/MyDrive/dataset_0_99/'+str(paper_index)+'/'
  print(dir_name)
  prefix = 'Paper'
  file_paper = glob.glob(os.path.join(dir_name, prefix + '*.xml'))

  prefix = 'slide'
  file_slide = glob.glob(os.path.join(dir_name, prefix + '*.xml'))

  sections = get_section_details(file_paper)
  slides = get_slide_details(file_slide)
  corpus = gen_paragraph_corpus(sections)

  # Slide 0 is skipped (presumably the title slide; TODO confirm).
  for slide_index in range(1,len(slides)):
    slide_dict = {}
    # With corpus_type='paragraph' the second return value is the array of
    # similarities against every paragraph; the best index is not needed here.
    _, sim = retrieve_section_id(model_embedding,corpus,slides,slide_index,corpus_type)

    # Indices of the most similar paragraphs, best first. Sorting everything
    # and truncating also works when the paper has fewer than top_k
    # paragraphs, where np.argpartition(sim, -top_k) would raise.
    ind = np.argsort(sim)[::-1][:top_k]
    Documents = [corpus[index][1] for index in ind]

    slide_dict['title'] = slides[slide_index]['title']
    slide_dict['content'] = slides[slide_index]['content']
    slide_dict['documents']=Documents
    # Scores and indices are stored as the string form of the NumPy arrays;
    # the dataset-combining cell parses 'sim_scores' back from this format.
    slide_dict['sim_scores'] = str(sim[ind])
    slide_dict['document_index']=str(ind)

    paper_list.append(slide_dict)

  fileName = dir_name+'slide_document_map.json'
  with open(fileName, 'w') as file:
    json.dump(paper_list, file)


/content/drive/MyDrive/dataset_0_99/61/


## Combine dataset for training

In [None]:
import glob,json,os

# Training samples collected from all papers; each sample is a dict with
# 'title', 'summary' (the slide content) and 'documents' (source paragraphs).
data = []

# Minimum similarity a retrieved paragraph needs to be kept as a document.
score_thresh = 0.90

for paper_index in range(60):
  dir_name = '/content/drive/MyDrive/dataset_0_99/'+str(paper_index)+'/'
  map_path = os.path.join(dir_name, 'slide_document_map.json')

  # Context manager so the handle is closed after each paper is read.
  with open(map_path, 'r') as file:
    section_slide_map = json.load(file)

  for item in section_slide_map:
    # Discard slides with longer titles
    if len(item['title'].split()) >= 10:
      continue

    # 'sim_scores' is the string form of a NumPy array, e.g. "[0.93 0.91 0.9]".
    # Parse each number with float() rather than eval(), so text read from
    # disk is never executed as code.
    sim_scores = [float(num) for num in item['sim_scores'].replace("[","").replace("]","").split()]

    # Thresholding based on similarity scores: keep only documents that are
    # similar enough, and drop the slide entirely when none qualifies.
    documents = [
      document
      for document, score in zip(item['documents'], sim_scores)
      if score >= score_thresh
    ]
    if documents:
      data.append({
        'title': item['title'],
        'summary': item['content'],
        'documents': documents,
      })


In [None]:
# Despite its name, save_dir_name is the path of the output file itself.
save_dir_name = '/content/drive/MyDrive/dataset_0_99/finetune_dataset.json'

# JSON Lines layout: each sample is serialised on its own line.
with open(save_dir_name, 'w') as file:
  for sample in data:
    file.write(json.dumps(sample) + '\n')