In [1]:
!pip install langchain -q
!pip install faiss-gpu -q
!pip install adapter-transformers -q
!pip install openai -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m90.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h

# Params

The researcher will write a survey about some topic. In this pipeline, we consider this topic as the query for a database of scientific papers. In this implementation, the Semantic Scholar API is used to access this database.

He/she must also provide search criteria that will be used to narrow their search. For example, he/she migth be interested only in papers from a certain date.

In [111]:
####################################
# INPUT PARAMS DEFINED BY THE USER #
####################################
DEBUG = True
SURVEY_TOPIC_QUERY = 'text neural information retrieval'

SURVEY_FILTERS = {
  "fields_of_study": 'Computer Science',
  "year": '2020-2023',
  "only_open_access": True,
  "max_number_of_papers": 9999, # should be less than 10000
}

SURVEY_STRUCTURE = {
  "n_sections": [5, 3], # Recursive -> first level is 5 sections, second level is 3 sections. Arbitrary levels are allows
  "n_papers_to_suggest_title_section": 10, # top_k1 in paper
  "max_papers_per_section_as_ref": 20,     # top_k2 in paper
  "n_papers_to_cluster_in_subsections": 50, # top_k3 in paper, should be less than the total number of papers in the first level
}

In [44]:
from getpass import getpass

##########################################
# INPUT PARAMS DEFINED BY THE PROGRAMMER #
##########################################
# Batch size to extract the embeddings of the papers
PARAMS = {
  'batch_size': 32,
  'OPENAI_API_KEY': '',
  'SEMANTIC_SCHOLAR_API': '',
  'gpt_model_name': 'gpt-3.5-turbo-0613'
}

PARAMS['OPENAI_API_KEY'] = getpass('OpenAI API key: ')
PARAMS['SEMANTIC_SCHOLAR_API'] = getpass('Semantic Scholar API key: ')

OpenAI API key: ··········
Semantic Scholar API key: ··········


# Structure of a survey

In [46]:
# The sections list has the following structure:
#sections = [
#    {
#        'section_title': 'TITILE 1',
#        'papers': [],
#        'subsections': [
#            {
#                'section_title': 'SUBTITLE 1.1',
#                'papers': []
#            },
#            ...
#        ]
#    },
#    ...
#]

In [45]:
# Create empty sections structure:
def create_empty_sections(number_of_sections):
  secs = []
  for i in range(number_of_sections):
    new_section = {"section_title": "", "papers": [], "subsections": []}
    secs.append(new_section)
  return secs

def populate_with_empty_sections(list_sections, n_sections_in_each_level):
  if len(n_sections_in_each_level) == 0:
    return

  sections_in_level = create_empty_sections(n_sections_in_each_level[0])
  list_sections.extend(sections_in_level)

  for sec_in_level in sections_in_level:
    populate_with_empty_sections(sec_in_level['subsections'], n_sections_in_each_level[1:])

sections = []
populate_with_empty_sections(sections, SURVEY_STRUCTURE['n_sections'])

# Initial search - Get survey papers

A partir dessa entrada, fazemos uma busca inicial de artigos usando a API do Semantic Scholar:

In [47]:
import requests
import json
import pickle
import time

def search_by_keywords(query,
                       fields='url,title,venue,year,authors,abstract,openAccessPdf,citationCount,referenceCount,publicationTypes,journal,tldr,publicationDate',
                       fieldsOfStudy='Computer Science',
                       year='2020-2023',
                       openAccessPdf=True,
                       offset=0,
                       limit=100): #limit should be <= 100
  query_openaccess = '&openAccessPdf' if openAccessPdf else ''
  url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={query}&fields={fields}&fieldsOfStudy={fieldsOfStudy}&year={year}{query_openaccess}&offset={offset}&limit={limit}'
  headers = {"x-api-key": PARAMS['SEMANTIC_SCHOLAR_API']} if PARAMS['SEMANTIC_SCHOLAR_API'] else {}
  time.sleep(1)
  return requests.get(url, headers=headers).json()


def search_all_by_keywords(query="", fieldsOfStudy="Computer Science", year='2020-2023', openAccessPdf=True, maxNumberOfPapers=1000):
  offset = 0
  limit = min(100, maxNumberOfPapers)
  total = 1
  all_papers = []

  while True:
    result = search_by_keywords(query, fieldsOfStudy=fieldsOfStudy, year=year, openAccessPdf=openAccessPdf, offset=offset, limit=limit)
    all_papers.extend(result['data'])
    total = result['total']

    if (DEBUG):
      print(f'Searching from {offset} to {offset+limit}. Total: {total}')

    offset += limit

    if offset > total or offset >= maxNumberOfPapers:
      break

    if (offset + limit) >= maxNumberOfPapers:
      limit = maxNumberOfPapers - offset

  return all_papers


def save_file(obj, file_name):
  with open(file_name, 'wb') as f:
    pickle.dump(obj, f)

In [112]:
%%time
# Check total
query=SURVEY_TOPIC_QUERY
print(query)
print(search_by_keywords(query=query,
                   fieldsOfStudy=SURVEY_FILTERS['fields_of_study'],
                    year=SURVEY_FILTERS['year'],
                    openAccessPdf=SURVEY_FILTERS['only_open_access'],
                    offset=0, limit=10))


text neural information retrieval
{'total': 3241, 'offset': 0, 'next': 10, 'data': [{'paperId': '0385e1986160069417c3fa497b08441c45ad150a', 'url': 'https://www.semanticscholar.org/paper/0385e1986160069417c3fa497b08441c45ad150a', 'title': 'Continual Learning of Long Topic Sequences in Neural Information Retrieval - abstract', 'abstract': None, 'venue': 'Joint Conference of the Information Retrieval Communities in Europe', 'year': 2022, 'referenceCount': 44, 'citationCount': 2, 'openAccessPdf': {'url': 'http://arxiv.org/pdf/2201.03356', 'status': 'GREEN'}, 'tldr': {'model': 'tldr@v2.0.0', 'text': 'This paper proposes a dataset based upon the MSMarco corpus aiming at modeling a long stream of topics as well as IR property-driven controlled settings and in-depth analyzes the ability of recent neural IR models while continually learning those streams.'}, 'publicationTypes': ['JournalArticle'], 'publicationDate': '2022-01-10', 'journal': {'name': 'ArXiv', 'volume': 'abs/2201.03356'}, 'author

In [113]:
%%time

def get_survey_papers(query=SURVEY_TOPIC_QUERY, survey_filters=SURVEY_FILTERS):
  return search_all_by_keywords(query=query,
                                fieldsOfStudy=survey_filters['fields_of_study'],
                                year=survey_filters['year'],
                                openAccessPdf=survey_filters['only_open_access'],
                                maxNumberOfPapers=survey_filters['max_number_of_papers'])

all_survey_papers = get_survey_papers(query=SURVEY_TOPIC_QUERY, survey_filters=SURVEY_FILTERS)

Searching from 0 to 100. Total: 3241
Searching from 100 to 200. Total: 3241
Searching from 200 to 300. Total: 3241
Searching from 300 to 400. Total: 3241
Searching from 400 to 500. Total: 3241
Searching from 500 to 600. Total: 3241
Searching from 600 to 700. Total: 3241
Searching from 700 to 800. Total: 3241
Searching from 800 to 900. Total: 3241
Searching from 900 to 1000. Total: 3241
Searching from 1000 to 1100. Total: 3229
Searching from 1100 to 1200. Total: 3229
Searching from 1200 to 1300. Total: 3229
Searching from 1300 to 1400. Total: 3229
Searching from 1400 to 1500. Total: 3229
Searching from 1500 to 1600. Total: 3229
Searching from 1600 to 1700. Total: 3229
Searching from 1700 to 1800. Total: 3229
Searching from 1800 to 1900. Total: 3229
Searching from 1900 to 2000. Total: 3229
Searching from 2000 to 2100. Total: 3229
Searching from 2100 to 2200. Total: 3229
Searching from 2200 to 2300. Total: 3229
Searching from 2300 to 2400. Total: 3229
Searching from 2400 to 2500. Total: 3

# Extract embeddings of survey papers

Define a class to extract the embeddings using Specter

In [114]:
# Based on https://github.com/hwchase17/langchain/blob/4379bd4cbb8482e70d8936f747abd5ae7663f977/langchain/embeddings/huggingface.py#L16

from torch import cuda, bfloat16
from pydantic import BaseModel, Extra, Field
from langchain.embeddings.base import Embeddings
from typing import Any, Dict, List, Optional
import torch
from transformers import AutoAdapterModel, AutoTokenizer, AutoModel
import numpy as np
from tqdm.auto import tqdm
#import transformers

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

class SpecterEmbeddings(BaseModel, Embeddings):

  """Key word arguments to pass to the model."""
  encode_kwargs: Dict[str, Any] = Field(default_factory=dict)

  def __init__(self, **kwargs: Any):
    super().__init__(**kwargs)

    self.tokenizer = AutoTokenizer.from_pretrained('allenai/specter2')
    self.model = AutoModel.from_pretrained('allenai/specter2')

    self.model.load_adapter("allenai/specter2_proximity", source="hf", load_as="specter2_proximity", set_active=False)
    self.model.load_adapter("allenai/specter2_adhoc_query", source="hf", load_as="adhoc_query", set_active=False)

    self.device = device
    self.model.eval()
    self.model.to(self.device)

  @torch.no_grad()
  def embed_documents(self, texts: List[str]) -> List[List[float]]:
    """Compute doc embeddings using a HuggingFace transformer model.

    Args:
        texts: The list of texts to embed.

    Returns:
        List of embeddings, one for each text.
    """

    self.model.set_active_adapters(None)
    self.model.set_active_adapters("specter2_proximity")

    all_embeddings = []

    batch_size = 32
    show_progress_bar = True

    if 'batch_size' in self.encode_kwargs:
      batch_size = self.encode_kwargs['batch_size']
    if 'show_progress_bar' in self.encode_kwargs:
      show_progress_bar = self.encode_kwargs['show_progress_bar']

    # sort text for less padding
    length_sorted_idx = np.argsort([-len(sen) for sen in texts])
    texts_sorted = [texts[idx] for idx in length_sorted_idx]

    for start_index in tqdm(range(0, len(texts_sorted), batch_size), desc="Batches", disable=not show_progress_bar):
      texts_batch = texts_sorted[start_index:start_index+batch_size]

      inputs = self.tokenizer(texts_batch, padding=True, truncation=True,
                              return_tensors="pt", return_token_type_ids=False, max_length=512)

      output = self.model(**inputs.to(self.device))
      # take the first token in the batch as the embedding
      embeddings = output.last_hidden_state[:, 0, :]

      all_embeddings.extend(embeddings.tolist())

    return all_embeddings

  class Config:
      """Configuration for this pydantic object."""

      extra = Extra.allow

  @torch.no_grad()
  def embed_query(self, text: str) -> List[float]:
    """Compute query embeddings using a HuggingFace transformer model.

    Args:
        text: The text to embed.

    Returns:
        Embeddings for the text.
    """
    self.model.set_active_adapters(None)
    self.model.set_active_adapters("adhoc_query")

    inputs = self.tokenizer(text, padding=True, truncation=True,
                            return_tensors="pt", return_token_type_ids=False, max_length=512)

    output = self.model(**inputs.to(self.device))
    # take the first token in the batch as the embedding
    embeddings = output.last_hidden_state[:, 0, :]

    return embeddings.squeeze(0).tolist()

In [115]:
batch_size_embeddings = 1 if device == 'cpu' else PARAMS['batch_size']

embeddings = SpecterEmbeddings(encode_kwargs={'batch_size': batch_size_embeddings})

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Define a list of Document containing the page_content and metadata of all survey papers. The metadata is the paper structured as returned by the Semantic Scholar.

In [116]:
from langchain.schema import Document

all_documents = []
all_documents_str = []
for paper in all_survey_papers:
  title = paper['title']
  abstract = paper['abstract']
  # Replace [SEP] to {sep} and None to ''
  title = title.replace('[SEP]', '{sep}') if title is not None else ''
  abstract = abstract.replace('[SEP]', '{sep}') if abstract is not None else ''

  contents = f'{title} [SEP] {abstract}'

  all_documents_str.append(contents)
  all_documents.append(Document(page_content=contents, metadata=paper))

In [117]:
%%time

all_documents_embeddings = embeddings.embed_documents(all_documents_str)

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

CPU times: user 1min 2s, sys: 180 ms, total: 1min 2s
Wall time: 1min 1s


In [118]:
%time
from langchain.vectorstores import FAISS

vector_db_all_documents = FAISS.from_documents(all_documents, embeddings)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.2 µs


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

# Create sections

Method to split a list of documents embeddings in clusters:

In [119]:
from sklearn.cluster import SpectralClustering, KMeans
import numpy as np
from collections import defaultdict

def split_documents_into_clusters(document_embeddings, n_clusters, random_state=42):
  # Create model
  kmeans_model = KMeans(n_clusters=n_clusters, random_state=random_state, n_init='auto')

  # Fit
  kmeans_model.fit(document_embeddings)

  # Cluster labels
  labels = kmeans_model.labels_

  # Centroids
  cluster_centers = kmeans_model.cluster_centers_
  distance_to_center = np.linalg.norm(document_embeddings - cluster_centers[labels], axis=1)

  # Split documents into clusters. And desc-sort using the distance to the centroid of the cluster
  index_docs_in_cluster = defaultdict(list)
  distances_docs_in_cluster = defaultdict(list)
  for idx_doc, label in enumerate(labels):
    index_docs_in_cluster[label].append(idx_doc)
    distances_docs_in_cluster[label].append(distance_to_center[idx_doc])

  # Sort
  for cluster in index_docs_in_cluster:
    index_docs_in_cluster[cluster] = [idx_doc for _, idx_doc in sorted(zip(distances_docs_in_cluster[cluster], index_docs_in_cluster[cluster]), reverse=True)]
    if DEBUG:
      print(f"Cluster {cluster}: {len(index_docs_in_cluster[cluster])}")

  return index_docs_in_cluster

Method to suggest the name of a section given a list of papers:

In [120]:
from langchain.prompts import (
    ChatPromptTemplate,
    PromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

# SYSTEM MESSAGE
system_template = "You are a renowned scientist who is writing a survey on '{survey_topic}'. You are currently writing a section about '{query_section}'"
system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)

# FIRST HUMAN MESSAGE - EXPLAINING THE TASK
human_template_task = """\
I will send you a list of title and abstract of scientific articles. Most of them cover a specific topic about the section '{query_section}'.\n\
Your task is to find out what this topic is and suggest a good title for a section in a scientific survey that addresses it.\n\
You should also explain your reasoning.\n\
Your answer should be a valid RFC8259 compliant JSON object with three properties:\n\
The first property, called "topic", describes the main topic that is a subset of '{query_section}' that are present in most abstracts.\n\
The second property, called "title", is the title of the section that will cover this topic and must be clearly related to the property "topic".\n\
The third property is called "reasoning" and should contains your reasoning to choose this topic as an answer.\n\
Use this format: \
\n\
{{\n\
  "topic": {{TOPIC}},\n\
  "title": {{TITLE}},\n\
  "reasoning": {{REASONING}}\n\
}}\n\

Do you understand?
"""
#The first property, called "topic", describes the topic and MUST be a subset of '{query_section}'. \
#The second property, called "title", is the title of the section that will cover this topic and must be clearly related to the property "topic". \
#The last property is called "reasoning" and should contains your reasoning to choose this topic as an answer. \

human_message_prompt_task = HumanMessagePromptTemplate.from_template(human_template_task)

# FIRST AI ANSWER - AGREEING
ai_message_prompt_yes = AIMessagePromptTemplate.from_template('Sure, send me the list and I will give you what you need.')

# SECOND HUMAN MESSAGE - ABSTRACT AND TITLE
# This is a variable message that depends on the papers in the section. Papers has the structured returned by the semantic scholar
def text_message_human_prompt_papers_in_section(papers):
  message = ''
  for paper in papers:
    if (paper['abstract'] is None or paper['title'] is None):
      continue
    # Trunc first 3000 letters of the abstract
    message = message + f"Title: {paper['title']}\nAbstract: {paper['abstract'][0:3000]}\n\n"

  # Encode "{" and "}" from the message
  message = message.replace("{", "{{")
  message = message.replace("}", "}}")
  return HumanMessagePromptTemplate.from_template(message)

In [121]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
import ast

# papers is a list of Document
def suggest_title_for_section(papers, query_section):
  llm_gpt = ChatOpenAI(temperature=0, openai_api_key=PARAMS['OPENAI_API_KEY'], model_name=PARAMS['gpt_model_name'])

  message_few_shot_prompt = text_message_human_prompt_papers_in_section(papers)

  # Now, generate the chat messages for this subsection
  chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt,
                                                  human_message_prompt_task,
                                                  ai_message_prompt_yes,
                                                  message_few_shot_prompt])

  question_chain = LLMChain(llm=llm_gpt, prompt=chat_prompt, verbose=True)
  #chat_messages = chat_prompt.format_prompt(survey_topic=SURVEY_TOPIC_QUERY, query_section=query_section).to_messages()
  result_gpt = question_chain.run(survey_topic=SURVEY_TOPIC_QUERY, query_section=query_section)

  if (DEBUG):
    print(result_gpt)

  open_curly_brackets = result_gpt.find('{')
  close_curly_brackets = result_gpt.rfind('}')+1
  json_str = result_gpt[open_curly_brackets:close_curly_brackets]
  json_str = json_str.replace('true,', 'True,').replace('false,', 'False,')
  return ast.literal_eval(json_str)

The first level of sections use SURVEY_FILTERS['max_number_of_papers'] to define the clusters

The other levels use SURVEY_STRUCTURE['n_papers_to_cluster_in_subsections'] to define the clusters. In this case, we search for the super section title in a database containing SURVEY_FILTERS['max_number_of_papers'] papers and retrieve only the top SURVEY_STRUCTURE['n_papers_to_cluster_in_subsections'] papers to define the cluster.

In [122]:
def populate_sections_with_suggested_titles(sections=sections,
                                            super_sections_docs=all_documents,
                                            super_sections_docs_embeddings=all_documents_embeddings,
                                            query_super_section=SURVEY_TOPIC_QUERY,
                                            random_state=42):
  # Get the number of sections in this level
  n_sec_in_level = len(sections)
  # Split the documents into {n_sec_in_level} clusters
  index_docs_in_section = split_documents_into_clusters(super_sections_docs_embeddings, n_sec_in_level, random_state)

  if (DEBUG):
    print(f"Extracting {n_sec_in_level} sections in this level:")

  # Get a suggestion title for each section
  for i_section in range(n_sec_in_level):
    # Get the suggested title for each section
    docs_to_suggest_title_section = [super_sections_docs[i].metadata for i in index_docs_in_section[i_section]]
    docs_to_suggest_title_section = docs_to_suggest_title_section[0:SURVEY_STRUCTURE['n_papers_to_suggest_title_section']]
    result_for_section = suggest_title_for_section(docs_to_suggest_title_section, query_super_section)
    sections[i_section]['section_title'] = result_for_section['title']

    # Now that we have a title for the section, we can use the vector_db to retrieve the references for this section with
    # greater precision
    # Set the query subsection as all the name of the sections untill now, but removing the survey_topic_query of the variable
    # We don't want the survey_topic_query at the begining because we will reuse this variable later without this information
    query_section = f"{query_super_section} - {sections[i_section]['section_title']}"
    query_section = query_section[len(SURVEY_TOPIC_QUERY)+3:] if query_section.startswith(SURVEY_TOPIC_QUERY) else query_section
    docs_to_use_as_ref = vector_db_all_documents.similarity_search(f"{SURVEY_TOPIC_QUERY} - {query_section}", SURVEY_STRUCTURE['max_papers_per_section_as_ref'])
    sections[i_section]['papers'] = [doc.metadata for doc in docs_to_use_as_ref]

    # Save and print if debug=True
    if (DEBUG):
      sections[i_section]['result_for_section'] = result_for_section
      print(f'Results for section {i_section}')
      print(result_for_section)
      print('.'*100)

    # Now, populate each subsection
    if (len(sections[i_section]['subsections']) > 0):
      if (DEBUG):
        print('Extracting subsections...')
      # Extract the top docs of the query to extract the subsections:
      docs_in_section = vector_db_all_documents.similarity_search(query_section, SURVEY_STRUCTURE['n_papers_to_cluster_in_subsections'])
      # Todos os embeddings já foram calculados previamente e estão na variável all_documents_embeddings
      # Daria pra pegar diretamente deles se tivéssemos os índices de docs_in_section.
      # É possível também simplesmente varrer o all_documents buscando os documentos iguais ao dos docs_in_section
      # e pegar o embedding correspondente. Mas pra facilitar o código e como isso aqui é rápido, vamos só calcular novamente mesmo
      docs_embeddings_in_section = embeddings.embed_documents([doc.page_content for doc in docs_in_section])

      populate_sections_with_suggested_titles(sections[i_section]['subsections'], docs_in_section, docs_embeddings_in_section, query_section, random_state)

In [123]:
%%time
sections = []

populate_with_empty_sections(sections, SURVEY_STRUCTURE['n_sections'])
populate_sections_with_suggested_titles(sections=sections,
                                        super_sections_docs=all_documents,
                                        super_sections_docs_embeddings=all_documents_embeddings,
                                        query_super_section=SURVEY_TOPIC_QUERY)

Cluster 4: 496
Cluster 2: 836
Cluster 0: 562
Cluster 1: 496
Cluster 3: 839
Extracting 5 sections in this level:


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a renowned scientist who is writing a survey on 'text neural information retrieval'. You are currently writing a section about 'text neural information retrieval'
Human: I will send you a list of title and abstract of scientific articles. Most of them cover a specific topic about the section 'text neural information retrieval'.
Your task is to find out what this topic is and suggest a good title for a section in a scientific survey that addresses it.
You should also explain your reasoning.
Your answer should be a valid RFC8259 compliant JSON object with three properties:
The first property, called "topic", describes the main topic that is a subset of 'text neural information retrieval' that are present in most abstracts.
The second property, called "title", is the title of the section tha

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Cluster 0: 12
Cluster 2: 12
Cluster 1: 26
Extracting 3 sections in this level:


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a renowned scientist who is writing a survey on 'text neural information retrieval'. You are currently writing a section about 'Semantic Fusion Strategies for Text Neural Information Retrieval'
Human: I will send you a list of title and abstract of scientific articles. Most of them cover a specific topic about the section 'Semantic Fusion Strategies for Text Neural Information Retrieval'.
Your task is to find out what this topic is and suggest a good title for a section in a scientific survey that addresses it.
You should also explain your reasoning.
Your answer should be a valid RFC8259 compliant JSON object with three properties:
The first property, called "topic", describes the main topic that is a subset of 'Semantic Fusion Strategies for Text Neural Information Retrieval' that are present in most abstracts.
The seco

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Cluster 2: 12
Cluster 1: 29
Cluster 0: 9
Extracting 3 sections in this level:


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a renowned scientist who is writing a survey on 'text neural information retrieval'. You are currently writing a section about 'Multi-modal Retrieval in Text Neural Information Retrieval'
Human: I will send you a list of title and abstract of scientific articles. Most of them cover a specific topic about the section 'Multi-modal Retrieval in Text Neural Information Retrieval'.
Your task is to find out what this topic is and suggest a good title for a section in a scientific survey that addresses it.
You should also explain your reasoning.
Your answer should be a valid RFC8259 compliant JSON object with three properties:
The first property, called "topic", describes the main topic that is a subset of 'Multi-modal Retrieval in Text Neural Information Retrieval' that are present in most abstracts.
The second property, called

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Cluster 2: 22
Cluster 1: 17
Cluster 0: 11
Extracting 3 sections in this level:


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a renowned scientist who is writing a survey on 'text neural information retrieval'. You are currently writing a section about 'Advancements in Neural Information Retrieval for Text Analysis'
Human: I will send you a list of title and abstract of scientific articles. Most of them cover a specific topic about the section 'Advancements in Neural Information Retrieval for Text Analysis'.
Your task is to find out what this topic is and suggest a good title for a section in a scientific survey that addresses it.
You should also explain your reasoning.
Your answer should be a valid RFC8259 compliant JSON object with three properties:
The first property, called "topic", describes the main topic that is a subset of 'Advancements in Neural Information Retrieval for Text Analysis' that are present in most abstracts.
The second pro

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Cluster 1: 9
Cluster 2: 10
Cluster 0: 31
Extracting 3 sections in this level:


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a renowned scientist who is writing a survey on 'text neural information retrieval'. You are currently writing a section about 'Advancements in Text-based Neural Information Retrieval'
Human: I will send you a list of title and abstract of scientific articles. Most of them cover a specific topic about the section 'Advancements in Text-based Neural Information Retrieval'.
Your task is to find out what this topic is and suggest a good title for a section in a scientific survey that addresses it.
You should also explain your reasoning.
Your answer should be a valid RFC8259 compliant JSON object with three properties:
The first property, called "topic", describes the main topic that is a subset of 'Advancements in Text-based Neural Information Retrieval' that are present in most abstracts.
The second property, called "title",

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Cluster 0: 33
Cluster 2: 7
Cluster 1: 10
Extracting 3 sections in this level:


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a renowned scientist who is writing a survey on 'text neural information retrieval'. You are currently writing a section about 'Deep Learning Approaches for Text Classification in Scientific Surveys'
Human: I will send you a list of title and abstract of scientific articles. Most of them cover a specific topic about the section 'Deep Learning Approaches for Text Classification in Scientific Surveys'.
Your task is to find out what this topic is and suggest a good title for a section in a scientific survey that addresses it.
You should also explain your reasoning.
Your answer should be a valid RFC8259 compliant JSON object with three properties:
The first property, called "topic", describes the main topic that is a subset of 'Deep Learning Approaches for Text Classification in Scientific Surveys' that are present in most ab

In [127]:
# Pretty print
def print_sections(sections, level=0):
  for section in sections:
    section_title = '\t'*(level) + section['section_title']
    print(section_title)
    if len(section['subsections']) > 0:
      print_sections(section['subsections'], level+1)

print_sections(sections)

Semantic Fusion Strategies for Text Neural Information Retrieval
	Deep Learning Approaches for Misinformation Detection in Text
	Graph Neural Networks for Text Classification
	Enhancing Text Classification with Semantic Fusion Strategies
Multi-modal Retrieval in Text Neural Information Retrieval
	Enhancing Text Neural Information Retrieval with Named Entities
	Enhancing Cross-Modal Retrieval in Text Neural Information Retrieval
	Pattern Separation and Completion in Memory Retrieval
Advancements in Neural Information Retrieval for Text Analysis
	Advancements in Neural Networks for Text Classification
	Advancements in Sentiment Analysis for Social Media Text
	Advancements in Deep Learning for Text Classification
Advancements in Text-based Neural Information Retrieval
	Advancements in Knowledge-infused Attention Mechanism for Text Classification
	Advancements in Text-to-Image Synthesis: Generating Images from Text
	Advancements in Neural Networks for Text Classification
Deep Learning Appr

In [128]:
save_file(sections, f'sections-{SURVEY_TOPIC_QUERY}-{SURVEY_FILTERS["year"]}-{PARAMS["gpt_model_name"]}.pkl')

In [126]:
sections[0]['result_for_section']

{'topic': 'Semantic Information Fusion for Text Neural Information Retrieval',
 'title': 'Semantic Fusion Strategies for Text Neural Information Retrieval',
 'reasoning': "Based on the abstracts provided, the common theme among the articles is the fusion of different sources of information to improve text neural information retrieval. This includes the fusion of Japanese text and voice recognition, visual saliency sharing for 3D model recognition, deep fusion CNN for image retrieval, hierarchical attentive heterogeneous graph network for extractive summarization, self-supervised spatial recurrent network for content-based image retrieval, contextualized query embeddings for conversational search, leveraging equivariant features for absolute pose regression, semantically enriched word embeddings for text classification, cognitive reasoning for complex information retrieval, and recurrent neural ordinary differential equations for social media text classification. Therefore, the main top