In [57]:
# Load the instruction text for the extractor from prompt.md (UTF-8).
with open('prompt.md', 'r', encoding='utf-8') as prompt_file:
    prompt = prompt_file.read()

prompt

'You are a highly precise information extractor designed to identify key research entities within an essay abstract.  You are very strict; you will absolutely not associate or produce information that is not in the original text. Adhere strictly to the following guidelines:\n\n# Core Principles:\n\n* Focus Exclusively on the Abstract: Base your analysis solely on the provided abstract text.\n\n* Identify Research Entities: Pinpoint the distinct concepts, methods, variables, or objects that are central to the research investigation.\n\n* Eliminate Duplicates: Ensure each identified entity is unique within the output.\n\n* Resolve Ambiguity: If an entity\'s meaning is unclear, deduce its most likely interpretation based on the context.\n\n* Simplify Complex Terminology:  If technical jargon is present, strive to express the entity in more accessible language.\n\n* Highlight Novel Paradigms: If the abstract proposes a new model, framework, or approach, explicitly include it in the output.

In [61]:
import ollama
import json
import openai
def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown ``` fence, if it has one."""
    stripped = text.strip()
    if len(stripped) >= 6 and stripped.startswith('```') and stripped.endswith('```'):
        inner = stripped[3:-3]
        # The opening fence may carry a language tag (e.g. "json") on its own line.
        tag, newline, rest = inner.partition('\n')
        if newline and (not tag.strip() or tag.strip().isalpha()):
            inner = rest
        return inner.strip()
    return stripped


def extract_entity(abstract: str, model: str = 'llama3.1'):
    """Ask a local Ollama chat model to extract research entities from an abstract.

    The module-level ``prompt`` string is concatenated with ``abstract`` and
    sent as a single user message with temperature 0.

    Parameters
    ----------
    abstract : str
        Abstract text appended to the instruction prompt.
    model : str, optional
        Name of the Ollama model to query. Defaults to ``'llama3.1'``.

    Returns
    -------
    object
        The value decoded from the model's JSON reply, or the string
        ``"Not JSON format"`` when the reply cannot be decoded as JSON.
    """
    # NOTE: an OpenAI-backed variant (openai.ChatCompletion.create with the same
    # single user message and temperature=0) was prototyped here earlier; only
    # the Ollama path is executed.
    response = ollama.chat(
        model=model,
        options={"temperature": 0},
        messages=[{'role': 'user', 'content': prompt + abstract}],
    )
    content = response['message']['content']

    # First try the reply exactly as received, so replies that already parse
    # keep producing the same result.
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        pass

    # Chat models often wrap JSON in a Markdown code fence; retry without it.
    try:
        return json.loads(_strip_code_fence(content))
    except json.JSONDecodeError:
        return "Not JSON format"


In [62]:
# Sample abstract for a smoke test of the extractor. Adjacent string literals
# are joined by Python at compile time; the text (including its original
# hyphenation artefacts and typos) is kept verbatim.
test_abstract = (
    'Abstract:This paper presents the second part of the two-part survey series on decomposition-based evolutionary multi-objective optimization where we mainly focus on discussing the literature re-lated to multi-objective evolutionary algorithms based on decomposition (MOEA/D). '
    'Complementary to the first part, here we employ a series of advanced data mining approaches to provide a compre-hensive anatomy of the enormous landscape of MOEA/D research, which is far beyond the capacity of classic manual literature review protocol. '
    'In doing so, we construct a heterogeneous knowledge graph that encapsulates more than 5, 400 papers, 10, 000 authors, 400 venues, and 1, 600 institutions for MOEA/D research. '
    'We start our analysis with basic descriptive statistics. '
    'Then we delve into prominent research/application topics pertaining to MOEA/D with state-of-the-art topic modeling techniques and interrogate their sptial-temporal and bilateral relationships. '
    'We also explored the col-laboration and citation networks of MOEA/D, uncovering hidden patterns in the growth of literature as well as collaboration between researchers. '
    'Our data mining results here, combined with the ex-pert review in Part I1, together offer a holistic view of the MOEA/D research, and demonstrate the potential of an exciting new paradigm for conducting scientific surveys from a data science perspective. '
    'Keywords: Multi-objective optimization, decomposition, data mining, topic modeling, network analysis, data visualization.'
)
extract_entity(test_abstract)

{'entity1': 'decomposition-based evolutionary multi-objective optimization',
 'entity2': 'MOEA/D',
 'entity3': 'multi-objective evolutionary algorithms',
 'entity4': 'data mining approaches',
 'entity5': 'knowledge graph',
 'entity6': 'topic modeling techniques',
 'entity7': 'citation networks',
 'entity8': 'decomposition'}

In [49]:
import pandas as pd
# Work on the first 100 rows only. The explicit .copy() makes `test` an
# independent frame rather than a slice of the full table, so adding a column
# to it later does not trigger pandas' SettingWithCopyWarning.
test = pd.read_csv('source_document.csv')[:100].copy()


In [50]:
# Row count of the working sample.
test.shape[0]

100

In [51]:
# Call the extractor once per row, in row order, on the column at position 1,
# then store the results next to the source rows.
entities = [extract_entity(abstract_text) for abstract_text in test.iloc[:, 1]]
test['entity'] = entities
test
    

Unnamed: 0,Title,Abstract,entity
0,End-to-end attention-based large vocabulary sp...,Many state-of-the-art Large Vocabulary Continu...,{'entity1': 'Large Vocabulary Continuous Speec...
1,Deep contextualized word representations,We introduce a new type of deep contextualized...,{'entity1': 'deep contextualized word represen...
2,End-to-end memory networks,We introduce a neural network with a recurrent...,"{'entity1': 'neural network', 'entity2': 'recu..."
3,DeepTox: Toxicity prediction using deep learning,The Tox21 Data Challenge has been the largest ...,"{'entity1': 'Tox21 Data Challenge', 'entity2':..."
4,Language models as knowledge bases?,Recent progress in pretraining language models...,"{'entity1': 'pretraining language models', 'en..."
...,...,...,...
95,A state-of-the-art survey on deep learning the...,"In recent years, deep learning has garnered tr...","{'entity1': 'Deep Learning', 'entity2': 'Machi..."
96,“So what if ChatGPT wrote it?” Multidisciplina...,"Transformative artificially intelligent tools,...","{'entity1': 'ChatGPT', 'entity2': 'generative ..."
97,Sequence-level knowledge distillation,Neural machine translation (NMT) offers a nove...,{'entity1': 'Neural machine translation (NMT)'...
98,Transformers in Vision: A Survey,Astounding results from Transformer models on ...,"{'entity1': 'Transformer models', 'entity2': '..."


In [52]:
# Show each title next to its extracted entities.
test[['Title', 'entity']]

Unnamed: 0,Title,entity
0,End-to-end attention-based large vocabulary sp...,{'entity1': 'Large Vocabulary Continuous Speec...
1,Deep contextualized word representations,{'entity1': 'deep contextualized word represen...
2,End-to-end memory networks,"{'entity1': 'neural network', 'entity2': 'recu..."
3,DeepTox: Toxicity prediction using deep learning,"{'entity1': 'Tox21 Data Challenge', 'entity2':..."
4,Language models as knowledge bases?,"{'entity1': 'pretraining language models', 'en..."
...,...,...
95,A state-of-the-art survey on deep learning the...,"{'entity1': 'Deep Learning', 'entity2': 'Machi..."
96,“So what if ChatGPT wrote it?” Multidisciplina...,"{'entity1': 'ChatGPT', 'entity2': 'generative ..."
97,Sequence-level knowledge distillation,{'entity1': 'Neural machine translation (NMT)'...
98,Transformers in Vision: A Survey,"{'entity1': 'Transformer models', 'entity2': '..."


In [53]:
# First row, first column.
test.iat[0, 0]

'End-to-end attention-based large vocabulary speech recognition'

In [54]:
# First row, third column.
test.iat[0, 2]

{'entity1': 'Large Vocabulary Continuous Speech Recognition (LVCSR) Systems',
 'entity2': 'neural networks',
 'entity3': 'Hidden Markov Models (HMMs)',
 'entity4': 'Connectionist Temporal Classification modules',
 'entity5': 'attention mechanism',
 'entity6': 'Recurrent Neural Network (RNN)',
 'entity7': 'n-gram language model',
 'entity8': 'Wall Street Journal corpus'}

In [55]:
# Row at position 98, first column.
test.iat[98, 0]

'Transformers in Vision: A Survey'

In [56]:
# Row at position 98, third column.
test.iat[98, 2]

{'entity1': 'Transformer models',
 'entity2': 'computer vision problems',
 'entity3': 'Long short-term memory',
 'entity4': 'convolutional networks',
 'entity5': 'self-attention',
 'entity6': 'large-scale pre-training',
 'entity7': 'bidirectional feature encoding',
 'entity8': 'Transformer models in computer vision discipline'}