In [1]:
from PubMed.PmcScraper import PmcScraper
import os
import ast
import spacy
import openai

# spaCy pipeline whose named entities are inspected in the cells below.
nlp = spacy.load("en_core_web_trf")

# The NCBI API key is read from the environment (KeyError if it is not set).
ncbi_api_key = os.environ['NCBI_API_KEY']
pmc_scraper = PmcScraper(ncbi_api_key)

# Keyword search on PMC, asking for a single article.
search_results = pmc_scraper.search_pmc(
    search_term="gut microbiota",
    number_of_articles=1,
)
pmc_ids = search_results['esearchresult']['idlist']

# Fetch the full text for the returned ids, then pull out the 'method' section.
full_text_articles = pmc_scraper.fetch_full_text(pmc_ids)
method_sections = pmc_scraper.extract_section(full_text_articles, 'method')

# Pair every article id with its extracted section.
method_dict = {pmc_id: section for pmc_id, section in zip(pmc_ids, method_sections)}

In [2]:
# Display the mapping of article id -> extracted 'method' section.
method_dict

{'11198919': 'Methods:This was a cross-sectional multicenter study enrolling children aged 0–48 months, attending pediatric clinics. Questionnaires evaluated the clinical history, symptoms, and sociodemographic information. FGIDs were defined according to Rome IV criteria. PATIENTS AND METHODS Study design We performed a prospective, cross-sectional, multicenter study to assess the hospital-based prevalence of FGIDs in five regions of Saudi Arabia (Center, North, East, West, and South) including infants and toddlers, aged 0–4 years. Recruitment was conducted in general pediatric clinics located in six cities: Jeddah (King Abdulaziz Medical City and Soliman Fakeeh Hospital), Riyadh (King Abdulaziz Medical City and King Faisal Hospital and Research Center, and King Saud University Medical City), Tabouk (Tabuk Armed Forces Hospital), Al-Madinah Al-Munawarah (Maternal Children Hospital), Khamis Mushait (Maternal Children Hospital), and Dammam (Maternal Children Hospital). We recruited the 

In [3]:
# For each article, run the spaCy pipeline over its method text and keep the
# surface text of every entity labelled PRODUCT (duplicates are retained).
entity_dict = {
    pmc_id: [ent.text for ent in nlp(section_text).ents if ent.label_ == 'PRODUCT']
    for pmc_id, section_text in method_dict.items()
}

# Article Number 11189522

## Extracting Products

In [3]:
# Work on one specific article, addressed directly by its id.
target_pmcids = ['11189522']

# Download its full text and keep only the 'method' section.
# Note: this rebinds full_text_articles / method_sections from the first cell.
full_text_articles = pmc_scraper.fetch_full_text(target_pmcids)
method_sections = pmc_scraper.extract_section(full_text_articles, 'method')

In [4]:
# Display the extracted 'method' section(s) for the selected article.
method_sections

['Materials and methods Materials All chemicals were obtained from Sigma (St Louis, MO, USA) unless otherwise stated. 2’-fucosyllactose (2’FL) and 3-fucosyllactose (3FL) were obtained from Glycom/DSM (Esbjerg, Denmark). Blood group A type II (BgA), Blood group B type II (BgB), Blood group H type II (BgH) and LewisY (LeY) were obtained from Elicityl (Crolles, France). Lewis A trisaccharide (LeA), 3′-sialyl Lewis A (sLeA), Lewis X trisaccharide (LeX), 3’-sialyl Lewis X (sLeX), 2-acetamido-2-deoxy-6-O-(α-l-fucopyranosyl)-d-glucopyranose (6FN), 2-acetamido-2-deoxy-4-O-(α-l-fucopyranosyl)-d-glucopyranose (4FN), 2-acetamido-2-deoxy-3-O-(α-l-fucopyranosyl)-d-glucopyranose (3FN), 4-nitrophenyl α-l-fucopyranoside (pNP-Fuc), 2-Chloro-4-nitrophenyl-αl-fucopyranoside (CNP-Fuc), 2-Chloro-4-nitrophenol (CNP) and N-acetyllactosamine (LacNAc) were obtained from Biosynth Ltd (Compton, UK). FA2G2 N-glycan was from Ludger (Oxford, UK). IgG was purified from human serum using the protein A IgG purificatio

In [5]:
# Run NER over the first (only) method section.
doc = nlp(method_sections[0])

# Collect PRODUCT entity texts; dict.fromkeys drops repeats while keeping
# the order in which each text was first seen.
products = list(dict.fromkeys(
    ent.text for ent in doc.ents if ent.label_ == 'PRODUCT'
))

In [6]:
# Display the de-duplicated PRODUCT entity texts found by spaCy.
products

['FA2G2 N-glycan',
 'R. gnavus E1',
 'R. gnavus',
 'GH29',
 'the Enzyme Function Initiative-Enzyme Similarity Tool',
 'Cytoscape 3.9.1',
 'GH29BERT',
 'Transformer',
 'A100 40GB',
 'ProtT5 model29',
 'Uniform Manifold Approximation and Projection',
 'Python',
 'NuPAGE Novex',
 'Bis-Tris',
 'NanoDrop',
 'FLUOstar Omega',
 'HPAEC-PAD',
 'ICS 5000',
 'CarboPac PA1',
 'Savant SpeedVac',
 'a LudgerClean Procainamide Plate',
 'LC-PROC-96',
 'Ultimate 3000',
 'Amazon Speed ETD',
 'an Expression Compact Mass Spectrometer',
 'Plate Express',
 'Advion Mass Express',
 'Bruker Avance',
 'Fucose',
 '6FN',
 '3FN',
 '4FN',
 'Bruker Avance III',
 'FA2G2',
 'xia279',
 'dials80',
 '1ODU',
 'Chainsaw81',
 'ArpWarp82',
 'coot83',
 'refmac84',
 'PDBredo85']

### Open Source LLMs

In [7]:
# Single-message prompt template for the open-source models: task description,
# one worked example of the expected list-style answer, then the question.
# The lone `{}` placeholder is filled with the candidate list via str.format.
instruct_string = """You are an expert Physical Commercial Product Recognition system.
Your task is to accept a list of strings as input and decide which of these are physical commercial products.
Here is an example of the output format for a list of strings.

List of strings: ["Python", "3M Littmann Stethoscope", "Omron Blood Pressure Monitor", "Microsoft Office", "Braun ThermoScan", "SPSS"]
Answer: ["3M Littmann Stethoscope", "Omron Blood Pressure Monitor", "Braun ThermoScan"]

Only use this output format. Do not output anything besides this output format.
Output physical commercial products in the order they occur in the input list of strings.

Q: Given the list of strings below, identify the physical commercial products.

List of strings: {}
Answer:
"""

In [12]:
import ollama

# Single user turn: the prompt template filled with the candidate entity list.
llama3_messages = [{'role': 'user', 'content': instruct_string.format(products)}]

# Ask llama3 for a streamed reply.
stream = ollama.chat(model='llama3', messages=llama3_messages, stream=True)

# Echo each streamed fragment as it arrives, without inserting newlines.
for fragment in stream:
    print(fragment['message']['content'], end='', flush=True)

['NuPAGE Novex', 'Bis-Tris', 'NanoDrop', 'FLUOstar Omega', 'HPAEC-PAD', 'ICS 5000', 'CarboPac PA1', 'Savant SpeedVac', 'Ultimate 3000', 'Bruker Avance III', 'Chainsaw81']

In [13]:
# Same prompt as the llama3 run, sent to mistral for comparison.
mistral_messages = [{'role': 'user', 'content': instruct_string.format(products)}]

# Ask mistral for a streamed reply.
stream = ollama.chat(model='mistral', messages=mistral_messages, stream=True)

# Echo each streamed fragment as it arrives, without inserting newlines.
for fragment in stream:
    print(fragment['message']['content'], end='', flush=True)

 ['NuPAGE Novex', 'NanoDrop', 'FLUOstar Omega', 'ICS 5000', 'CarboPac PA1', 'Savant SpeedVac', 'a LudgerClean Procainamide Plate', 'LC-PROC-96', 'Ultimate 3000', 'Amazon Speed ETD', 'an Expression Compact Mass Spectrometer', 'Plate Express', 'Advion Mass Express', 'Bruker Avance', 'Fucose']

### OpenAI

In [7]:
# Take the OpenAI key from the environment (KeyError if OPENAI_API_KEY is unset)
# and set it on the openai module; the key is never hard-coded in the notebook.
openai.api_key = os.environ['OPENAI_API_KEY']

In [8]:
# System message: describes the product-recognition task and fixes the answer
# format (a list of strings) with one worked example.
system_prompt = """You are an expert Physical Commercial Product Recognition system.
Your task is to accept a list of strings as input and decide which of these are physical commercial products.
Here is an example of the output format for a list of strings.

List of strings: ["Python", "3M Littmann Stethoscope", "Omron Blood Pressure Monitor", "Microsoft Office", "Braun ThermoScan", "SPSS"]
Answer: ["3M Littmann Stethoscope", "Omron Blood Pressure Monitor", "Braun ThermoScan"]

Only use this output format. Do not output anything besides this output format.
Output physical commercial products in the order they occur in the input list of strings.
"""

# User message template; the `{}` placeholder is filled with the candidate
# list via str.format when the request is built.
user_prompt = """Q: Given the list of strings below, identify the physical commercial products.

List of strings: {}
Answer:
"""

In [9]:
# Conversation: fixed system instructions plus the user template filled with
# the spaCy PRODUCT candidates.
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt.format(products)},
]

# temperature is a vanishingly small positive value (1e-16), presumably to make
# sampling as close to greedy as possible; presence_penalty is left at 0.
response = openai.chat.completions.create(
    model="gpt-4-turbo",
    messages=chat_messages,
    temperature=1e-16,
    presence_penalty=0,
)

In [10]:
# Take the text of the first returned choice and show it unparsed.
first_choice = response.choices[0]
extracted_entities_string = first_choice.message.content
print(extracted_entities_string)

["A100 40GB", "NuPAGE Novex", "Bis-Tris", "NanoDrop", "FLUOstar Omega", "HPAEC-PAD", "ICS 5000", "CarboPac PA1", "Savant SpeedVac", "a LudgerClean Procainamide Plate", "LC-PROC-96", "Ultimate 3000", "Amazon Speed ETD", "an Expression Compact Mass Spectrometer", "Plate Express", "Advion Mass Express", "Bruker Avance", "Bruker Avance III"]


In [11]:
# The prompt asks the model to answer with a list literal, so the reply is
# parsed with ast.literal_eval, which only evaluates literals (no code execution).
# Surrounding whitespace is stripped first: model replies can start with a
# space or newline, and ast.literal_eval rejects leading whitespace on
# Python versions before 3.10.
extracted_entities = ast.literal_eval(extracted_entities_string.strip())
print(extracted_entities)

['A100 40GB', 'NuPAGE Novex', 'Bis-Tris', 'NanoDrop', 'FLUOstar Omega', 'HPAEC-PAD', 'ICS 5000', 'CarboPac PA1', 'Savant SpeedVac', 'a LudgerClean Procainamide Plate', 'LC-PROC-96', 'Ultimate 3000', 'Amazon Speed ETD', 'an Expression Compact Mass Spectrometer', 'Plate Express', 'Advion Mass Express', 'Bruker Avance', 'Bruker Avance III']


In [12]:
def remove_stopwords(input_string, stopwords=None):
    """Remove stopword tokens from a whitespace-separated string.

    Args:
        input_string: Text to clean. It is split on any whitespace.
        stopwords: Optional iterable of tokens to drop. Defaults to the
            English articles ``'a'``, ``'an'`` and ``'the'``. Pass a larger
            collection (for example NLTK's stopword list) if these are not
            enough.

    Returns:
        The remaining tokens joined by single spaces. Runs of whitespace in
        the input are therefore collapsed, and an empty or all-stopword input
        yields ``""``.

    Matching is exact and case-sensitive: ``'The'`` and ``'A100'`` are kept.
    """
    if stopwords is None:
        stopwords = ('a', 'an', 'the')
    # A set gives constant-time membership tests regardless of list size.
    stopword_set = frozenset(stopwords)
    return " ".join(
        token for token in input_string.split() if token not in stopword_set
    )

In [13]:
# Strip leading/embedded articles ("a", "an", "the") from every product name.
cleaned_entites = list(map(remove_stopwords, extracted_entities))
print(cleaned_entites)

['A100 40GB', 'NuPAGE Novex', 'Bis-Tris', 'NanoDrop', 'FLUOstar Omega', 'HPAEC-PAD', 'ICS 5000', 'CarboPac PA1', 'Savant SpeedVac', 'LudgerClean Procainamide Plate', 'LC-PROC-96', 'Ultimate 3000', 'Amazon Speed ETD', 'Expression Compact Mass Spectrometer', 'Plate Express', 'Advion Mass Express', 'Bruker Avance', 'Bruker Avance III']


## Extract Company/Brand Names from Product Entities

### GPT Approach

#### Extract Brand Names From Product Itself

In [15]:
# System message for the "name is inside the string" approach: the model should
# only pull out a company name that literally appears in each product string,
# answering with a same-length list that uses a bare None where there is none.
# Note: this rebinds system_prompt / user_prompt from the product-recognition cell.
system_prompt = """You are an expert Company Recognition system.
Your task is to accept a list of strings as input extract the company names from these strings.
If some of the strings do not contain a company name, return a null value for that string.
Here is an example of the output format for a list of strings.

List of strings: ["Sony PlayStation 5", "Bose QuietComfort 45 Headphones", "Flow Cytometer FlowFlex", "AMD Ryzen 9 5900X", "ThermoMixer", "Stainless Steel Tumbler"]
Answer: ["Sony", "Bose", None, "AMD", None, None]

Only use this output format. Do not output anything besides this output format.
Output company names in the order they occur in the input list of strings.
"""

# User message template; `{}` is filled with the cleaned product list via str.format.
user_prompt = """Q: Given the list of strings below, extract the company names.

List of strings: {}
Answer:
"""

In [16]:
# Conversation: company-extraction instructions plus the cleaned product names.
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt.format(cleaned_entites)},
]

# Same sampling settings as the product-recognition request: a vanishingly
# small temperature (1e-16) and no presence penalty.
response = openai.chat.completions.create(
    model="gpt-4-turbo",
    messages=chat_messages,
    temperature=1e-16,
    presence_penalty=0,
)

In [17]:
# Raw text of the first returned choice (still an unparsed string).
extracted_companies_string = response.choices[0].message.content

In [18]:
# Show the raw model answer before any parsing.
print(extracted_companies_string)

[None, None, None, None, None, None, None, None, None, None, None, None, "Amazon", None, None, None, "Bruker", "Bruker"]


#### Identify Brand Names Using Internal Knowledge

In [14]:
# System message for the "internal knowledge" approach: the model should name
# the manufacturer of each product even when the string itself is misleading,
# answering with a same-length list that uses a bare None when unclear.
# Note: this rebinds system_prompt / user_prompt once more.
# NOTE(review): the worked example contains "Amazon Speed ETD" -> "Bruker",
# which is also one of the strings in the evaluated product list, so that
# answer is given to the model rather than inferred by it.
system_prompt = """You are an expert Company Identification system. 
Your task is to take a list of strings as input and identify the company that manufactures each product mentioned.
Strings provided in the list can be misleading; use your internal knowledge to identify company names.
If the manufacturer is unclear for any product, return a null value for that string.
Here is an example of the output format for a list of strings.

List of strings: ["PlayStation 5", "iPhone 15 Plus", "Pencil", "Stainless Steel Tumbler", "Whopper", "Amazon Speed ETD"]
Answer: ["Sony", "Apple", None, None, "Burger King", "Bruker"]

Only use this output format. Do not output anything besides this output format.
Output the manufacturer names in the order they occur in the input list of strings.
"""

# User message template; `{}` is filled with the cleaned product list via str.format.
user_prompt = """Q: Given the list of strings below, identify the companies that manufactures each product.

List of strings: {}
Answer:
"""

In [15]:
# Conversation: manufacturer-identification instructions plus the cleaned
# product names.
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt.format(cleaned_entites)},
]

# Same sampling settings as the earlier requests: a vanishingly small
# temperature (1e-16) and no presence penalty.
response = openai.chat.completions.create(
    model="gpt-4-turbo",
    messages=chat_messages,
    temperature=1e-16,
    presence_penalty=0,
)

In [16]:
# Raw text of the first returned choice (still an unparsed string).
extracted_companies_string = response.choices[0].message.content

In [17]:
# Parse the model's list-literal answer. ast.literal_eval only evaluates
# literals (including the bare None used for "unknown"). Surrounding whitespace
# is stripped first because model replies can start with a space or newline,
# which ast.literal_eval rejects on Python versions before 3.10.
extracted_companies = ast.literal_eval(extracted_companies_string.strip())

# The answer is paired positionally with cleaned_entites in the next cell;
# zip() would silently truncate on a length mismatch and misalign the pairs,
# so fail loudly here instead.
assert len(extracted_companies) == len(cleaned_entites), (
    f"model returned {len(extracted_companies)} companies "
    f"for {len(cleaned_entites)} products"
)
print(extracted_companies)

['Amazon', 'Thermo Fisher Scientific', 'Thermo Fisher Scientific', 'Thermo Fisher Scientific', 'BMG Labtech', 'Thermo Fisher Scientific', 'Thermo Fisher Scientific', 'Thermo Fisher Scientific', 'Thermo Fisher Scientific', 'Ludger', 'Ludger', 'Thermo Fisher Scientific', 'Bruker', 'Advion', 'Advion', 'Advion', 'Bruker', 'Bruker']


In [18]:
# Pair each cleaned product name with the manufacturer at the same position.
{product: company for product, company in zip(cleaned_entites, extracted_companies)}

{'A100 40GB': 'Amazon',
 'NuPAGE Novex': 'Thermo Fisher Scientific',
 'Bis-Tris': 'Thermo Fisher Scientific',
 'NanoDrop': 'Thermo Fisher Scientific',
 'FLUOstar Omega': 'BMG Labtech',
 'HPAEC-PAD': 'Thermo Fisher Scientific',
 'ICS 5000': 'Thermo Fisher Scientific',
 'CarboPac PA1': 'Thermo Fisher Scientific',
 'Savant SpeedVac': 'Thermo Fisher Scientific',
 'LudgerClean Procainamide Plate': 'Ludger',
 'LC-PROC-96': 'Ludger',
 'Ultimate 3000': 'Thermo Fisher Scientific',
 'Amazon Speed ETD': 'Bruker',
 'Expression Compact Mass Spectrometer': 'Advion',
 'Plate Express': 'Advion',
 'Advion Mass Express': 'Advion',
 'Bruker Avance': 'Bruker',
 'Bruker Avance III': 'Bruker'}