# Extract Method Section

In [1]:
from PubMed.PmcScraper import PmcScraper
import os
import ast
import spacy
import openai

nlp = spacy.load("en_core_web_trf")

# need .env file to store ncbi api key
pmc_scraper = PmcScraper(os.environ['NCBI_API_KEY'])

# search based on keyword and retrieve the desired amount of articles
search_results = pmc_scraper.search_pmc(search_term="gut microbiota", number_of_articles=30)

# extract the full text version of articles using pmcids
full_text_articles = pmc_scraper.fetch_full_text(search_results['esearchresult']['idlist'])

# extract the desired section of articles
method_sections = pmc_scraper.extract_section(full_text_articles, 'method')

# print as dictionary
method_dict = dict(zip(search_results['esearchresult']['idlist'], method_sections))

In [2]:
method_dict

{'11198919': 'Methods:This was a cross-sectional multicenter study enrolling children aged 0–48 months, attending pediatric clinics. Questionnaires evaluated the clinical history, symptoms, and sociodemographic information. FGIDs were defined according to Rome IV criteria. PATIENTS AND METHODS Study design We performed a prospective, cross-sectional, multicenter study to assess the hospital-based prevalence of FGIDs in five regions of Saudi Arabia (Center, North, East, West, and South) including infants and toddlers, aged 0–4 years. Recruitment was conducted in general pediatric clinics located in six cities: Jeddah (King Abdulaziz Medical City and Soliman Fakeeh Hospital), Riyadh (King Abdulaziz Medical City and King Faisal Hospital and Research Center, and King Saud University Medical City), Tabouk (Tabuk Armed Forces Hospital), Al-Madinah Al-Munawarah (Maternal Children Hospital), Khamis Mushait (Maternal Children Hospital), and Dammam (Maternal Children Hospital). We recruited the 

# Identify Product Entities

In [3]:
entity_dict = {}
for key, value in method_dict.items():
    if value:
        doc = nlp(value)
        entities = []
        for ent in doc.ents:
            if ent.label_ == 'PRODUCT' and ent.text not in entities:
                entities.append(ent.text)
        entity_dict[key] = entities

In [4]:
entity_dict

{'11198919': ['Rome IV',
  'the Statistical Package for the Social Sciences',
  'SPSS',
  'Windows'],
 '11198832': ['Western Blot',
  'SCIEX',
  '2720',
  'FLX800',
  'Nanodrop UV',
  'NC2000',
  'Sequencer',
  'Novaseq6000',
  'Quant-iT PicoGreen dsDNA Assay Kit',
  'First Strand',
  'SYBR Green',
  'Servicebio',
  'GAPDH',
  'The Odyssey CLx Imaging System'],
 '11198784': ['Ficoll-Paque Premium',
  'TC20 Automated cell counter',
  'an EasySep Negative Human NK Cell Isolation Kit',
  'Micro Forge MF-900',
  'Axopatch 200B',
  'PregS',
  'Ononetin',
  'GraphPad Prism v9',
  'GraphPad Prism version'],
 '11198029': ['Embase'],
 '11197919': ['BioRender',
  'RS4',
  'a Tanita SC-240 Total Body Composition Analyzer',
  'the Automated Self-Administered 24-hour',
  'ASA24',
  'the QIAamp 96 DNA Blood Kit',
  'the QIAamp Investigator Kit',
  'iTaq™ Universal SYBR® Green Supermix',
  'NA12286',
  'AMY1',
  'a QX100 Droplet Digital PCR System',
  'SALIMETRICS',
  'the DNeasy PowerSoil 96 HTP Kit

# Filtering Products with LLM

In [18]:
openai.api_key = os.environ['OPENAI_API_KEY']

In [25]:
system_prompt = """You are an expert Physical Commercial Product Recognition system.
Your task is to accept a list of strings as input and extract the physical commercial products.
Here is an example of the output format for a list of strings.

List of strings: ["Python", "Researchers used 3M Littmann Stethoscope", "Omron Blood Pressure Monitor is important", "Microsoft Office", "Braun ThermoScan", "SPSS"]
Answer: ["3M Littmann Stethoscope", "Omron Blood Pressure Monitor", "Braun ThermoScan"]

Only use this output format. Do not return anything besides this output format.
Output physical commercial products in the order they occur in the input list of strings.
"""

user_prompt = """Q: Given the list of strings below, identify the physical commercial products.

List of strings: {}
Answer:
"""

In [26]:
def product_filtering(list_of_products):
    response = openai.chat.completions.create(
        model="gpt-4-turbo",
        presence_penalty=0,
        # top_p=1e-16,
        temperature=1e-16,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt.format(list_of_products)}
        ]
    )
    return response.choices[0].message.content

In [27]:
filtered_entity_dict = {}
syntax_error = False
for key, value in entity_dict.items():
    if value:
        filtered_products = product_filtering(value)
        try:
            filtered_products = ast.literal_eval(filtered_products)
        except SyntaxError:
            syntax_error = True
            print(f"Syntax Error with ID {key}")
        if not syntax_error:
            filtered_entity_dict[key] = filtered_products

11198919: []
11198832: ["Nanodrop UV", "Novaseq6000", "Quant-iT PicoGreen dsDNA Assay Kit", "The Odyssey CLx Imaging System"]
11198784: ["Ficoll-Paque Premium", "TC20 Automated cell counter", "EasySep Negative Human NK Cell Isolation Kit", "Micro Forge MF-900", "Axopatch 200B"]
11198029: []
11197919: ["Tanita SC-240 Total Body Composition Analyzer", "QIAamp 96 DNA Blood Kit", "QIAamp Investigator Kit", "QX100 Droplet Digital PCR System", "DNeasy PowerSoil 96 HTP Kit", "BioSpec 1001 Mini-Beadbeater-96"]
11197476: ["Malvern Nano ZS ZEN3600"]
11197378: []
11197282: []
11197204: ["TRIzol", "Easy Oligos", "iTaq Universal One-Step", "iTaq™ Universal SYBR® Green One-Step"]
11197185: ["MiSeq", "Hemotek", "Trizol", "AccuStart II", "MiSeq 500"]
11196966: ["Isoflurane Piramal", "the SomnoSuite® Low-Flow Anesthesia System", "the Liver Dissociation Kit", "the Tumor Dissociation Kit", "the RNeasy Mini Kit", "ProtoScript II First Strand cDNA Synthesis Kit", "the DNeasy Blood & Tissue Kit", "the Quant

In [28]:
for key, value in filtered_entity_dict.items():
    if value:
        print(f"{key}: {value}")

11198832: ['Nanodrop UV', 'Novaseq6000', 'Quant-iT PicoGreen dsDNA Assay Kit', 'The Odyssey CLx Imaging System']
11198784: ['Ficoll-Paque Premium', 'TC20 Automated cell counter', 'EasySep Negative Human NK Cell Isolation Kit', 'Micro Forge MF-900', 'Axopatch 200B']
11197919: ['Tanita SC-240 Total Body Composition Analyzer', 'QIAamp 96 DNA Blood Kit', 'QIAamp Investigator Kit', 'QX100 Droplet Digital PCR System', 'DNeasy PowerSoil 96 HTP Kit', 'BioSpec 1001 Mini-Beadbeater-96']
11197476: ['Malvern Nano ZS ZEN3600']
11197204: ['TRIzol', 'Easy Oligos', 'iTaq Universal One-Step', 'iTaq™ Universal SYBR® Green One-Step']
11197185: ['MiSeq', 'Hemotek', 'Trizol', 'AccuStart II', 'MiSeq 500']
11196966: ['Isoflurane Piramal', 'the SomnoSuite® Low-Flow Anesthesia System', 'the Liver Dissociation Kit', 'the Tumor Dissociation Kit', 'the RNeasy Mini Kit', 'ProtoScript II First Strand cDNA Synthesis Kit', 'the DNeasy Blood & Tissue Kit', 'the QuantiFast SYBR® Green PCR Kit', 'a Pierce BCA-Kit', 'an 

In [22]:
for key, value in entity_dict.items():
    if value:
        print(f"{key}: {value}")

11198919: ['Rome IV', 'the Statistical Package for the Social Sciences', 'SPSS', 'Windows']
11198832: ['Western Blot', 'SCIEX', '2720', 'FLX800', 'Nanodrop UV', 'NC2000', 'Sequencer', 'Novaseq6000', 'Quant-iT PicoGreen dsDNA Assay Kit', 'First Strand', 'SYBR Green', 'Servicebio', 'GAPDH', 'The Odyssey CLx Imaging System']
11198784: ['Ficoll-Paque Premium', 'TC20 Automated cell counter', 'an EasySep Negative Human NK Cell Isolation Kit', 'Micro Forge MF-900', 'Axopatch 200B', 'PregS', 'Ononetin', 'GraphPad Prism v9', 'GraphPad Prism version']
11198029: ['Embase']
11197919: ['BioRender', 'RS4', 'a Tanita SC-240 Total Body Composition Analyzer', 'the Automated Self-Administered 24-hour', 'ASA24', 'the QIAamp 96 DNA Blood Kit', 'the QIAamp Investigator Kit', 'iTaq™ Universal SYBR® Green Supermix', 'NA12286', 'AMY1', 'a QX100 Droplet Digital PCR System', 'SALIMETRICS', 'the DNeasy PowerSoil 96 HTP Kit', 'Qiagen', 'a BioSpec 1001 Mini-Beadbeater-96 for 3 minutes', 'Mag-Bind TotalPure', 'Omeg

### Cleaning Product Entities

In [29]:
def remove_stopwords(input_string):
    # can add more stopwords here
    # use NLTK if these stopwords are not enough
    stopwords = ['a', 'an', 'the', 'of']
    tokens = input_string.split()
    token_list = []
    for token in tokens:
        if token not in stopwords:
            token_list.append(token)
    output_string = " ".join(token_list)
    return output_string

In [30]:
cleaned_entity_dict = {}
for key, value in filtered_entity_dict.items():
    if value:
        cleaned_entities = [remove_stopwords(entity) for entity in value]
        cleaned_entity_dict[key] = cleaned_entities

In [None]:
for key, value in cleaned_entity_dict.items():
    if value:
        print(f"{key}: {value}")

11198832: ['Nanodrop UV', 'Novaseq6000', 'Quant-iT PicoGreen dsDNA Assay Kit', 'The Odyssey CLx Imaging System']
11198784: ['Ficoll-Paque Premium', 'TC20 Automated cell counter', 'EasySep Negative Human NK Cell Isolation Kit', 'Micro Forge MF-900', 'Axopatch 200B']
11197919: ['Tanita SC-240 Total Body Composition Analyzer', 'QIAamp 96 DNA Blood Kit', 'QIAamp Investigator Kit', 'QX100 Droplet Digital PCR System', 'DNeasy PowerSoil 96 HTP Kit', 'BioSpec 1001 Mini-Beadbeater-96']
11197476: ['Malvern Nano ZS ZEN3600']
11197204: ['TRIzol', 'Easy Oligos', 'iTaq Universal One-Step', 'iTaq™ Universal SYBR® Green One-Step']
11197185: ['MiSeq', 'Hemotek', 'Trizol', 'AccuStart II', 'MiSeq 500']
11196966: ['Isoflurane Piramal', 'SomnoSuite® Low-Flow Anesthesia System', 'Liver Dissociation Kit', 'Tumor Dissociation Kit', 'RNeasy Mini Kit', 'ProtoScript II First Strand cDNA Synthesis Kit', 'DNeasy Blood & Tissue Kit', 'QuantiFast SYBR® Green PCR Kit', 'Pierce BCA-Kit', 'Azure Ao Microplate Reader', 

# Identify Products with LLM

In [3]:
system_prompt = """You are an expert Physical Commercial Product Recognition system.
Your task is to accept a text as input and extract the physical commercial products.
Here is an example of the output format for a list of strings.

Text: I started my day with a fresh cup of coffee brewed from my Keurig coffee maker, and then I prepared breakfast using my new KitchenAid stand mixer. After breakfast, I grabbed my Nike running shoes and went for a jog in the park. On my way back, I stopped by the store to pick up a pack of Tide laundry detergent and a bottle of Coca-Cola.
Answer: ["Keurig coffee maker", "KitchenAid stand mixer", "Nike running shoes", "Tide laundry detergent", "Coca-Cola"]

Only use this output format. Do not return anything besides this output format.
Output physical commercial products in the order they occur in the input text.
"""

user_prompt = """Q: Given the text below, identify the physical commercial products.

Text: {}
Answer:
"""

In [4]:
def product_recognition(input_text):
    response = openai.chat.completions.create(
        model="gpt-4-turbo",
        presence_penalty=0,
        # top_p=1e-16,
        temperature=1e-16,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt.format(input_text)}
        ]
    )
    return response.choices[0].message.content

In [5]:
product_dict = {}
syntax_error = False
for key, value in method_dict.items():
    if value:
        identified_products = product_recognition(value)
        try:
            identified_products = ast.literal_eval(identified_products)
        except SyntaxError:
            syntax_error = True
            print(f"Syntax Error with ID {key}")
        if not syntax_error:
            product_dict[key] = identified_products

In [6]:
for key, value in product_dict.items():
    if value:
        print(f"{key}: {value}")

11198832: ['high-performance liquid chromatography (HPLC) (Shimadzu, LC-20AD)', 'SCIEX mass spectrometer (MS) (AB SCIEX, API 4000+)', 'PCR amplification instrument (ABI, 2720)', 'enzyme labeling instrument (BioTek, FLX800T)', 'electrophoresis apparatus (Beijing Liuyi Instrument Factory, DYY-6C)', 'gel imaging system (Beijing Bijing Biotechnology Co., LTD., BG-gdsAUTO130)', 'Nanodrop UV quantitative system (Thermo Fisher Scientific, NC2000)', 'Sequencer (Illumina, Novaseq6000)', 'fluorescence quantitative PCR instrument (Bio-rad Corporation, CFX)', 'vancomycin hydrochloride for injection (VIANEX S.A.)', 'bifidobacterium quadruplex viable tablets (Hangzhou Yuanda Biological Pharmaceutical)', 'NBP capsules (Shiyao Group Enbipu Pharmaceutical)', 'NBP reference (Shiyao Group Enbipu Pharmaceutical)', 'glipizide reference (Sichuan Vicchi Biochemical Technology)', 'Soil DNA Kit (Omega Bio-Tek)', 'Quant-iT PicoGreen dsDNA Assay Kit (ABI)', 'First Strand cDNA Synthesis Kit (Servicebio)', 'SYBR G

# Extracting Company/Brand Names

In [7]:
system_prompt = """You are an expert Company Identification system. 
Your task is to take a list of strings as input and identify the company that manufactures each product mentioned.
Strings provided in the list can be misleading; use your internal knowledge to identify company names.
If the manufacturer is unclear for any product, return a null value for that string.
Here is an example of the output format for a list of strings.

List of strings: ["PlayStation 5", "iPhone 15 Plus", "Pencil", "Stainless Steel Tumbler", "Whopper", "Amazon Speed ETD"]
Answer: ["Sony", "Apple", None, None, "Burger King", "Bruker"]

Only use this output format. Do not output anything besides this output format.
Output the manufacturer names in the order they occur in the input list of strings.
"""

user_prompt = """Q: Given the list of strings below, identify the companies that manufactures each product.

List of strings: {}
Answer:
"""

In [8]:
def company_identification(list_of_products):
    response = openai.chat.completions.create(
        model="gpt-4-turbo",
        presence_penalty=0,
        # top_p=1e-16,
        temperature=1e-16,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt.format(list_of_products)}
        ]
    )
    return response.choices[0].message.content

In [9]:
company_dict = {}
syntax_error = False
for key, value in product_dict.items():
    if value:
        identified_companies = company_identification(value)
        try:
            identified_companies = ast.literal_eval(identified_companies)
        except SyntaxError:
            syntax_error = True
            print(f"Syntax Error with ID {key}")
        if not syntax_error:
            company_dict[key] = identified_companies

## LLM

In [10]:
for key, value in company_dict.items():
    if value:
        print(f"{key}: {value}")

11198832: ['Shimadzu', 'SCIEX', 'ABI', 'BioTek', 'Beijing Liuyi Instrument Factory', 'Beijing Bijing Biotechnology Co., LTD.', 'Thermo Fisher Scientific', 'Illumina', 'Bio-rad Corporation', 'VIANEX S.A.', 'Hangzhou Yuanda Biological Pharmaceutical', 'Shiyao Group Enbipu Pharmaceutical', 'Shiyao Group Enbipu Pharmaceutical', 'Sichuan Vicchi Biochemical Technology', 'Omega Bio-Tek', 'ABI', 'Servicebio', 'Servicebio']
11198784: ['Cytiva', None, 'Bio-Rad', 'STEMCELL Technologies', None, None, None, 'BD Biosciences', None, 'Sutter Instrument', 'Narishige', None, 'Molecular Devices', 'Molecular Devices', 'Molecular Devices', None, None, None]
11197919: ['Ingredion', 'Ingredion', 'Ingredion', 'Tanita', None, 'Salimetrics', 'Waters Corporation', 'Qiagen', None, None, 'Qiagen', 'Qiagen', 'Qiagen', None]
11197476: ['Bruker', 'Merck', 'Malvern Instruments', 'Thermo Fisher Scientific', 'Beckman Coulter', 'JEOL', 'Nikon', None, None, None, 'BioLegend']
11197382: ['Roche']
11197204: ['Thermo Fisher 

In [11]:
for key, value in product_dict.items():
    if value:
        print(f"{key}: {value}")

11198832: ['high-performance liquid chromatography (HPLC) (Shimadzu, LC-20AD)', 'SCIEX mass spectrometer (MS) (AB SCIEX, API 4000+)', 'PCR amplification instrument (ABI, 2720)', 'enzyme labeling instrument (BioTek, FLX800T)', 'electrophoresis apparatus (Beijing Liuyi Instrument Factory, DYY-6C)', 'gel imaging system (Beijing Bijing Biotechnology Co., LTD., BG-gdsAUTO130)', 'Nanodrop UV quantitative system (Thermo Fisher Scientific, NC2000)', 'Sequencer (Illumina, Novaseq6000)', 'fluorescence quantitative PCR instrument (Bio-rad Corporation, CFX)', 'vancomycin hydrochloride for injection (VIANEX S.A.)', 'bifidobacterium quadruplex viable tablets (Hangzhou Yuanda Biological Pharmaceutical)', 'NBP capsules (Shiyao Group Enbipu Pharmaceutical)', 'NBP reference (Shiyao Group Enbipu Pharmaceutical)', 'glipizide reference (Sichuan Vicchi Biochemical Technology)', 'Soil DNA Kit (Omega Bio-Tek)', 'Quant-iT PicoGreen dsDNA Assay Kit (ABI)', 'First Strand cDNA Synthesis Kit (Servicebio)', 'SYBR G

## Spacy

In [35]:
for key, value in company_dict.items():
    if value:
        print(f"{key}: {value}")

11198832: ['Thermo Fisher Scientific', 'Illumina', 'Thermo Fisher Scientific', 'LI-COR Biosciences']
11198784: ['Cytiva', 'Bio-Rad', 'STEMCELL Technologies', 'Narishige', 'Molecular Devices']
11197919: ['Tanita', 'Qiagen', 'Qiagen', 'Bio-Rad', 'Qiagen', 'BioSpec Products']
11197476: ['Malvern Panalytical']
11197204: ['Thermo Fisher Scientific', None, 'Bio-Rad', 'Bio-Rad']
11197185: ['Illumina', None, 'Thermo Fisher Scientific', None, 'Illumina']
11196966: ['Piramal', 'Kent Scientific', None, None, 'Qiagen', 'New England Biolabs', 'Qiagen', 'Qiagen', 'Thermo Fisher Scientific', 'Azure Biosystems', 'Zeiss', 'Vector Laboratories', 'Thermo Fisher Scientific', 'Vector Laboratories']
11196847: ['Thermo Fisher Scientific', None]
11196703: [None, None]
11196417: ['TransGen Biotech', 'Thermo Fisher Scientific', 'TransGen Biotech', 'Vazyme', 'Thermo Fisher Scientific']
11196393: ['Becton Dickinson', None, 'GraphPad Software']
11196377: ['Konica Minolta']
11196265: ['Canon']
11196252: ['Metrohm',

In [36]:
for key, value in cleaned_entity_dict.items():
    if value:
        print(f"{key}: {value}")

11198832: ['Nanodrop UV', 'Novaseq6000', 'Quant-iT PicoGreen dsDNA Assay Kit', 'The Odyssey CLx Imaging System']
11198784: ['Ficoll-Paque Premium', 'TC20 Automated cell counter', 'EasySep Negative Human NK Cell Isolation Kit', 'Micro Forge MF-900', 'Axopatch 200B']
11197919: ['Tanita SC-240 Total Body Composition Analyzer', 'QIAamp 96 DNA Blood Kit', 'QIAamp Investigator Kit', 'QX100 Droplet Digital PCR System', 'DNeasy PowerSoil 96 HTP Kit', 'BioSpec 1001 Mini-Beadbeater-96']
11197476: ['Malvern Nano ZS ZEN3600']
11197204: ['TRIzol', 'Easy Oligos', 'iTaq Universal One-Step', 'iTaq™ Universal SYBR® Green One-Step']
11197185: ['MiSeq', 'Hemotek', 'Trizol', 'AccuStart II', 'MiSeq 500']
11196966: ['Isoflurane Piramal', 'SomnoSuite® Low-Flow Anesthesia System', 'Liver Dissociation Kit', 'Tumor Dissociation Kit', 'RNeasy Mini Kit', 'ProtoScript II First Strand cDNA Synthesis Kit', 'DNeasy Blood & Tissue Kit', 'QuantiFast SYBR® Green PCR Kit', 'Pierce BCA-Kit', 'Azure Ao Microplate Reader', 