# Extract Method Section

In [1]:
from PubMed.PmcScraper import PmcScraper
import os
import ast
import spacy
import openai
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

In [1]:
# Load the spaCy transformer pipeline.
# Install the model first with: python -m spacy download en_core_web_trf
nlp = spacy.load("en_core_web_trf")

# The scraper is built with the key found in the NCBI_API_KEY environment
# variable (the original note suggests keeping it in a .env file).
pmc_scraper = PmcScraper(os.environ['NCBI_API_KEY'])

# Search by keyword, limited to the requested number of articles.
search_results = pmc_scraper.search_pmc(
    search_term="gut microbiota",
    number_of_articles=1,
)
pmc_ids = search_results['esearchresult']['idlist']

# Fetch the full text for the returned IDs, then pull out the 'method' section.
full_text_articles = pmc_scraper.fetch_full_text(pmc_ids)
method_sections = pmc_scraper.extract_section(full_text_articles, 'method')

# Pair each ID with its extracted section (nothing is printed here).
method_dict = {
    pmc_id: section
    for pmc_id, section in zip(pmc_ids, method_sections)
}

In [40]:
method_dict

{'11208022': 'METHODS Animals, Diets, and Baseline Liver Biopsy To obtain diet-induced obese NASH mice (DIO-NASH), male C57BL/6JRj mice aged 5 wk old were fed an amylin diet [AMLN, containing 40% fat (18% trans-fat), 40% carbohydrate (20% fructose), and 2% cholesterol; D09100310, Research Diets] (17) ad libitum for 43 wk to induce advanced histopathological features of NASH. A baseline liver biopsy was performed 3 wk before the drug treatment period as described previously (18). Only those mice (DIO-NASH cohort) with histologically confirmed steatosis ≥ 2 by hematoxylin-eosin stain (H&E) and fibrosis stage ≥ 1 by Picro-Sirius Red (PSR) (19) were stratified and randomly assigned to the treatment groups using a biopsy-based evaluation of liver type 1 collagen α1 (Col1a1) levels. Age-matched normal Chow mice were fed a Chow diet (Altromin 1324, Brogaarden, Denmark) for 43 wk. Throughout the study, the treatment group allocation was blinded to individuals who performed the study. Body weig

In [2]:
# Fixed list of article IDs (as integers) that is passed to
# pmc_scraper.fetch_full_text in the next cell; presumably PMC IDs — confirm
# against PmcScraper. These integers become the keys of method_dict.
article_ids = [
    11198919,
    11198832,
    11198784,
    11198029,
    11197919,
    11197476,
    11197378,
    11197282,
    11197204,
    11197185,
    11196966,
    11196847,
    11196703,
    11196440,
    11196417,
    11196393,
    11196377,
    11196265,
    11196252
]

In [3]:
# Fetch the full text of every article listed in article_ids.
full_text_articles = pmc_scraper.fetch_full_text(article_ids)

# Pull the 'method' section out of each fetched article.
method_sections = pmc_scraper.extract_section(full_text_articles, 'method')

# Pair each article ID with its extracted section (nothing is printed here).
method_dict = {
    article_id: section
    for article_id, section in zip(article_ids, method_sections)
}

In [4]:
method_dict

{11198919: 'Methods:This was a cross-sectional multicenter study enrolling children aged 0–48 months, attending pediatric clinics. Questionnaires evaluated the clinical history, symptoms, and sociodemographic information. FGIDs were defined according to Rome IV criteria. PATIENTS AND METHODS Study design We performed a prospective, cross-sectional, multicenter study to assess the hospital-based prevalence of FGIDs in five regions of Saudi Arabia (Center, North, East, West, and South) including infants and toddlers, aged 0–4 years. Recruitment was conducted in general pediatric clinics located in six cities: Jeddah (King Abdulaziz Medical City and Soliman Fakeeh Hospital), Riyadh (King Abdulaziz Medical City and King Faisal Hospital and Research Center, and King Saud University Medical City), Tabouk (Tabuk Armed Forces Hospital), Al-Madinah Al-Munawarah (Maternal Children Hospital), Khamis Mushait (Maternal Children Hospital), and Dammam (Maternal Children Hospital). We recruited the pa

# Identify Product Entities with spaCy

In [5]:
# Collect, per article, the distinct PRODUCT entity texts found by spaCy in
# the order of their first occurrence. Articles with no method text are
# skipped and get no entry.
entity_dict = {}
for article_id, method_text in method_dict.items():
    if not method_text:
        continue
    product_mentions = (
        ent.text for ent in nlp(method_text).ents if ent.label_ == 'PRODUCT'
    )
    # dict.fromkeys drops repeats while keeping first-seen order.
    entity_dict[article_id] = list(dict.fromkeys(product_mentions))

In [6]:
entity_dict

{11198919: ['Rome IV',
  'the Statistical Package for the Social Sciences',
  'SPSS',
  'Windows'],
 11198832: ['Western Blot',
  'SCIEX',
  '2720',
  'FLX800',
  'Nanodrop UV',
  'NC2000',
  'Sequencer',
  'Novaseq6000',
  'Quant-iT PicoGreen dsDNA Assay Kit',
  'First Strand',
  'SYBR Green',
  'Servicebio',
  'GAPDH',
  'The Odyssey CLx Imaging System'],
 11198784: ['Ficoll-Paque Premium',
  'TC20 Automated cell counter',
  'an EasySep Negative Human NK Cell Isolation Kit',
  'Micro Forge MF-900',
  'Axopatch 200B',
  'PregS',
  'Ononetin',
  'GraphPad Prism v9',
  'GraphPad Prism version'],
 11198029: ['Embase'],
 11197919: ['BioRender',
  'RS4',
  'a Tanita SC-240 Total Body Composition Analyzer',
  'the Automated Self-Administered 24-hour',
  'ASA24',
  'the QIAamp 96 DNA Blood Kit',
  'the QIAamp Investigator Kit',
  'iTaq™ Universal SYBR® Green Supermix',
  'NA12286',
  'AMY1',
  'a QX100 Droplet Digital PCR System',
  'SALIMETRICS',
  'the DNeasy PowerSoil 96 HTP Kit',
  'Qiag

## Filtering Products with LLM

In [2]:
# Configure the openai module with the key stored in the OPENAI_API_KEY
# environment variable; a missing variable raises KeyError here.
openai.api_key = os.environ['OPENAI_API_KEY']

In [8]:
# System prompt for the entity-filtering step: the model receives a list of
# strings and is told to answer with a list containing only the products,
# in the format shown by the embedded example.
system_prompt = """You are an expert Product Recognition system.
Your task is to accept a list of strings as input and extract the products.
Here is an example of the output format for a list of strings.

List of strings: ["Python", "Researchers used 3M Littmann Stethoscope", "Omron Blood Pressure Monitor is important", "I love travelling", "Braun ThermoScan"]
Answer: ["Python", "3M Littmann Stethoscope", "Omron Blood Pressure Monitor", "Braun ThermoScan"]

Only use this output format. Do not return anything besides this output format.
Output products in the order they occur in the input list of strings.
"""

# User prompt template; the single {} placeholder is filled with str.format()
# by product_filtering.
user_prompt = """Q: Given the list of strings below, identify the products.

List of strings: {}
Answer:
"""

In [9]:
def product_filtering(list_of_products):
    """Ask the OpenAI chat model which of the given strings are products.

    The module-level ``system_prompt`` and ``user_prompt`` are looked up at
    call time, so whichever prompts were defined most recently are used.

    Args:
        list_of_products: Value substituted into the ``{}`` placeholder of
            ``user_prompt`` (the caller passes a list of entity strings).

    Returns:
        The message content of the first returned choice, unparsed.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt.format(list_of_products)},
    ]
    completion = openai.chat.completions.create(
        model="gpt-4-turbo",
        messages=conversation,
        # Near-zero temperature, presumably to keep answers repeatable
        # (top_p=1e-16 was tried as an alternative and left disabled).
        temperature=1e-16,
        presence_penalty=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [10]:
# Filter each article's spaCy entities through the LLM and parse the answer
# into a Python list. Articles with no entities, or whose answer cannot be
# parsed, get no entry in filtered_entity_dict.
filtered_entity_dict = {}
for key, value in entity_dict.items():
    if not value:
        continue
    raw_answer = product_filtering(value)
    try:
        # literal_eval only evaluates literals, so it is safe to run on model
        # output. Besides SyntaxError it raises ValueError/TypeError for
        # well-formed text that is not a literal (e.g. [SPSS, Windows] or
        # null), so those are caught too instead of aborting the whole loop.
        filtered_products = ast.literal_eval(raw_answer)
    except (SyntaxError, ValueError, TypeError):
        print(f"Syntax Error with ID {key}")
        continue
    filtered_entity_dict[key] = filtered_products

### Cleaning Product Entities

In [21]:
def remove_stopwords(input_string, stopwords=("a", "an", "the", "of")):
    """Drop stopword tokens from a whitespace-separated string.

    Args:
        input_string: Text to clean; it is split on runs of whitespace.
        stopwords: Iterable of tokens to discard. Matching is exact and
            case-sensitive, so capitalised tokens such as "The" are kept
            with the default list. Pass a longer collection (for example
            NLTK's stopword list) when the default is not enough.

    Returns:
        The remaining tokens joined by single spaces.
    """
    excluded = frozenset(stopwords)
    return " ".join(
        token for token in input_string.split() if token not in excluded
    )

In [22]:
# Strip stopwords from every product name; articles whose product list is
# empty are left out of the result.
cleaned_entity_dict = {
    article_id: [remove_stopwords(name) for name in products]
    for article_id, products in filtered_entity_dict.items()
    if products
}

In [24]:
# Show the cleaned product names, one article per line; empty lists are skipped.
for key, value in cleaned_entity_dict.items():
    if not value:
        continue
    print(f"{key}: {value}")

11198919: ['Rome IV', 'Statistical Package for Social Sciences', 'SPSS', 'Windows']
11198832: ['Western Blot', 'SCIEX', '2720', 'FLX800', 'Nanodrop UV', 'NC2000', 'Sequencer', 'Novaseq6000', 'Quant-iT PicoGreen dsDNA Assay Kit', 'First Strand', 'SYBR Green', 'Servicebio', 'GAPDH', 'The Odyssey CLx Imaging System']
11198784: ['Ficoll-Paque Premium', 'TC20 Automated cell counter', 'EasySep Negative Human NK Cell Isolation Kit', 'Micro Forge MF-900', 'Axopatch 200B', 'GraphPad Prism v9', 'GraphPad Prism version']
11198029: ['Embase']
11197919: ['BioRender', 'RS4', 'Tanita SC-240 Total Body Composition Analyzer', 'Automated Self-Administered 24-hour', 'ASA24', 'QIAamp 96 DNA Blood Kit', 'QIAamp Investigator Kit', 'iTaq™ Universal SYBR® Green Supermix', 'QX100 Droplet Digital PCR System', 'SALIMETRICS', 'DNeasy PowerSoil 96 HTP Kit', 'Qiagen', 'BioSpec 1001 Mini-Beadbeater-96', 'Mag-Bind TotalPure', 'Omega Bio', 'MiSeq', 'RStudio', 'ComplexHeatmap40', 'circlize41', 'MaAsLin2', 'glmnet43']
1

# Identify Products with LLM

## OpenAI

In [5]:
# System prompt for extracting products directly from free text (manufacturer
# included when the text gives one). The answer is requested as a list so it
# can be parsed with ast.literal_eval afterwards. This rebinds the
# system_prompt / user_prompt names that product_filtering also reads.
# NOTE(review): the sentence introducing the example still says "for a list of
# strings" although the input here is a single text — confirm whether the
# wording should be updated (doing so changes what is sent to the model).
system_prompt = """You are an expert Product Recognition system.
Your task is to accept a text as input and extract the products.
If the manufacturer of the product is indicated, include it with the product name.
Here is an example of the output format for a list of strings.

Text: I started my day with a fresh cup of coffee (Folgers) brewed from my coffee maker (Keurig), and then I prepared breakfast using my new KitchenAid stand mixer. After breakfast, I grabbed my running shoes (Nike) and went for a jog in the park. On my way back, I stopped by the store to pick up a pack of Tide laundry detergent and a bottle of Coca-Cola. After getting home, I started working on my laptop, which runs on a Microsoft operating system.
Answer: ["Folgers coffee", "Keurig coffee maker", "KitchenAid stand mixer", "Nike running shoes", "Tide laundry detergent", "Coca-Cola", "Microsoft operating system"]

Only use this output format. Do not return anything besides this output format.
Output products in the order they occur in the input text.
"""

# User prompt template; the single {} placeholder is filled with str.format()
# by product_recognition.
user_prompt = """Q: Given the text below, identify the products.

Text: {}
Answer:
"""

In [6]:
def product_recognition(input_text):
    """Ask the OpenAI chat model to list the products mentioned in a text.

    The module-level ``system_prompt`` and ``user_prompt`` are looked up at
    call time, so whichever prompts were defined most recently are used.

    Args:
        input_text: Value substituted into the ``{}`` placeholder of
            ``user_prompt`` (the caller passes an article's method text).

    Returns:
        The message content of the first returned choice, unparsed.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt.format(input_text)},
    ]
    completion = openai.chat.completions.create(
        model="gpt-4-turbo",
        messages=conversation,
        # Near-zero temperature, presumably to keep answers repeatable
        # (top_p=1e-16 was tried as an alternative and left disabled).
        temperature=1e-16,
        presence_penalty=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [7]:
# Run product recognition on each article's method text and parse the answer
# into a Python list. Articles with no method text, or whose answer cannot be
# parsed, get no entry in product_dict.
product_dict = {}
for key, value in method_dict.items():
    if not value:
        continue
    raw_answer = product_recognition(value)
    try:
        # literal_eval only evaluates literals, so it is safe to run on model
        # output. Besides SyntaxError it raises ValueError/TypeError for
        # well-formed text that is not a literal (e.g. unquoted names or
        # null), so those are caught too instead of aborting the whole loop.
        identified_products = ast.literal_eval(raw_answer)
    except (SyntaxError, ValueError, TypeError):
        print(f"Syntax Error with ID {key}")
        continue
    product_dict[key] = identified_products

In [9]:
# Show the recognised products, one article per line; empty lists are skipped.
for key, value in product_dict.items():
    if not value:
        continue
    print(f"{key}: {value}")

11198832: ['Shimadzu high-performance liquid chromatography (HPLC)', 'SCIEX mass spectrometer (MS)', 'ABI PCR amplification instrument', 'BioTek enzyme labeling instrument', 'Beijing Liuyi Instrument Factory electrophoresis apparatus', 'Beijing Bijing Biotechnology Co. gel imaging system', 'Thermo Fisher Scientific Nanodrop UV quantitative system', 'Illumina Sequencer', 'Bio-rad Corporation fluorescence quantitative PCR instrument', 'VIANEX S.A. vancomycin hydrochloride for injection', 'Hangzhou Yuanda Biological Pharmaceutical bifidobacterium quadruplex viable tablets', 'Shiyao Group Enbipu Pharmaceutical NBP capsules', 'Shiyao Group Enbipu Pharmaceutical NBP reference', 'Sichuan Vicchi Biochemical Technology glipizide reference', 'Omega Bio-Tek Soil DNA Kit', 'ABI Quant-iT PicoGreen dsDNA Assay Kit', 'Servicebio First Strand cDNA Synthesis Kit', 'Servicebio SYBR Green qPCR Master Mix']
11198784: ['Ficoll-Paque Premium density gradient medium', 'Invitrogen trypan blue dye', 'Bio-Rad T

## Mistral

In [8]:
# System prompt used for the Mistral run. It matches the OpenAI recognition
# prompt except that the instruction to include the manufacturer is absent.
# This rebinds the module-level system_prompt / user_prompt names.
# NOTE(review): the sentence introducing the example says "for a list of
# strings" although the input here is a single text — confirm whether the
# wording should be updated (doing so changes what is sent to the model).
system_prompt = """You are an expert Product Recognition system.
Your task is to accept a text as input and extract the products.
Here is an example of the output format for a list of strings.

Text: I started my day with a fresh cup of coffee (Folgers) brewed from my coffee maker (Keurig), and then I prepared breakfast using my new KitchenAid stand mixer. After breakfast, I grabbed my running shoes (Nike) and went for a jog in the park. On my way back, I stopped by the store to pick up a pack of Tide laundry detergent and a bottle of Coca-Cola. After getting home, I started working on my laptop, which runs on a Microsoft operating system.
Answer: ["Folgers coffee", "Keurig coffee maker", "KitchenAid stand mixer", "Nike running shoes", "Tide laundry detergent", "Coca-Cola", "Microsoft operating system"]

Only use this output format. Do not return anything besides this output format.
Output products in the order they occur in the input text.
"""

# User prompt template; the single {} placeholder is filled with str.format()
# by product_recognition.
user_prompt = """Q: Given the text below, identify the products.

Text: {}
Answer:
"""

In [9]:
# Mistral API key, read from the MISTRAL_API_KEY environment variable; a
# missing variable raises KeyError here. Used by the Mistral
# product_recognition function when it builds its client.
api_key = os.environ["MISTRAL_API_KEY"]

In [10]:
def product_recognition(input_text):
    """Ask the Mistral chat model to list the products mentioned in a text.

    A new ``MistralClient`` is created on every call from the module-level
    ``api_key``; ``system_prompt`` and ``user_prompt`` are also module-level
    names looked up at call time.

    Args:
        input_text: Value substituted into the ``{}`` placeholder of
            ``user_prompt`` (the caller passes an article's method text).

    Returns:
        The message content of the first returned choice, unparsed.
    """
    mistral = MistralClient(api_key=api_key)
    chat_history = [
        ChatMessage(role="system", content=system_prompt),
        ChatMessage(role="user", content=user_prompt.format(input_text)),
    ]
    reply = mistral.chat(
        model="mistral-large-latest",
        messages=chat_history,
        # Near-zero temperature, presumably to keep answers repeatable.
        temperature=1e-16,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content

In [12]:
# Print the raw Mistral answer for every article that has method text.
for key, value in method_dict.items():
    # Skip articles with no extracted method text, as the OpenAI extraction
    # loop in this notebook does, rather than spending an API call on an
    # empty or missing input.
    if not value:
        continue
    identified_products = product_recognition(value)
    print(f"{key}: {identified_products}")
        

11198919: []
11198832: ["Shimadzu high-performance liquid chromatography (HPLC)", "AB SCIEX mass spectrometer (MS)", "ABI PCR amplification instrument", "BioTek enzyme labeling instrument", "Beijing Liuyi electrophoresis apparatus", "Beijing Bijing Biotechnology Co., LTD. gel imaging system", "Thermo Fisher Scientific Nanodrop UV quantitative system", "Illumina Sequencer", "Bio-rad Corporation fluorescence quantitative PCR instrument", "vancomycin hydrochloride for injection (VIANEX S.A.)", "bifidobacterium quadruplex viable tablets (Hangzhou Yuanda Biological Pharmaceutical)", "NBP capsules (Shiyao Group Enbipu Pharmaceutical)", "NBP reference (Shiyao Group Enbipu Pharmaceutical)", "glipizide reference (Sichuan Vicchi Biochemical Technology)", "Omega Bio-Tek Soil DNA Kit", "ABI Quant-iT PicoGreen dsDNA Assay Kit", "Servicebio First Strand cDNA Synthesis Kit", "Servicebio SYBR Green qPCR Master Mix"]
11198784: []
11198029: []
11197919: ["HI-MAIZE® 260 starch", "VERSAFIBE™ 1490 starch",

In [13]:
# Per-article product lists written out as a literal; the entries appear to be
# the Mistral answers printed by the previous cell. Keys are article IDs as
# strings (method_dict built from article_ids uses integer keys instead).
# The "DNeasy Blood & Tissue Kit (Qiagen)" entry was previously broken across
# two lines inside its quotes, which is a syntax error; it is now one string.
product_dict = {
    "11198919": [],
    "11198832": ["Shimadzu high-performance liquid chromatography (HPLC)", "AB SCIEX mass spectrometer (MS)", "ABI PCR amplification instrument", "BioTek enzyme labeling instrument", "Beijing Liuyi electrophoresis apparatus", "Beijing Bijing Biotechnology Co., LTD. gel imaging system", "Thermo Fisher Scientific Nanodrop UV quantitative system", "Illumina Sequencer", "Bio-rad Corporation fluorescence quantitative PCR instrument", "vancomycin hydrochloride for injection (VIANEX S.A.)", "bifidobacterium quadruplex viable tablets (Hangzhou Yuanda Biological Pharmaceutical)", "NBP capsules (Shiyao Group Enbipu Pharmaceutical)", "NBP reference (Shiyao Group Enbipu Pharmaceutical)", "glipizide reference (Sichuan Vicchi Biochemical Technology)", "Omega Bio-Tek Soil DNA Kit", "ABI Quant-iT PicoGreen dsDNA Assay Kit", "Servicebio First Strand cDNA Synthesis Kit", "Servicebio SYBR Green qPCR Master Mix"],
    "11198784": [],
    "11198029": [],
    "11197919": ["HI-MAIZE® 260 starch", "VERSAFIBE™ 1490 starch", "AMIOCA™ TF starch", "Tanita SC-240 Total Body Composition Analyzer", "Automated Self-Administered 24-hour (ASA24) Dietary Assessment Tool 2020", "QIAamp 96 DNA Blood Kit", "QIAamp Blood Mini Kit", "QIAamp Investigator Kit", "iTaq™ Universal SYBR® Green Supermix", "Salimetrics Salivary Alpha-Amylase Enzymatic Kit", "DNeasy PowerSoil 96 HTP Kit", "Classic++™ Hot Start Taq DNA Polymerase Master Mix", "Mag-Bind TotalPure", "QIIME 2", "DADA2", "mafft", "fasttree", "Greengenes 13_8 99% OTUs reference sequences", "RStudio version 4.2.1", "ComplexHeatmap", "circlize", "MaAsLin2 (Microbiome Multivariable Association with Linear Models 2)", "glmnet"],
    "11197476": ["CXB@Lipo (C@L)", "chloroform", "HSPC", "LiposoFast (Avestin, Canada)", "polycarbonate membrane filters (Merck Millipore Ltd., USA)", "ultrafiltration centrifuge tube", "ECN", "glycol chitosan", "transmission electron microscopy (TEM; JEM-1400, Japan)", "Malvern Nano ZS ZEN3600 (Malvern, UK)", "UV-VIS (Thermo Fisher Scientific, USA)", "ICG@ECN (I@ECN)", "flow cytometry (CytoFLEX, Beckman, USA)", "UV‒VIS", "pGEX-Pvhb-Lysis", "cloning kit (Beyotime Biotechnology Co. Ltd., Shanghai, China)", "ICG@Liposome (I@L)", "confocal laser scanning microscopy (CLSM) (Nikon, Japan)", "CCK-8 assay kit (Beyotime Biotechnology Co. Ltd., Shanghai, China)", "Western blot (WB) analysis", "FITC-conjugated anti-COX-2 antibody (Affinity, Jiangsu, China)", "in vivo images system (IVIS)", "optimal cutting temperature (OCT) compound", "CT26 tumor model", "FITC-conjugated anti-mouse CD3", "FITC-conjugated anti-mouse CD4", "PE-conjugated anti-mouse CD8 (Biolegend Inc., USA)", "TUNEL and H&E staining", "Ki67", "C@ECN-PL", "aspartate aminotransferase (AST)", "alanine aminotransferase (ALT)", "blood urea nitrogen (BUN)", "creatinine (CRE)", "assay kits (Jiancheng Bioengineering Institute, Nanjing, China)", "H&E staining analysis", "ImageJ 6.0"],
    "11197378": ["EXCEL", "IBM SPSS 20", "R language"],
    "11197282": [],
    "11197204": ["Gene Expression Omnibus (GEO)", "quantitative PCR method", "UK National Research Ethics Committee (NRES 16/SS/0172)", "BH false discovery rate correction", "TRIzol (Life Technologies)", "TaqMan™ Gene Expression Assay (FAM) from Life Technologies Ltd", "Easy Oligos from Merck Life Science UK Ltd", "iTaq Universal One-Step or iTaq™ Universal SYBR® Green One-Step Kit from BioRad", "GraphPad PRISM", "ShinyGO", "Enrichr", "Tools"],
    "11197185": ["ZIKV", "Vero cell culture (ATCC)", "C636 cell culture (ATCC)", "Tetra Pond Koi growth feed", "defibrinated sheep blood", "sodium bicarbonate", "Hemotek membrane feeding system (Discovery Workshops, Accrington, UK)", "triethylamine (Sigma Aldrich, St. Louis, MO, USA)", "filter sterilized fetal bovine serum", "Mosquito Diluent [MD]", "penicillin/streptomycin", "gentamicin", "Fungizone (Sigma Aldrich, St. Louis, MO, USA)", "phosphate-buffered Saline (PBS) (Sigma Aldrich, St. Louis, MO, USA)", "Illumina MiSeq", "REPLI-g whole transcriptome amplification (WTA) single-cell kit (Qiagen, Hilden, Germany)", "Illumina barcoded 16S primer set", "AccuStart II PCR supermix (Quantabio, Beverly, MA, USA)", "QIITA", "MicrobiomeAnalyst", "GreenGenes 16S reference database", "BIOM-formatted OTU table", "NCBI GenBank Short Read Archives"],
    "11196966": ["Advanced DMEM-F12 (Gibco/Thermo Fisher Scientific)", "penicillin (Gibco/Thermo Fisher Scientific)", "streptomycin (Gibco/Thermo Fisher Scientific)", "HEPES (Invitrogen/Thermo Fisher Scientific)", "Glutamax (Invitrogen/Thermo Fisher Scientific)", "N2 (Gibco/Thermo Fisher Scientific)", "B27 (Gibco/Thermo Fisher Scientific)", "N-acetylcysteine (Sigma-Aldrich)", "hygromycin (Invitrogen/Thermo Fisher Scientific)", "puromycin (InvivoGen)", "Cultrex RGF basement membrane extract (BME) Type 2 (R&D Systems)", "cell counter TC20 (BioRad)", "collagen I/5x neutralization buffer", "buprenorphine (Temgesic, Schering-Plough)", "Isoflurane Piramal (Piramal)", "SomnoSuite® Low-Flow Anesthesia System (Kent Scientific Corporation)", "Seprafilm (Baxter)", "6-0 Vicryl (Ethicon/Johnson&Johnson) suture", "9.4 tesla small bore animal scanner (BioSpec 94/21, Bruker Biospin)", "dedicated mouse quadrature-resonator (Bruker)", "VEVO 3100 (Fujifilm/VisualSonics) ultrasound", "small animal transducer", "RNAprotect buffer (Qiagen, Netherlands)", "Liver Dissociation Kit, mouse (Miltenyi Biotec)", "Tumor Dissociation Kit, mouse (Miltenyi Biotec)", "ACK lysis buffer (Gibco/Thermo Fisher Scientific)", "TruStain FcX PLUS (anti-mouse CD16/32) antibody (BioLegend)", "FACS buffer (PBS + 0,5% BSA)", "1x Fixation/Permeabilization buffer", "eBioscience™ Foxp3/Transcription Factor Staining Buffer Set (Invitrogen/Thermo Fisher Scientific)", "FACS antibodies (BioLegend)", "Beckman Coulter Gallios (Beckman Coulter)", "BD LSRFortessa (Becton Dickinson)", "Kaluza Software (Beckman Coulter)", "RPMI-Medium (+ 1% Pen/Strep + 1% HEPES + 5 µl beta-mercaptoethanol)", "Cell Activation Cocktail (BioLegend)", "Brefeldin A (BioLegend)", "RLT buffer (Qiagen, Venlo, Netherlands)", "RNeasy Mini Kit (Qiagen)", "NanoDrop™ spectrophotometer (Thermo Fisher Scientific)", "ProtoScript II First Strand cDNA Synthesis Kit (New England Biolabs)", "QuantiTect SYBR Green RT PCR mastermix (Qiagen)", "DNeasy Blood & Tissue Kit (Qiagen)", "primer sequences (purchased from Eurofins Genomics)", "Roche LightCycler 480 (Roche)", "QuantiFast SYBR® Green PCR Kit (Qiagen)", "Bifidobacterium actinocoloniiforme (DSMZ-Deutsche Sammlung von Mikroorganismen und Zellkulturen GmbH, DSM22766)"],
    "11196847": ["Potato dextrose agar (PDA) medium", "Potato dextrose broth (PDB)", "ABI 3130 Genetic Analyzer", "Taq DNA polymerase Assay Buffer", "ABI 3130xl gel sequencing", "Phylogenetic Tree Builder", "Ethanol (80%, v/v)", "Whatman paper No. 4", "Lowry's assay", "Folin-Ciocalteu reagent", "Bovine Serum Albumin (BSA)", "Anthrone reagent", "Anthrone technique", "Quercetin", "Folin-Ciocalteu method", "Gallic acid equivalent (GAE)", "Aescin", "Ferric chloride", "Mg(CH3COO)2 in CH3OH", "Rhein", "NH4OH", "Liebermann-Burchard reagent", "DPPH radicals", "Liebermann-Burchard reagent", "β-glucan assay kit (Megazyme Int., Dublin, Ireland)", "High-performance liquid chromatography (PDA)", "RP C-18 column", "Quercetin", "Sesquiterpene glycosides", "Ergosterol", "RidentinB", "Statistical Package for the Social Sciences (SPSS)"],
    "11196703": ["ENSO 16 sugar substitute", "glucose in water solution", "NEOH by Alpha Republic GmbH", "STAMAG Stadlauer Malzfabrik GesmbH", "Special Ingredients Ltd glucose powder", "GraphPad Prism 9"],
    "11196440": [],
    "11196417": ["GEO database", "limma package", "R software", "ggplot2 package", "GSEA official website", "MSigDB database", "clusterProfiler package", "enrichplot package", "ImmuCellAI", "ssGSEA", "STRING 4 online platform", "Cytoscape V3.9.0 software", "CytoHubba plugin", "Random Forest algorithm", "SVM-RFE", "LASSO model", "Logistic regression", "Receiver Operating Characteristic (ROC) curve", "Area Under the Curve (AUC)", "miRNet database", "Drug-Gene Interaction database (DGIdb)", "ggplot2 package", "TransZol Up Plus RNA Kit", "Nanodrop Spectrophotometer", "TransScript® One-Step gDNA Removal and cDNA Synthesis SuperMix", "ChamQ Universal SYBR qPCR Master Mix", "QuantStudio™ 5 Real-Time PCR System", "Image J software"],
    "11196393": ["Ficoll gradient", "ACK lysing buffer", "RPMI1640 supplemented with 20% FBS", "BD FACSAria III cell sorter", "Fc-block (Miltenyi)", "surface antibodies against CD11b, HLA-DR, CD14, CD177, and CXCR2", "anti-Arg1 (A1exF5; Thermo Fisher)", "DCFDA (Abcam)", "BD FACSCanto II", "APC antihuman CD11b(ICRF44)", "Brilliant Violet 605™ antihuman HLA-DR(L243)", "APC/Cyanine7 antihuman CD14(63D3)", "PerCP/Cyanine5.5 antihuman CXCR2(5E8/CXCR2)", "FITC antihuman CD177(MEM-166)", "Cell Ranger (version 7.0)", "Seurat (version 4.0.1)", "monocle2 (version 2.26.0)", "GraphPad Prism 8.0"],
    "11196377": ["Cobb-500 chicks", "Evonik Co. near-infrared analysis", "UFFDA software", "Cobb-Vantress commercial management guide", "Vitamin premix", "Mineral premix", "SAS 9.4 software"],
    "11196265": ["Polyethylene microplastics (PE-MPs) powder", "Purified water (Milli-Q)", "Tadalafil (Cilais) substance", "Sildenafil", "Commercial fish pellet feed (containing 30% protein and 12% lipid; El Nasr Company, Egypt)", "Automated technical analyzer (BC-2800 from Mindray)", "Spectrophotometer (in a wavelength range of 340-546 nm; Biodiagonstic Company, Egypt)", "Kits for TAC (total antioxidant capacity) determination (Sigma-Aldrich, USA)", "Olympus CH30 microscope", "IL-6 antibody (E-AB-40021, Elabscience Biotechnology Inc, USA)", "Streptavidin-biotin-peroxidase pack", "Mayer's hematoxylin", "SPSS 16.0 program for Windows (SPSS 2007, Inc., IL, USA)", "Canon digital camera (PowershotA95)"],
    "11196252": ["AUTOLAB-PGSTAT302N instrument", "NOVA 1.11.1 software", "platinum (Pt) wires", "silver/silver chloride wires (Ag/AgCl)", "EmStat3 potentiostat", "PSTrace software", "automated electrode stand (M164, MTM Anko Instruments)", "glassy carbon electrode (GCE)", "silver/silver chloride electrode (Ag/AgCl/3.0 mol L−1 KCl)", "platinum wire (Pt, 99.99%, The Mint of Poland)", "DANO (Merck)", "hydrochloric acid (HCl)", "Potassium tetrakis(4-chlorophenyl)borate (KTPBCl, Merck)", "bis(triphenylphosphoranylidene) ammonium chloride (BTPPACl, Merck)", "bis(triphenylphosphoranylidene)ammonium tetrakis(4-chlorophenyl)borate (BTPPATPBCl)", "Tetrapropylammonium chloride (TPrACl, Alfa Aesar)", "Britton-Robinson buffers (BRBs)", "1,2-dichloroethane (1,2-DCE, POCH)", "citric acid (Sigma-Aldrich)", "galactose (Chempur)", "lactose (Fisher Chemical)", "glucose (Alfa Aesar)", "calcium chloride (Sigma-Aldrich)", "potassium chloride (Chempur)", "magnesium chloride (Fisher Chemical)", "iron(III) chloride (Alfa Aesar)", "sodium lactate (Sigma-Aldrich)", "orthophosphoric acid (V) (Alfa Aesar)", "combined pH electrode (Polilyte Lab, Hamilton, Switzerland)"]
}

In [14]:
# Show the stored product lists, one article per line; empty lists are skipped.
for key, value in product_dict.items():
    if not value:
        continue
    print(f"{key}: {value}")

11198832: ['Shimadzu high-performance liquid chromatography (HPLC)', 'AB SCIEX mass spectrometer (MS)', 'ABI PCR amplification instrument', 'BioTek enzyme labeling instrument', 'Beijing Liuyi electrophoresis apparatus', 'Beijing Bijing Biotechnology Co., LTD. gel imaging system', 'Thermo Fisher Scientific Nanodrop UV quantitative system', 'Illumina Sequencer', 'Bio-rad Corporation fluorescence quantitative PCR instrument', 'vancomycin hydrochloride for injection (VIANEX S.A.)', 'bifidobacterium quadruplex viable tablets (Hangzhou Yuanda Biological Pharmaceutical)', 'NBP capsules (Shiyao Group Enbipu Pharmaceutical)', 'NBP reference (Shiyao Group Enbipu Pharmaceutical)', 'glipizide reference (Sichuan Vicchi Biochemical Technology)', 'Omega Bio-Tek Soil DNA Kit', 'ABI Quant-iT PicoGreen dsDNA Assay Kit', 'Servicebio First Strand cDNA Synthesis Kit', 'Servicebio SYBR Green qPCR Master Mix']
11197919: ['HI-MAIZE® 260 starch', 'VERSAFIBE™ 1490 starch', 'AMIOCA™ TF starch', 'Tanita SC-240 To

In [13]:
# Show the stored product lists, one article per line; empty lists are skipped.
for key, value in product_dict.items():
    if not value:
        continue
    print(f"{key}: {value}")

11197919: ['HI-MAIZE® 260 starch', 'VERSAFIBE™ 1490 starch', 'AMIOCA™ TF starch', 'Tanita SC-240 Total Body Composition Analyzer', 'Automated Self-Administered 24-hour (ASA24) Dietary Assessment Tool 2020', 'QIAamp 96 DNA Blood Kit', 'QIAamp Blood Mini Kit', 'QIAamp Investigator Kit', 'iTaq™ Universal SYBR® Green Supermix', 'Salimetrics Salivary Alpha-Amylase Enzymatic Kit', 'DNeasy PowerSoil 96 HTP Kit', 'Classic++™ Hot Start Taq DNA Polymerase Master Mix', 'Mag-Bind TotalPure', 'QIIME 2', 'DADA2', 'mafft', 'fasttree']
11197476: ['CXB@Lipo', 'chloroform', 'rotary evaporator', 'liposome extrude (LiposoFast, Avestin, Canada)', 'polycarbonate membrane filters (Merck Millipore Ltd., USA)', 'ultrafiltration centrifuge tube', 'ECN', 'glycol chitosan', 'transmission electron microscopy (TEM; JEM-1400, Japan)', 'Malvern Nano ZS ZEN3600 (Malvern, UK)', 'UV-VIS (Thermo Fisher Scientific, USA)', 'ICG@ECN (I@ECN)', 'flow cytometry (CytoFLEX, Beckman, USA)', 'UV‒VIS', 'pGEX-Pvhb-Lysis', 'cloning k

# Company Identification (New)

## OpenAI

In [3]:
# System prompt for mapping each product string to its manufacturer. The model
# is told to answer with one entry per input string, in input order, using a
# null value (None in the example) when the manufacturer is unclear.
# This rebinds the module-level system_prompt / user_prompt names.
system_prompt = """You are an expert Company Identification system. 
Your task is to take a list of strings as input and identify the company that manufactures each mentioned product.
If the strings do not contain company names, use your internal knowledge to determine the companies.
For any product where the manufacturer is unclear, return a null value for that string.
Here is an example of the output format for a list of strings.

List of strings: ["Sony PlayStation 5", "iPhone 15 Plus", "Staples Pencil", "Stainless Steel Tumbler", "Car", "Whopper", "Amazon Speed ETD"]
Answer: ["Sony", "Apple", "Staples", None, None, "Burger King", "Bruker"]

Only use this output format. Do not output anything besides this output format.
Output the manufacturer names in the order they occur in the input list of strings.
"""

# User prompt template with a single {} placeholder for the list of strings.
# NOTE(review): "companies that manufactures" is ungrammatical; left untouched
# because editing it changes the text sent to the model.
user_prompt = """Q: Given the list of strings below, identify the companies that manufactures each product.

List of strings: {}
Answer:
"""

In [4]:
product_dict = {
    "11198832": ['Shimadzu high-performance liquid chromatography (HPLC)', 'SCIEX mass spectrometer (MS)', 'ABI PCR amplification instrument', 'BioTek enzyme labeling instrument', 'Beijing Liuyi Instrument Factory electrophoresis apparatus', 'Beijing Bijing Biotechnology Co. gel imaging system', 'Thermo Fisher Scientific Nanodrop UV quantitative system', 'Illumina Sequencer', 'Bio-rad Corporation fluorescence quantitative PCR instrument', 'VIANEX S.A. vancomycin hydrochloride for injection', 'Hangzhou Yuanda Biological Pharmaceutical bifidobacterium quadruplex viable tablets', 'Shiyao Group Enbipu Pharmaceutical NBP capsules', 'Shiyao Group Enbipu Pharmaceutical NBP reference', 'Sichuan Vicchi Biochemical Technology glipizide reference', 'Omega Bio-Tek Soil DNA Kit', 'ABI Quant-iT PicoGreen dsDNA Assay Kit', 'Servicebio First Strand cDNA Synthesis Kit', 'Servicebio SYBR Green qPCR Master Mix'],
    "11198784": ['Ficoll-Paque Premium density gradient medium', 'Invitrogen trypan blue dye', 'Bio-Rad TC20 Automated cell counter', 'EasySep Negative Human NK Cell Isolation Kit', 'Becton Dickinson CD56 APC', 'Becton Dickinson CD3 PE Cy7', 'BD Bioscience stain buffer', 'BD LSR-Fortessa X-20 flow cytometer', 'Harvard Apparatus borosilicate glass capillaries', 'Sutter Instrumental P-97 glass pipette', 'Narishige Micro Forge MF-900', 'Molecular Devices CV203BU head-stage', 'Narishige manipulators', 'Molecular Devices Axopatch 200B amplifier', 'Molecular Devices pClamp 10.7 software', 'Molecular Devices Digidata 1440A', 'Sigma-Aldrich membrane filter', 'Sapphire Bioscience Reagents ATP', 'Sapphire Bioscience Reagents GTP', 'Tocris Bioscience PregS', 'Tocris Bioscience Ononetin', 'Sigma-Aldrich chemicals', 'IBM SPSS software', 'GraphPad Prism software', 'OriginLab Origin 2021 software'],
    "11197919": ['HI-MAIZE® 260 starch', 'VERSAFIBE™ 1490 starch', 'AMIOCA™ TF starch', 'Salimetrics Salivary Alpha-Amylase Enzymatic Kit'],
    "11197476": ['LiposoFast liposome extruder', 'Merck Millipore polycarbonate membrane filters', 'ultrafiltration centrifuge tube', 'JEM-1400 transmission electron microscope', 'Malvern Nano ZS ZEN3600', 'UV-VIS Thermo Fisher Scientific', 'CytoFLEX flow cytometer', 'Beyotime Biotechnology cloning kit', 'Nikon confocal laser scanning microscopy', 'CCK-8 assay kit Beyotime Biotechnology', 'IVIS in vivo imaging system', 'Biolegend Inc. antibodies', 'Jiancheng Bioengineering Institute assay kits'],
    "11197378": ['Qingyue Open Environmental Data Centre data', 'Gaode Map', 'EXCEL database', 'IBM SPSS 20 software', 'R language'],
    "11197282": ['Metabolon untargeted metabolomics profiling'],
    "11197204": ['TRIzol', 'TaqMan Gene Expression Assay', 'iTaq Universal One-Step Kit', 'iTaq Universal SYBR Green One-Step Kit', 'BioRad', 'GraphPad PRISM', 'Enrichr', 'ShinyGO'],
    "11197185": ['Illumina MiSeq platform', 'Vero cell culture (ATCC)', 'C636 cell culture (ATCC)', 'Sterilite container', 'Tetra Pond Koi growth feed', 'Hemotek membrane feeding system', 'Sigma Aldrich triethylamine', 'Trizol (Thermo Fisher Scientific)', 'REPLI-g WTA single-cell kit (Qiagen)', 'AccuStart II PCR supermix (Quantabio)'],
    "11196966": ['Advanced DMEM-F12 (Gibco/Thermo Fisher Scientific)', 'penicillin/streptomycin (Gibco/Thermo Fisher Scientific)', 'HEPES (Invitrogen/Thermo Fisher Scientific)', 'Glutamax (Invitrogen/Thermo Fisher Scientific)', 'N2 (Gibco/Thermo Fisher Scientific)', 'B27 (Gibco/Thermo Fisher Scientific)', 'N-acetylcysteine (Sigma-Aldrich)', 'hygromycin (Invitrogen/Thermo Fisher Scientific)', 'puromycin (InvivoGen)', 'Cultrex RGF basement membrane extract (R&D Systems)', 'cell counter TC20 (BioRad)', 'buprenorphine (Temgesic, Schering-Plough)', 'Isoflurane (Isoflurane Piramal, Piramal)', 'SomnoSuite® Low-Flow Anesthesia System (Kent Scientific Corporation)', 'Seprafilm (Baxter)', 'Vicryl (Ethicon/Johnson&Johnson)', 'BioSpec 94/21 MRI scanner (Bruker Biospin)', 'VEVO 3100 ultrasound (Fujifilm/VisualSonics)', 'Liver Dissociation Kit, mouse (Miltenyi Biotec)', 'Tumor Dissociation Kit, mouse (Miltenyi Biotec)', 'ACK lysis buffer (Gibco/Thermo Fisher Scientific)', 'TruStain FcX PLUS (BioLegend)', 'eBioscience™ Foxp3/Transcription Factor Staining Buffer Set (Invitrogen/Thermo Fisher Scientific)', 'Beckman Coulter Gallios (Beckman Coulter)', 'BD LSRFortessa (Becton Dickinson)', 'Cell Activation Cocktail (BioLegend)', 'Brefeldin A (BioLegend)', 'RNeasy Mini Kit (Qiagen)', 'ProtoScript II First Strand cDNA Synthesis Kit (New England Biolabs)', 'DNeasy Blood & Tissue Kit (Qiagen)', 'QuantiTect SYBR Green RT PCR mastermix (Qiagen)', 'Roche LightCycler 480 (Roche)', 'Pierce BCA-Kit (Thermo Fisher Scientific)', 'Azure Ao Microplate Reader (Azure Biosystems)', 'Laemmli Buffer (BioRad)', 'Mini-Protean TGX precast gel (BioRad)', 'Immobilon Crescendo Western HRP substrate (Merck Millipore)', 'ChemiDoc BioRad imager (BioRad)', 'Vector® TrueVIEW® Autofluorescence Quenching Kit (Biozol)', 'NucBlue™ Fixed Cell Stain ReadyProbes™ (Thermo Fisher Scientific)', 'VECTASHIELD Vibrance Antifade Mounting Medium (Biozol)', 'Zeiss Axio Scan Z.1 Microscope Slide Scanner (Zeiss)', 'GraphPad Prism'],
    "11196847": ['ABI 3130 Genetic Analyzer', 'Megazyme mushroom and yeast β-glucan assay kit'],
    "11196703": ['ENSO 16 sugar substitute', 'glucose powder'],
    "11196417": ['GEO database', 'limma package', 'ggplot2 package', 'clusterProfiler package', 'enrichplot package', 'ImmuCellAI methodology', 'ssGSEA methodology', 'Mantel algorithm', 'STRING 4 online platform', 'Cytoscape V3.9.0 software', 'CytoHubba plugin', 'Random Forest package', 'SVM-RFE approach', 'LASSO model', 'Logistic regression approach', 'ROC curve', 'AUC', 'Limma package', 'miRNet database', 'DGIdb database', 'TransZol Up Plus RNA Kit', 'Nanodrop Spectrophotometer', 'TransScript® One-Step gDNA Removal and cDNA Synthesis SuperMix', 'ChamQ Universal SYBR qPCR Master Mix', 'QuantStudio™ 5 Real-Time PCR System', 'Image J software'],
    "11196393": ['Ficoll gradient', 'ACK lysing buffer', 'BD FACSAria III cell sorter', 'Fc-block (Miltenyi)', 'A1exF5 antibody (Thermo Fisher)', 'DCFDA (Abcam)', 'APC antihuman CD11b(ICRF44) antibody (BioLegend)', 'Brilliant Violet 605™ antihuman HLA-DR(L243) antibody (BioLegend)', 'APC/Cyanine7 antihuman CD14(63D3) antibody (BioLegend)', 'PerCP/Cyanine5.5 antihuman CXCR2(5E8/CXCR2) antibody (BioLegend)', 'FITC antihuman CD177(MEM-166) antibody (BioLegend)', 'Cell Ranger software', 'Seurat toolkit', 'monocle2 package', 'GraphPad Prism software'],
    "11196377": ['Cobb-500 broiler chicks', 'Evonik Nutrition & Care GmbH near-infrared analysis', 'Cobb-Vantress finisher diet', 'UFFDA software', 'A&D Weighing digital weighing scale', 'Konica Minolta Chroma-Meter CR-410', 'SAS 9.4 software'],
    "11196265": ['Polyethylene microplastics (PE-MPs) powder', 'Milli-Q purified water', 'tadalafil (Cilais)', 'commercial fish pellet feed (El Nasr Company)', 'BC-2800 automated technical analyzer (Mindray)', 'spectrophotometer (Biodiagonstic Company)', 'Sigma-Aldrich total antioxidant capacity kits', 'Olympus CH30 microscope', 'Vector Laboratories standard goat serum', 'E-AB-40021 IL-6 antibody (Elabscience Biotechnology Inc)', 'Olympus BX41 microscope', 'Canon digital camera (PowershotA95)', 'SPSS 16.0 program (SPSS 2007, Inc.)'],
    "11196252": ['Metrohm Autolab AUTOLAB-PGSTAT302N instrument', 'NOVA 1.11.1 software', 'EmStat3 potentiostat', 'PSTrace software', 'PalmSens B.V.', 'automated electrode stand M164', 'MTM Anko Instruments', 'Basi® glassy carbon electrode', 'Merck DANO', 'Potassium tetrakis(4-chlorophenyl)borate KTPBCl', 'bis(triphenylphosphoranylidene) ammonium chloride BTPPACl', 'Tetrapropylammonium chloride TPrACl', 'Alfa Aesar', 'POCH Britton-Robinson buffers', '1,2-dichloroethane 1,2-DCE', 'Sigma-Aldrich', 'Chempur', 'Fisher Chemical', 'Polilyte Lab pH electrode', 'Hamilton']
}

In [5]:
def company_identification(list_of_products):
    """Ask GPT-4 Turbo which company manufactures each listed product.

    ``list_of_products`` is interpolated into the module-level ``user_prompt``
    template and sent together with ``system_prompt``.

    Returns the raw text of the first completion choice; no parsing is done
    here.
    """
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=user_prompt.format(list_of_products)),
    ]
    # Sampling is pinned through a near-zero temperature rather than top_p
    # (presumably to keep the answers reproducible between runs).
    completion = openai.chat.completions.create(
        model="gpt-4-turbo",
        presence_penalty=0,
        temperature=1e-16,
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [6]:
# Ask the model for the manufacturers of every article that has extracted
# products, keeping only the answers that parse as a Python literal.
company_dict = {}
for key, value in product_dict.items():
    if not value:
        # No products for this article, so there is nothing to ask.
        continue
    identified_companies = company_identification(value)
    try:
        # literal_eval only evaluates literals; the reply is never executed.
        identified_companies = ast.literal_eval(identified_companies)
    except (SyntaxError, ValueError):
        # SyntaxError: the reply is not valid Python syntax.
        # ValueError: valid syntax that is not a pure literal, e.g. `[null]`.
        # Either way this one article is skipped instead of aborting the run.
        print(f"Syntax Error with ID {key}")
        continue
    company_dict[key] = identified_companies

In [7]:
# Print the identified companies per article, skipping empty answers.
for key, value in company_dict.items():
    if not value:
        continue
    print(f"{key}: {value}")

11198832: ['Shimadzu', 'SCIEX', 'ABI', 'BioTek', 'Beijing Liuyi Instrument Factory', 'Beijing Bijing Biotechnology Co.', 'Thermo Fisher Scientific', 'Illumina', 'Bio-rad Corporation', 'VIANEX S.A.', 'Hangzhou Yuanda Biological Pharmaceutical', 'Shiyao Group', 'Shiyao Group', 'Sichuan Vicchi Biochemical Technology', 'Omega Bio-Tek', 'ABI', 'Servicebio', 'Servicebio']
11198784: ['Cytiva', 'Thermo Fisher Scientific', 'Bio-Rad Laboratories', 'STEMCELL Technologies', 'Becton Dickinson', 'Becton Dickinson', 'BD Biosciences', 'BD Biosciences', 'Harvard Apparatus', 'Sutter Instrument', 'Narishige', 'Molecular Devices', 'Narishige', 'Molecular Devices', 'Molecular Devices', 'Molecular Devices', 'Merck (Sigma-Aldrich)', 'Sapphire Bioscience', 'Sapphire Bioscience', 'Tocris Bioscience', 'Tocris Bioscience', 'Merck (Sigma-Aldrich)', 'IBM', 'GraphPad Software', 'OriginLab']
11197919: ['Ingredion', 'Ingredion', 'Ingredion', 'Salimetrics']
11197476: ['Harvard Apparatus', 'Merck', None, 'JEOL', 'Malve

In [8]:
# Print the product list of every article that has at least one product.
for key in product_dict:
    value = product_dict[key]
    if value:
        print(f"{key}: {value}")

11198832: ['Shimadzu high-performance liquid chromatography (HPLC)', 'SCIEX mass spectrometer (MS)', 'ABI PCR amplification instrument', 'BioTek enzyme labeling instrument', 'Beijing Liuyi Instrument Factory electrophoresis apparatus', 'Beijing Bijing Biotechnology Co. gel imaging system', 'Thermo Fisher Scientific Nanodrop UV quantitative system', 'Illumina Sequencer', 'Bio-rad Corporation fluorescence quantitative PCR instrument', 'VIANEX S.A. vancomycin hydrochloride for injection', 'Hangzhou Yuanda Biological Pharmaceutical bifidobacterium quadruplex viable tablets', 'Shiyao Group Enbipu Pharmaceutical NBP capsules', 'Shiyao Group Enbipu Pharmaceutical NBP reference', 'Sichuan Vicchi Biochemical Technology glipizide reference', 'Omega Bio-Tek Soil DNA Kit', 'ABI Quant-iT PicoGreen dsDNA Assay Kit', 'Servicebio First Strand cDNA Synthesis Kit', 'Servicebio SYBR Green qPCR Master Mix']
11198784: ['Ficoll-Paque Premium density gradient medium', 'Invitrogen trypan blue dye', 'Bio-Rad T

## Mistral

In [9]:
# System prompt: states the model's task, the required answer format and one
# worked example. Each source line is written as its own literal so that the
# line breaks (and the trailing space on the first line) are explicit.
system_prompt = (
    "You are an expert Company Identification system. \n"
    "Your task is to take a list of strings as input and identify the company that manufactures each product mentioned.\n"
    "Strings provided in the list can be misleading; use your internal knowledge to identify company names.\n"
    "If the manufacturer is unclear for any product, return a null value for that string.\n"
    "Here is an example of the output format for a list of strings.\n"
    "\n"
    'List of strings: ["PlayStation 5", "iPhone 15 Plus", "Pencil", "Stainless Steel Tumbler", "Whopper", "Amazon Speed ETD"]\n'
    'Answer: ["Sony", "Apple", None, None, "Burger King", "Bruker"]\n'
    "\n"
    "Only use this output format. Do not output anything besides this output format.\n"
    "Output the manufacturer names in the order they occur in the input list of strings.\n"
)

# User prompt template: the single `{}` placeholder receives the product list.
user_prompt = (
    "Q: Given the list of strings below, identify the companies that manufactures each product.\n"
    "\n"
    "List of strings: {}\n"
    "Answer:\n"
)

In [10]:
# Read the Mistral API key from the environment; indexing os.environ raises
# KeyError at this point if MISTRAL_API_KEY is not set.
api_key = os.environ["MISTRAL_API_KEY"]

In [11]:
def company_identification(input_text):
    """Ask Mistral Large which company manufactures each listed product.

    ``input_text`` is interpolated into the module-level ``user_prompt``
    template and sent together with ``system_prompt``. A new ``MistralClient``
    is built from the module-level ``api_key`` on every call.

    Returns the raw text of the first response choice; no parsing is done
    here.
    """
    mistral = MistralClient(api_key=api_key)
    conversation = [
        ChatMessage(role="system", content=system_prompt),
        ChatMessage(role="user", content=user_prompt.format(input_text)),
    ]
    reply = mistral.chat(
        model="mistral-large-latest",
        temperature=1e-16,
        messages=conversation,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content

In [12]:
# Product/tool mentions per article, keyed by article ID (as a string).
# An empty list means no products are recorded for that article.
# Every string must stay on a single physical line: a raw line break inside a
# quoted literal is a SyntaxError.
product_dict = {
    "11198919": [],
    "11198832": ["Shimadzu high-performance liquid chromatography (HPLC)", "AB SCIEX mass spectrometer (MS)", "ABI PCR amplification instrument", "BioTek enzyme labeling instrument", "Beijing Liuyi electrophoresis apparatus", "Beijing Bijing Biotechnology Co., LTD. gel imaging system", "Thermo Fisher Scientific Nanodrop UV quantitative system", "Illumina Sequencer", "Bio-rad Corporation fluorescence quantitative PCR instrument", "vancomycin hydrochloride for injection (VIANEX S.A.)", "bifidobacterium quadruplex viable tablets (Hangzhou Yuanda Biological Pharmaceutical)", "NBP capsules (Shiyao Group Enbipu Pharmaceutical)", "NBP reference (Shiyao Group Enbipu Pharmaceutical)", "glipizide reference (Sichuan Vicchi Biochemical Technology)", "Omega Bio-Tek Soil DNA Kit", "ABI Quant-iT PicoGreen dsDNA Assay Kit", "Servicebio First Strand cDNA Synthesis Kit", "Servicebio SYBR Green qPCR Master Mix"],
    "11198784": [],
    "11198029": [],
    "11197919": ["HI-MAIZE® 260 starch", "VERSAFIBE™ 1490 starch", "AMIOCA™ TF starch", "Tanita SC-240 Total Body Composition Analyzer", "Automated Self-Administered 24-hour (ASA24) Dietary Assessment Tool 2020", "QIAamp 96 DNA Blood Kit", "QIAamp Blood Mini Kit", "QIAamp Investigator Kit", "iTaq™ Universal SYBR® Green Supermix", "Salimetrics Salivary Alpha-Amylase Enzymatic Kit", "DNeasy PowerSoil 96 HTP Kit", "Classic++™ Hot Start Taq DNA Polymerase Master Mix", "Mag-Bind TotalPure", "QIIME 2", "DADA2", "mafft", "fasttree", "Greengenes 13_8 99% OTUs reference sequences", "RStudio version 4.2.1", "ComplexHeatmap", "circlize", "MaAsLin2 (Microbiome Multivariable Association with Linear Models 2)", "glmnet"],
    "11197476": ["CXB@Lipo (C@L)", "chloroform", "HSPC", "LiposoFast (Avestin, Canada)", "polycarbonate membrane filters (Merck Millipore Ltd., USA)", "ultrafiltration centrifuge tube", "ECN", "glycol chitosan", "transmission electron microscopy (TEM; JEM-1400, Japan)", "Malvern Nano ZS ZEN3600 (Malvern, UK)", "UV-VIS (Thermo Fisher Scientific, USA)", "ICG@ECN (I@ECN)", "flow cytometry (CytoFLEX, Beckman, USA)", "UV‒VIS", "pGEX-Pvhb-Lysis", "cloning kit (Beyotime Biotechnology Co. Ltd., Shanghai, China)", "ICG@Liposome (I@L)", "confocal laser scanning microscopy (CLSM) (Nikon, Japan)", "CCK-8 assay kit (Beyotime Biotechnology Co. Ltd., Shanghai, China)", "Western blot (WB) analysis", "FITC-conjugated anti-COX-2 antibody (Affinity, Jiangsu, China)", "in vivo images system (IVIS)", "optimal cutting temperature (OCT) compound", "CT26 tumor model", "FITC-conjugated anti-mouse CD3", "FITC-conjugated anti-mouse CD4", "PE-conjugated anti-mouse CD8 (Biolegend Inc., USA)", "TUNEL and H&E staining", "Ki67", "C@ECN-PL", "aspartate aminotransferase (AST)", "alanine aminotransferase (ALT)", "blood urea nitrogen (BUN)", "creatinine (CRE)", "assay kits (Jiancheng Bioengineering Institute, Nanjing, China)", "H&E staining analysis", "ImageJ 6.0"],
    "11197378": ["EXCEL", "IBM SPSS 20", "R language"],
    "11197282": [],
    "11197204": ["Gene Expression Omnibus (GEO)", "quantitative PCR method", "UK National Research Ethics Committee (NRES 16/SS/0172)", "BH false discovery rate correction", "TRIzol (Life Technologies)", "TaqMan™ Gene Expression Assay (FAM) from Life Technologies Ltd", "Easy Oligos from Merck Life Science UK Ltd", "iTaq Universal One-Step or iTaq™ Universal SYBR® Green One-Step Kit from BioRad", "GraphPad PRISM", "ShinyGO", "Enrichr", "Tools"],
    "11197185": ["ZIKV", "Vero cell culture (ATCC)", "C636 cell culture (ATCC)", "Tetra Pond Koi growth feed", "defibrinated sheep blood", "sodium bicarbonate", "Hemotek membrane feeding system (Discovery Workshops, Accrington, UK)", "triethylamine (Sigma Aldrich, St. Louis, MO, USA)", "filter sterilized fetal bovine serum", "Mosquito Diluent [MD]", "penicillin/streptomycin", "gentamicin", "Fungizone (Sigma Aldrich, St. Louis, MO, USA)", "phosphate-buffered Saline (PBS) (Sigma Aldrich, St. Louis, MO, USA)", "Illumina MiSeq", "REPLI-g whole transcriptome amplification (WTA) single-cell kit (Qiagen, Hilden, Germany)", "Illumina barcoded 16S primer set", "AccuStart II PCR supermix (Quantabio, Beverly, MA, USA)", "QIITA", "MicrobiomeAnalyst", "GreenGenes 16S reference database", "BIOM-formatted OTU table", "NCBI GenBank Short Read Archives"],
    "11196966": ["Advanced DMEM-F12 (Gibco/Thermo Fisher Scientific)", "penicillin (Gibco/Thermo Fisher Scientific)", "streptomycin (Gibco/Thermo Fisher Scientific)", "HEPES (Invitrogen/Thermo Fisher Scientific)", "Glutamax (Invitrogen/Thermo Fisher Scientific)", "N2 (Gibco/Thermo Fisher Scientific)", "B27 (Gibco/Thermo Fisher Scientific)", "N-acetylcysteine (Sigma-Aldrich)", "hygromycin (Invitrogen/Thermo Fisher Scientific)", "puromycin (InvivoGen)", "Cultrex RGF basement membrane extract (BME) Type 2 (R&D Systems)", "cell counter TC20 (BioRad)", "collagen I/5x neutralization buffer", "buprenorphine (Temgesic, Schering-Plough)", "Isoflurane Piramal (Piramal)", "SomnoSuite® Low-Flow Anesthesia System (Kent Scientific Corporation)", "Seprafilm (Baxter)", "6-0 Vicryl (Ethicon/Johnson&Johnson) suture", "9.4 tesla small bore animal scanner (BioSpec 94/21, Bruker Biospin)", "dedicated mouse quadrature-resonator (Bruker)", "VEVO 3100 (Fujifilm/VisualSonics) ultrasound", "small animal transducer", "RNAprotect buffer (Qiagen, Netherlands)", "Liver Dissociation Kit, mouse (Miltenyi Biotec)", "Tumor Dissociation Kit, mouse (Miltenyi Biotec)", "ACK lysis buffer (Gibco/Thermo Fisher Scientific)", "TruStain FcX PLUS (anti-mouse CD16/32) antibody (BioLegend)", "FACS buffer (PBS + 0,5% BSA)", "1x Fixation/Permeabilization buffer", "eBioscience™ Foxp3/Transcription Factor Staining Buffer Set (Invitrogen/Thermo Fisher Scientific)", "FACS antibodies (BioLegend)", "Beckman Coulter Gallios (Beckman Coulter)", "BD LSRFortessa (Becton Dickinson)", "Kaluza Software (Beckman Coulter)", "RPMI-Medium (+ 1% Pen/Strep + 1% HEPES + 5 µl beta-mercaptoethanol)", "Cell Activation Cocktail (BioLegend)", "Brefeldin A (BioLegend)", "RLT buffer (Qiagen, Venlo, Netherlands)", "RNeasy Mini Kit (Qiagen)", "NanoDrop™ spectrophotometer (Thermo Fisher Scientific)", "ProtoScript II First Strand cDNA Synthesis Kit (New England Biolabs)", "QuantiTect SYBR Green RT PCR mastermix (Qiagen)", "DNeasy Blood & Tissue Kit (Qiagen)", "primer sequences (purchased from Eurofins Genomics)", "Roche LightCycler 480 (Roche)", "QuantiFast SYBR® Green PCR Kit (Qiagen)", "Bifidobacterium actinocoloniiforme (DSMZ-Deutsche Sammlung von Mikroorganismen und Zellkulturen GmbH, DSM22766)"],
    "11196847": ["Potato dextrose agar (PDA) medium", "Potato dextrose broth (PDB)", "ABI 3130 Genetic Analyzer", "Taq DNA polymerase Assay Buffer", "ABI 3130xl gel sequencing", "Phylogenetic Tree Builder", "Ethanol (80%, v/v)", "Whatman paper No. 4", "Lowry's assay", "Folin-Ciocalteu reagent", "Bovine Serum Albumin (BSA)", "Anthrone reagent", "Anthrone technique", "Quercetin", "Folin-Ciocalteu method", "Gallic acid equivalent (GAE)", "Aescin", "Ferric chloride", "Mg(CH3COO)2 in CH3OH", "Rhein", "NH4OH", "Liebermann-Burchard reagent", "DPPH radicals", "Liebermann-Burchard reagent", "β-glucan assay kit (Megazyme Int., Dublin, Ireland)", "High-performance liquid chromatography (PDA)", "RP C-18 column", "Quercetin", "Sesquiterpene glycosides", "Ergosterol", "RidentinB", "Statistical Package for the Social Sciences (SPSS)"],
    "11196703": ["ENSO 16 sugar substitute", "glucose in water solution", "NEOH by Alpha Republic GmbH", "STAMAG Stadlauer Malzfabrik GesmbH", "Special Ingredients Ltd glucose powder", "GraphPad Prism 9"],
    "11196440": [],
    "11196417": ["GEO database", "limma package", "R software", "ggplot2 package", "GSEA official website", "MSigDB database", "clusterProfiler package", "enrichplot package", "ImmuCellAI", "ssGSEA", "STRING 4 online platform", "Cytoscape V3.9.0 software", "CytoHubba plugin", "Random Forest algorithm", "SVM-RFE", "LASSO model", "Logistic regression", "Receiver Operating Characteristic (ROC) curve", "Area Under the Curve (AUC)", "miRNet database", "Drug-Gene Interaction database (DGIdb)", "ggplot2 package", "TransZol Up Plus RNA Kit", "Nanodrop Spectrophotometer", "TransScript® One-Step gDNA Removal and cDNA Synthesis SuperMix", "ChamQ Universal SYBR qPCR Master Mix", "QuantStudio™ 5 Real-Time PCR System", "Image J software"],
    "11196393": ["Ficoll gradient", "ACK lysing buffer", "RPMI1640 supplemented with 20% FBS", "BD FACSAria III cell sorter", "Fc-block (Miltenyi)", "surface antibodies against CD11b, HLA-DR, CD14, CD177, and CXCR2", "anti-Arg1 (A1exF5; Thermo Fisher)", "DCFDA (Abcam)", "BD FACSCanto II", "APC antihuman CD11b(ICRF44)", "Brilliant Violet 605™ antihuman HLA-DR(L243)", "APC/Cyanine7 antihuman CD14(63D3)", "PerCP/Cyanine5.5 antihuman CXCR2(5E8/CXCR2)", "FITC antihuman CD177(MEM-166)", "Cell Ranger (version 7.0)", "Seurat (version 4.0.1)", "monocle2 (version 2.26.0)", "GraphPad Prism 8.0"],
    "11196377": ["Cobb-500 chicks", "Evonik Co. near-infrared analysis", "UFFDA software", "Cobb-Vantress commercial management guide", "Vitamin premix", "Mineral premix", "SAS 9.4 software"],
    "11196265": ["Polyethylene microplastics (PE-MPs) powder", "Purified water (Milli-Q)", "Tadalafil (Cilais) substance", "Sildenafil", "Commercial fish pellet feed (containing 30% protein and 12% lipid; El Nasr Company, Egypt)", "Automated technical analyzer (BC-2800 from Mindray)", "Spectrophotometer (in a wavelength range of 340-546 nm; Biodiagonstic Company, Egypt)", "Kits for TAC (total antioxidant capacity) determination (Sigma-Aldrich, USA)", "Olympus CH30 microscope", "IL-6 antibody (E-AB-40021, Elabscience Biotechnology Inc, USA)", "Streptavidin-biotin-peroxidase pack", "Mayer's hematoxylin", "SPSS 16.0 program for Windows (SPSS 2007, Inc., IL, USA)", "Canon digital camera (PowershotA95)"],
    "11196252": ["AUTOLAB-PGSTAT302N instrument", "NOVA 1.11.1 software", "platinum (Pt) wires", "silver/silver chloride wires (Ag/AgCl)", "EmStat3 potentiostat", "PSTrace software", "automated electrode stand (M164, MTM Anko Instruments)", "glassy carbon electrode (GCE)", "silver/silver chloride electrode (Ag/AgCl/3.0 mol L−1 KCl)", "platinum wire (Pt, 99.99%, The Mint of Poland)", "DANO (Merck)", "hydrochloric acid (HCl)", "Potassium tetrakis(4-chlorophenyl)borate (KTPBCl, Merck)", "bis(triphenylphosphoranylidene) ammonium chloride (BTPPACl, Merck)", "bis(triphenylphosphoranylidene)ammonium tetrakis(4-chlorophenyl)borate (BTPPATPBCl)", "Tetrapropylammonium chloride (TPrACl, Alfa Aesar)", "Britton-Robinson buffers (BRBs)", "1,2-dichloroethane (1,2-DCE, POCH)", "citric acid (Sigma-Aldrich)", "galactose (Chempur)", "lactose (Fisher Chemical)", "glucose (Alfa Aesar)", "calcium chloride (Sigma-Aldrich)", "potassium chloride (Chempur)", "magnesium chloride (Fisher Chemical)", "iron(III) chloride (Alfa Aesar)", "sodium lactate (Sigma-Aldrich)", "orthophosphoric acid (V) (Alfa Aesar)", "combined pH electrode (Polilyte Lab, Hamilton, Switzerland)"]
}

In [13]:
# Print the model's raw (unparsed) answer for every article.
for key, value in product_dict.items():
    # An empty product list needs no API request: skip the call and print the
    # same "[]" so that every article ID still appears in the output.
    identified_companies = company_identification(value) if value else "[]"
    print(f"{key}: {identified_companies}")

11198919: []
11198832: ["Shimadzu", "AB SCIEX", "Applied Biosystems (ABI)", "BioTek", None, None, "Thermo Fisher Scientific", "Illumina", "Bio-Rad", "VIANEX S.A.", "Hangzhou Yuanda Biological Pharmaceutical", "Shiyao Group Enbipu Pharmaceutical", "Shiyao Group Enbipu Pharmaceutical", "Sichuan Vicchi Biochemical Technology", "Omega Bio-Tek", "Applied Biosystems (ABI)", None, "Servicebio"]
11198784: []
11198029: []
11197919: ["Ingredion", "Ingredion", "Ingredion", "Tanita", None, "QIAGEN", "QIAGEN", "QIAGEN", "Bio-Rad Laboratories", "Salimetrics", "QIAGEN", "Bio-Rad Laboratories", "Omega Bio-tek", None, None, None, None, None, "RStudio", None, None, None, None]
11197476: ["Avestin", None, None, "Avestin", "Merck Millipore Ltd.", None, None, None, "JEM-1400", "Malvern", "Thermo Fisher Scientific", None, "Beckman", None, None, "Beyotime Biotechnology Co. Ltd.", None, "Nikon", "Beyotime Biotechnology Co. Ltd.", None, "Affinity", None, None, "Biolegend Inc.", None, None, None, None, "Biolege

In [14]:
# Dump the product list of every article, including the empty ones.
for key in product_dict:
    value = product_dict[key]
    print(f"{key}: {value}")

11198919: []
11198832: ['Shimadzu high-performance liquid chromatography (HPLC)', 'AB SCIEX mass spectrometer (MS)', 'ABI PCR amplification instrument', 'BioTek enzyme labeling instrument', 'Beijing Liuyi electrophoresis apparatus', 'Beijing Bijing Biotechnology Co., LTD. gel imaging system', 'Thermo Fisher Scientific Nanodrop UV quantitative system', 'Illumina Sequencer', 'Bio-rad Corporation fluorescence quantitative PCR instrument', 'vancomycin hydrochloride for injection (VIANEX S.A.)', 'bifidobacterium quadruplex viable tablets (Hangzhou Yuanda Biological Pharmaceutical)', 'NBP capsules (Shiyao Group Enbipu Pharmaceutical)', 'NBP reference (Shiyao Group Enbipu Pharmaceutical)', 'glipizide reference (Sichuan Vicchi Biochemical Technology)', 'Omega Bio-Tek Soil DNA Kit', 'ABI Quant-iT PicoGreen dsDNA Assay Kit', 'Servicebio First Strand cDNA Synthesis Kit', 'Servicebio SYBR Green qPCR Master Mix']
11198784: []
11198029: []
11197919: ['HI-MAIZE® 260 starch', 'VERSAFIBE™ 1490 starch',

# Extracting Company/Brand Names

## OpenAI

In [7]:
# System prompt: states the model's task, the required answer format and one
# worked example. Each source line is written as its own literal so that the
# line breaks (and the trailing space on the first line) are explicit.
system_prompt = (
    "You are an expert Company Identification system. \n"
    "Your task is to take a list of strings as input and identify the company that manufactures each product mentioned.\n"
    "Strings provided in the list can be misleading; use your internal knowledge to identify company names.\n"
    "If the manufacturer is unclear for any product, return a null value for that string.\n"
    "Here is an example of the output format for a list of strings.\n"
    "\n"
    'List of strings: ["PlayStation 5", "iPhone 15 Plus", "Pencil", "Stainless Steel Tumbler", "Whopper", "Amazon Speed ETD"]\n'
    'Answer: ["Sony", "Apple", None, None, "Burger King", "Bruker"]\n'
    "\n"
    "Only use this output format. Do not output anything besides this output format.\n"
    "Output the manufacturer names in the order they occur in the input list of strings.\n"
)

# User prompt template: the single `{}` placeholder receives the product list.
user_prompt = (
    "Q: Given the list of strings below, identify the companies that manufactures each product.\n"
    "\n"
    "List of strings: {}\n"
    "Answer:\n"
)

In [8]:
def company_identification(list_of_products):
    """Ask GPT-4 Turbo which company manufactures each listed product.

    ``list_of_products`` is interpolated into the module-level ``user_prompt``
    template and sent together with ``system_prompt``.

    Returns the raw text of the first completion choice; no parsing is done
    here.
    """
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=user_prompt.format(list_of_products)),
    ]
    # Sampling is pinned through a near-zero temperature rather than top_p
    # (presumably to keep the answers reproducible between runs).
    completion = openai.chat.completions.create(
        model="gpt-4-turbo",
        presence_penalty=0,
        temperature=1e-16,
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [9]:
# Ask the model for the manufacturers of every article that has extracted
# products, keeping only the answers that parse as a Python literal.
company_dict = {}
for key, value in product_dict.items():
    if not value:
        # No products for this article, so there is nothing to ask.
        continue
    identified_companies = company_identification(value)
    try:
        # literal_eval only evaluates literals; the reply is never executed.
        identified_companies = ast.literal_eval(identified_companies)
    except (SyntaxError, ValueError):
        # SyntaxError: the reply is not valid Python syntax.
        # ValueError: valid syntax that is not a pure literal, e.g. `[null]`.
        # Either way this one article is skipped instead of aborting the run.
        print(f"Syntax Error with ID {key}")
        continue
    company_dict[key] = identified_companies

In [10]:
# Print the identified companies per article, skipping empty answers.
for key, value in company_dict.items():
    if not value:
        continue
    print(f"{key}: {value}")

11198832: ['Shimadzu', 'SCIEX', 'ABI', 'BioTek', 'Beijing Liuyi Instrument Factory', 'Beijing Bijing Biotechnology Co., LTD.', 'Thermo Fisher Scientific', 'Illumina', 'Bio-rad Corporation', 'VIANEX S.A.', 'Hangzhou Yuanda Biological Pharmaceutical', 'Shiyao Group Enbipu Pharmaceutical', 'Shiyao Group Enbipu Pharmaceutical', 'Sichuan Vicchi Biochemical Technology', 'Omega Bio-Tek', 'ABI', 'Servicebio', 'Servicebio']
11198784: ['Cytiva', None, 'Bio-Rad', 'STEMCELL Technologies', None, None, None, 'BD Biosciences', None, 'Sutter Instrument', 'Narishige', None, 'Molecular Devices', 'Molecular Devices', 'Molecular Devices', None, None, None]
11197919: ['Ingredion', 'Ingredion', 'Ingredion', 'Tanita', None, 'Salimetrics', 'Waters Corporation', 'Qiagen', None, None, 'Qiagen', 'Qiagen', 'Qiagen', None]
11197476: ['Bruker', 'Merck', 'Malvern Instruments', 'Thermo Fisher Scientific', 'Beckman Coulter', 'JEOL', 'Nikon', None, None, None, 'BioLegend']
11197382: ['Roche']
11197204: ['Thermo Fisher 

In [11]:
# Print the product list of every article that has at least one product.
for key in product_dict:
    value = product_dict[key]
    if value:
        print(f"{key}: {value}")

11198832: ['high-performance liquid chromatography (HPLC) (Shimadzu, LC-20AD)', 'SCIEX mass spectrometer (MS) (AB SCIEX, API 4000+)', 'PCR amplification instrument (ABI, 2720)', 'enzyme labeling instrument (BioTek, FLX800T)', 'electrophoresis apparatus (Beijing Liuyi Instrument Factory, DYY-6C)', 'gel imaging system (Beijing Bijing Biotechnology Co., LTD., BG-gdsAUTO130)', 'Nanodrop UV quantitative system (Thermo Fisher Scientific, NC2000)', 'Sequencer (Illumina, Novaseq6000)', 'fluorescence quantitative PCR instrument (Bio-rad Corporation, CFX)', 'vancomycin hydrochloride for injection (VIANEX S.A.)', 'bifidobacterium quadruplex viable tablets (Hangzhou Yuanda Biological Pharmaceutical)', 'NBP capsules (Shiyao Group Enbipu Pharmaceutical)', 'NBP reference (Shiyao Group Enbipu Pharmaceutical)', 'glipizide reference (Sichuan Vicchi Biochemical Technology)', 'Soil DNA Kit (Omega Bio-Tek)', 'Quant-iT PicoGreen dsDNA Assay Kit (ABI)', 'First Strand cDNA Synthesis Kit (Servicebio)', 'SYBR G

## Mistral

In [7]:
# System prompt: states the model's task, the required answer format and one
# worked example. Each source line is written as its own literal so that the
# line breaks (and the trailing space on the first line) are explicit.
system_prompt = (
    "You are an expert Company Identification system. \n"
    "Your task is to take a list of strings as input and identify the company that manufactures each product mentioned.\n"
    "Strings provided in the list can be misleading; use your internal knowledge to identify company names.\n"
    "If the manufacturer is unclear for any product, return a null value for that string.\n"
    "Here is an example of the output format for a list of strings.\n"
    "\n"
    'List of strings: ["PlayStation 5", "iPhone 15 Plus", "Pencil", "Stainless Steel Tumbler", "Whopper", "Amazon Speed ETD"]\n'
    'Answer: ["Sony", "Apple", None, None, "Burger King", "Bruker"]\n'
    "\n"
    "Only use this output format. Do not output anything besides this output format.\n"
    "Output the manufacturer names in the order they occur in the input list of strings.\n"
)

# User prompt template: the single `{}` placeholder receives the product list.
user_prompt = (
    "Q: Given the list of strings below, identify the companies that manufactures each product.\n"
    "\n"
    "List of strings: {}\n"
    "Answer:\n"
)

In [17]:
def company_identification(input_text):
    """Ask Mistral Large which company manufactures each listed product.

    ``input_text`` is interpolated into the module-level ``user_prompt``
    template and sent together with ``system_prompt``. A new ``MistralClient``
    is built from the module-level ``api_key`` on every call.

    Returns the raw text of the first response choice; no parsing is done
    here.
    """
    mistral = MistralClient(api_key=api_key)
    conversation = [
        ChatMessage(role="system", content=system_prompt),
        ChatMessage(role="user", content=user_prompt.format(input_text)),
    ]
    reply = mistral.chat(
        model="mistral-large-latest",
        temperature=1e-16,
        messages=conversation,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content

In [18]:
# Ask the model for the manufacturers of every article that has extracted
# products, keeping only the answers that parse as a Python literal.
# A failed parse affects only its own article: there is no error flag carried
# over from one iteration to the next.
company_dict = {}
for key, value in product_dict.items():
    if not value:
        # No products for this article, so there is nothing to ask.
        continue
    identified_companies = company_identification(value)
    try:
        # literal_eval only evaluates literals; the reply is never executed.
        identified_companies = ast.literal_eval(identified_companies)
    except (SyntaxError, ValueError):
        # SyntaxError: the reply is not valid Python syntax.
        # ValueError: valid syntax that is not a pure literal, e.g. `[null]`.
        print(f"Syntax Error with ID {key}")
        continue
    company_dict[key] = identified_companies

In [19]:
# Print the identified companies per article, skipping empty answers.
for key, value in company_dict.items():
    if not value:
        continue
    print(f"{key}: {value}")

11197919: ['Ingredion', 'Ingredion', 'Ingredion', 'Tanita', None, 'QIAGEN', 'QIAGEN', 'QIAGEN', 'Bio-Rad Laboratories', 'Salimetrics', 'QIAGEN', 'Bio-Rad Laboratories', None, None, None, None, None, None]
11197476: ['None', 'None', 'None', 'Avestin', 'Merck Millipore Ltd.', 'None', 'None', 'None', 'JEOL Ltd.', 'Malvern Instruments Ltd.', 'Thermo Fisher Scientific', 'None', 'Beckman Coulter Inc.', 'None', 'None', 'Beyotime Biotechnology Co. Ltd.', 'None', 'Nikon Corporation', 'Beyotime Biotechnology Co. Ltd.', 'None', 'Affinity Biosciences', 'None', 'PerkinElmer Inc.', 'None', 'Biolegend Inc.', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None']
11197204: ['Thermo Fisher Scientific', 'Thermo Fisher Scientific', None, 'Bio-Rad Laboratories', 'Bio-Rad Laboratories', None, 'GraphPad Software']
11197185: ['Tetra', 'Hemotek', 'Invitrogen', 'QIAGEN', 'Illumina', 'QuantaBio', None, 'Micr

### Mistral with Article 11198832

In [38]:
# Prints the article ID followed by a hard-coded string; no model is called here.
# The string is not a Python list literal (each quoted name is followed by a
# parenthesised group), so ast.literal_eval cannot parse it.
# NOTE(review): presumably a pasted Mistral reply for this article — confirm.
print('11198832', '["high-performance liquid chromatography (HPLC)" (Shimadzu, LC-20AD), "SCIEX mass spectrometer (MS)" (AB SCIEX, API 4000+), "PCR amplification instrument" (ABI, 2720), "enzyme labeling instrument" (BioTek, FLX800T), "electrophoresis apparatus" (Beijing Liuyi Instrument Factory, DYY-6C), "gel imaging system" (Beijing Bijing Biotechnology Co., LTD., BG-gdsAUTO130), "Nanodrop UV quantitative system" (Thermo Fisher Scientific, NC2000), "Sequencer" (Illumina, Novaseq6000), "fluorescence quantitative PCR instrument" (Bio-rad Corporation, CFX), "vancomycin hydrochloride for injection" (VIANEX S.A.), "bifidobacterium quadruplex viable tablets" (Hangzhou Yuanda Biological Pharmaceutical), "NBP capsules" (Shiyao Group Enbipu Pharmaceutical), "NBP reference" (Shiyao Group Enbipu Pharmaceutical.), "glipizide reference" (Sichuan Vicchi Biochemical Technology), "Soil DNA Kit" (Omega Bio-Tek), "Quant-iT PicoGreen dsDNA Assay Kit" (ABI), "First Strand cDNA Synthesis Kit" (Servicebio), "SYBR Green qPCR Master Mix" (Servicebio)]')

11198832 ["high-performance liquid chromatography (HPLC)" (Shimadzu, LC-20AD), "SCIEX mass spectrometer (MS)" (AB SCIEX, API 4000+), "PCR amplification instrument" (ABI, 2720), "enzyme labeling instrument" (BioTek, FLX800T), "electrophoresis apparatus" (Beijing Liuyi Instrument Factory, DYY-6C), "gel imaging system" (Beijing Bijing Biotechnology Co., LTD., BG-gdsAUTO130), "Nanodrop UV quantitative system" (Thermo Fisher Scientific, NC2000), "Sequencer" (Illumina, Novaseq6000), "fluorescence quantitative PCR instrument" (Bio-rad Corporation, CFX), "vancomycin hydrochloride for injection" (VIANEX S.A.), "bifidobacterium quadruplex viable tablets" (Hangzhou Yuanda Biological Pharmaceutical), "NBP capsules" (Shiyao Group Enbipu Pharmaceutical), "NBP reference" (Shiyao Group Enbipu Pharmaceutical.), "glipizide reference" (Sichuan Vicchi Biochemical Technology), "Soil DNA Kit" (Omega Bio-Tek), "Quant-iT PicoGreen dsDNA Assay Kit" (ABI), "First Strand cDNA Synthesis Kit" (Servicebio), "SYBR G

In [20]:
# Print the product list of every article that has at least one product.
for key in product_dict:
    value = product_dict[key]
    if value:
        print(f"{key}: {value}")

11197919: ['HI-MAIZE® 260 starch', 'VERSAFIBE™ 1490 starch', 'AMIOCA™ TF starch', 'Tanita SC-240 Total Body Composition Analyzer', 'Automated Self-Administered 24-hour (ASA24) Dietary Assessment Tool 2020', 'QIAamp 96 DNA Blood Kit', 'QIAamp Blood Mini Kit', 'QIAamp Investigator Kit', 'iTaq™ Universal SYBR® Green Supermix', 'Salimetrics Salivary Alpha-Amylase Enzymatic Kit', 'DNeasy PowerSoil 96 HTP Kit', 'Classic++™ Hot Start Taq DNA Polymerase Master Mix', 'Mag-Bind TotalPure', 'QIIME 2', 'DADA2', 'mafft', 'fasttree']
11197476: ['CXB@Lipo', 'chloroform', 'rotary evaporator', 'liposome extrude (LiposoFast, Avestin, Canada)', 'polycarbonate membrane filters (Merck Millipore Ltd., USA)', 'ultrafiltration centrifuge tube', 'ECN', 'glycol chitosan', 'transmission electron microscopy (TEM; JEM-1400, Japan)', 'Malvern Nano ZS ZEN3600 (Malvern, UK)', 'UV-VIS (Thermo Fisher Scientific, USA)', 'ICG@ECN (I@ECN)', 'flow cytometry (CytoFLEX, Beckman, USA)', 'UV‒VIS', 'pGEX-Pvhb-Lysis', 'cloning k

## spaCy

In [35]:
# Print the identified companies per article, skipping empty answers.
for key, value in company_dict.items():
    if not value:
        continue
    print(f"{key}: {value}")

11198832: ['Thermo Fisher Scientific', 'Illumina', 'Thermo Fisher Scientific', 'LI-COR Biosciences']
11198784: ['Cytiva', 'Bio-Rad', 'STEMCELL Technologies', 'Narishige', 'Molecular Devices']
11197919: ['Tanita', 'Qiagen', 'Qiagen', 'Bio-Rad', 'Qiagen', 'BioSpec Products']
11197476: ['Malvern Panalytical']
11197204: ['Thermo Fisher Scientific', None, 'Bio-Rad', 'Bio-Rad']
11197185: ['Illumina', None, 'Thermo Fisher Scientific', None, 'Illumina']
11196966: ['Piramal', 'Kent Scientific', None, None, 'Qiagen', 'New England Biolabs', 'Qiagen', 'Qiagen', 'Thermo Fisher Scientific', 'Azure Biosystems', 'Zeiss', 'Vector Laboratories', 'Thermo Fisher Scientific', 'Vector Laboratories']
11196847: ['Thermo Fisher Scientific', None]
11196703: [None, None]
11196417: ['TransGen Biotech', 'Thermo Fisher Scientific', 'TransGen Biotech', 'Vazyme', 'Thermo Fisher Scientific']
11196393: ['Becton Dickinson', None, 'GraphPad Software']
11196377: ['Konica Minolta']
11196265: ['Canon']
11196252: ['Metrohm',

In [36]:
# Print the cleaned entities of every article that has at least one entity.
for key in cleaned_entity_dict:
    value = cleaned_entity_dict[key]
    if value:
        print(f"{key}: {value}")

11198832: ['Nanodrop UV', 'Novaseq6000', 'Quant-iT PicoGreen dsDNA Assay Kit', 'The Odyssey CLx Imaging System']
11198784: ['Ficoll-Paque Premium', 'TC20 Automated cell counter', 'EasySep Negative Human NK Cell Isolation Kit', 'Micro Forge MF-900', 'Axopatch 200B']
11197919: ['Tanita SC-240 Total Body Composition Analyzer', 'QIAamp 96 DNA Blood Kit', 'QIAamp Investigator Kit', 'QX100 Droplet Digital PCR System', 'DNeasy PowerSoil 96 HTP Kit', 'BioSpec 1001 Mini-Beadbeater-96']
11197476: ['Malvern Nano ZS ZEN3600']
11197204: ['TRIzol', 'Easy Oligos', 'iTaq Universal One-Step', 'iTaq™ Universal SYBR® Green One-Step']
11197185: ['MiSeq', 'Hemotek', 'Trizol', 'AccuStart II', 'MiSeq 500']
11196966: ['Isoflurane Piramal', 'SomnoSuite® Low-Flow Anesthesia System', 'Liver Dissociation Kit', 'Tumor Dissociation Kit', 'RNeasy Mini Kit', 'ProtoScript II First Strand cDNA Synthesis Kit', 'DNeasy Blood & Tissue Kit', 'QuantiFast SYBR® Green PCR Kit', 'Pierce BCA-Kit', 'Azure Ao Microplate Reader', 