In [4]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

pd.set_option("display.max_columns", None)


In [5]:
nanozymes_df = pd.read_excel("nanozymes.xlsx")


def extract_id_from_filename(filename):
    return filename[:-3].upper()


def extract_id_from_link(link):
    return link.split("/")[-1].upper()


nanozymes_df["article_id"] = nanozymes_df["link"].apply(extract_id_from_link)

column_rename_map = {
    "Syngony": "syngony",
    "length, nm": "length",
    "width, nm": "width",
    "depth, nm": "depth",
    "pol": "polymer",
    "surf": "surfactants",
    "Mw(coat), g/mol": "molar_mass",
    "Km, mM": "km",
    "Vmax, mM/s": "v_max",
    "ReactionType": "reaction_type",
    "C min, mM": "c_min",
    "C max, mM": "c_max",
    "C(const), mM": "c_const",
    "Ccat(mg/mL)": "ccat",
    "ph": "ph",
    "temp, °C": "temperature",
}

nanozymes_df.rename(columns=column_rename_map, inplace=True)

num_cols = [
    "syngony",
    "length",
    "width",
    "depth",
    "molar_mass",
    "km",
    "v_max",
    "c_min",
    "c_max",
    "c_const",
    "ccat",
    "ph",
    "temperature",
]
for col in num_cols:
    nanozymes_df[col] = pd.to_numeric(nanozymes_df[col], errors="coerce")

In [6]:
nanozymes_df

Unnamed: 0,formula,activity,syngony,length,width,depth,surface,polymer,surfactants,molar_mass,km,v_max,reaction_type,c_min,c_max,c_const,ccat,ph,temperature,link,article_id
0,CoFe2O4,peroxidase,7.0,4.1,4.1,4.1,naked,oleic acid,0,282.47,0.00645,1.376300,TMB + H2O2,1.500,1.50,100.00,0.000026,4.0,37.0,https://doi.org/10.1039/C4RA15675G,C4RA15675G
1,CoFe2O4,peroxidase,7.0,13.8,13.8,13.8,naked,oleic acid,0,282.47,0.05537,0.264300,TMB + H2O2,1.500,1.50,100.00,0.000026,4.0,37.0,https://doi.org/10.1039/C4RA15675G,C4RA15675G
2,CoFe2O4,peroxidase,7.0,4.1,4.1,4.1,naked,oleic acid,0,282.47,0.03551,8.363000,H2O2 + TMB,0.500,25.00,100.00,0.000026,4.0,37.0,https://doi.org/10.1039/C4RA15675G,C4RA15675G
3,CoFe2O4,peroxidase,7.0,13.8,13.8,13.8,naked,oleic acid,0,282.47,0.22769,0.438200,H2O2 + TMB,0.500,25.00,100.00,0.000026,4.0,37.0,https://doi.org/10.1039/C4RA15675G,C4RA15675G
4,CoFe2O4,peroxidase,7.0,24.5,24.5,24.5,naked,oleic acid,0,282.47,0.01725,1.027200,TMB + H2O2,0.200,100.00,15.00,0.000026,4.0,37.0,https://doi.org/10.1039/C4RA15675G,C4RA15675G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1173,Ti3C2,peroxidase,6.0,,,,ssDNA,DNA,Tetramethylammonium hydroxide,,0.85600,0.000080,OPD + H2O2,0.001,2.00,10.00,0.005000,4.0,40.0,https://doi.org/10.1016/j.microc.2021.106238,J.MICROC.2021.106238
1174,Au,oxidase,7.0,13.0,13.0,13.0,ss-DNA,0,0,0.00,17.67000,0.000180,glucose,5.000,25.00,,,,34.0,https://doi.org/10.1002/anie.201105121,ANIE.201105121
1175,Au,oxidase,7.0,13.0,13.0,13.0,ds-DNA,0,0,0.00,6.98000,0.000530,glucose,5.000,25.00,,,,34.0,https://doi.org/10.1002/anie.201105121,ANIE.201105121
1176,Au,oxidase,7.0,13.0,13.0,13.0,DNA,0,0,,6.97000,0.000630,glucose,0.010,0.25,,,7.2,25.0,https://doi.org/10.1021/nn102592h,NN102592H


In [8]:
nanozymes_df.drop_duplicates()

Unnamed: 0,formula,activity,syngony,length,width,depth,surface,polymer,surfactants,molar_mass,km,v_max,reaction_type,c_min,c_max,c_const,ccat,ph,temperature,link,article_id
0,CoFe2O4,peroxidase,7.0,4.1,4.1,4.1,naked,oleic acid,0,282.47,0.00645,1.376300,TMB + H2O2,1.500,1.50,100.00,0.000026,4.0,37.0,https://doi.org/10.1039/C4RA15675G,C4RA15675G
1,CoFe2O4,peroxidase,7.0,13.8,13.8,13.8,naked,oleic acid,0,282.47,0.05537,0.264300,TMB + H2O2,1.500,1.50,100.00,0.000026,4.0,37.0,https://doi.org/10.1039/C4RA15675G,C4RA15675G
2,CoFe2O4,peroxidase,7.0,4.1,4.1,4.1,naked,oleic acid,0,282.47,0.03551,8.363000,H2O2 + TMB,0.500,25.00,100.00,0.000026,4.0,37.0,https://doi.org/10.1039/C4RA15675G,C4RA15675G
3,CoFe2O4,peroxidase,7.0,13.8,13.8,13.8,naked,oleic acid,0,282.47,0.22769,0.438200,H2O2 + TMB,0.500,25.00,100.00,0.000026,4.0,37.0,https://doi.org/10.1039/C4RA15675G,C4RA15675G
4,CoFe2O4,peroxidase,7.0,24.5,24.5,24.5,naked,oleic acid,0,282.47,0.01725,1.027200,TMB + H2O2,0.200,100.00,15.00,0.000026,4.0,37.0,https://doi.org/10.1039/C4RA15675G,C4RA15675G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1173,Ti3C2,peroxidase,6.0,,,,ssDNA,DNA,Tetramethylammonium hydroxide,,0.85600,0.000080,OPD + H2O2,0.001,2.00,10.00,0.005000,4.0,40.0,https://doi.org/10.1016/j.microc.2021.106238,J.MICROC.2021.106238
1174,Au,oxidase,7.0,13.0,13.0,13.0,ss-DNA,0,0,0.00,17.67000,0.000180,glucose,5.000,25.00,,,,34.0,https://doi.org/10.1002/anie.201105121,ANIE.201105121
1175,Au,oxidase,7.0,13.0,13.0,13.0,ds-DNA,0,0,0.00,6.98000,0.000530,glucose,5.000,25.00,,,,34.0,https://doi.org/10.1002/anie.201105121,ANIE.201105121
1176,Au,oxidase,7.0,13.0,13.0,13.0,DNA,0,0,,6.97000,0.000630,glucose,0.010,0.25,,,7.2,25.0,https://doi.org/10.1021/nn102592h,NN102592H


In [11]:
len(set(nanozymes_df["link"]))

406

In [2]:
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel

load_dotenv(override=True)


class Experiment(BaseModel):
    formula: str
    activity: str
    syngony: float
    length: float
    width: float
    depth: float
    surface: str
    polymer: str
    surfactants: str
    molar_mass: float
    km: float
    v_max: float
    reaction_type: str
    c_min: float
    c_max: float
    c_const: float
    ccat: float
    ph: float
    temperature: float
    experiment_title: str


class Response(BaseModel):
    experiments: list[Experiment]


client = OpenAI()

prompt = """You are an expert in the field of nanozymes. Your task is to analyze the provided text and extract a structured list of experiments. For each experiment, extract the following parameters, converting units where necessary:

- **formula**: The chemical formula of the nanozyme. Do not use subscripts or superscripts; write formulas in plain text as a human would (e.g., H2O2).
- **activity**: The type of activity (e.g., peroxidase, oxidase). Write the activity in lowercase.
- **syngony**: The crystal system or symmetry. Convert this into the following categories:  
  0 - amorphous  
  1 - triclinic  
  2 - monoclinic  
  3 - orthorhombic  
  4 - tetragonal  
  5 - trigonal  
  6 - hexagonal  
  7 - cubic  
  If no clear match can be made, use the most frequently occurring category from other experiments. If no clear matches in other experiments, choose the one that is most similar in meaning.
- **length**: The size of the nanozyme in nanometers (nm).
- **width**: The width of the nanozyme in nanometers (nm).
- **depth**: The depth of the nanozyme in nanometers (nm).
- **surface**: The surface chemistry of the nanozyme. If not specified, use the default value `naked`.
- **polymer**: The polymer used in the synthesis. If not specified, use the default value `0`.
- **surfactants**: The surfactant used in the synthesis. If not specified, use the default value `0`.
- **molar_mass**: The molar mass in grams per mole (g/mol).
- **km**: The Michaelis constant Km in millimoles per liter (mM).
- **v_max**: The maximum reaction rate Vmax in millimoles per second (mM/s).
- **reaction_type**: The type of reaction, where the first component is the substrate and the second is the co-substrate. For example, TMB + H2O2 and H2O2 + TMB should be treated as different reaction types.
- **c_min**: The minimum concentration of the substrate in millimoles per liter (mM).
- **c_max**: The maximum concentration of the substrate in millimoles per liter (mM).
- **c_const**: The concentration of the co-substrate in millimoles per liter (mM).
- **ccat**: The concentration of nanoparticles in the measurement of catalytic activity in milligrams per milliliter (mg/mL).
- **ph**: The pH at which the catalytic activity was measured.
- **temperature**: The temperature at which the research was carried out in degrees Celsius (°C).

If any numerical value is missing, please use `nan` to indicate its absence. Ensure that all parameters are accurately extracted and categorized where applicable, and take note that the order of components in the reaction type (substrate + co-substrate) matters. Also, remember to write chemical formulas plainly without subscripts or superscripts (e.g., H2O2 instead of \(H_2O_2\)).

If the text contains a reaction type written in both forms, such as:
- **TMB + H2O2**: TMB as substrate, H2O2 as co-substrate
- **H2O2 + TMB**: H2O2 as substrate, TMB as co-substrate

You must split this experiment into two distinct experiments, one for each reaction type, reflecting the change in the order of the substrate and co-substrate.

If the surface chemistry is not specified, use `naked` as the default value. For both `polymer` and `surfactants`, if they are not mentioned, assign the value `0`.

If the syngony category cannot be matched directly, use the most frequently occurring category from other experiments or select the category that is the closest match in meaning.

Ensure that all activities are written in lowercase.
"""

In [149]:
assistant_df_list = []
failed_md_list = []
folder_path = "./markdown_answers"
for article_name in tqdm(os.listdir(folder_path)):
    with open(
        os.path.join(folder_path, article_name),
        "r",
        encoding="utf-8",
    ) as file:
        content = file.read()

    completion = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"Text:\n{content}"},
        ],
        response_format=Response,
    )
    message = completion.choices[0].message
    if not message.parsed:
        print(article_name, message.refusal)
        failed_md_list.append(article_name)
        continue
    data = [experiment.dict() for experiment in message.parsed.experiments]
    article_df = pd.DataFrame(data)
    article_df["assistant_answer"] = article_name
    article_df["article_id"] = extract_id_from_filename(article_name)
    assistant_df_list.append(article_df)

assistant_df = pd.concat(assistant_df_list).reset_index(drop=True)

100%|██████████| 403/403 [48:22<00:00,  7.20s/it] 


In [150]:
failed_md_list

[]

In [None]:
assistant_df

Unnamed: 0,formula,activity,syngony,length,width,depth,surface,polymer,surfactants,molar_mass,km,v_max,reaction_type,c_min,c_max,c_const,ccat,ph,temperature,experiment_title,assistant_answer,article_id
0,MnO2,oxidase,0.0,0.0,0.0,0.0,naked,0,0,0.000,0.03000,0.046000,ABTS + H2O2,0.005,1.0,0.0,0.0100,3.8,25.0,MnO2 Nanozyme with ABTS,03067319.2019.1599875.md,03067319.2019.1599875
1,MnO2,oxidase,0.0,0.0,0.0,0.0,naked,0,0,0.000,0.02700,0.113000,TMB + H2O2,0.005,1.0,0.0,0.0100,3.8,25.0,MnO2 Nanozyme with TMB,03067319.2019.1599875.md,03067319.2019.1599875
2,Fe3O4-DOPA,peroxidase,7.0,3.5,3.5,3.5,dopamine-capped,0,0,0.000,0.00275,0.000014,H2O2 + ABTS,0.000,0.0,100.0,0.0200,4.6,25.0,Experiment 1: Fe₃O₄-DOPA Nanozymes,1361-6463.aa5bf6.md,1361-6463.AA5BF6
3,CoFe2O4-DOPA,peroxidase,7.0,2.2,2.2,2.2,dopamine-capped,0,0,0.000,0.00521,0.000239,H2O2 + ABTS,0.000,0.0,100.0,0.0200,4.6,25.0,Experiment 2: CoFe₂O₄-DOPA Nanozymes,1361-6463.aa5bf6.md,1361-6463.AA5BF6
4,CuFe2O4-DOPA,peroxidase,7.0,2.3,2.3,2.3,dopamine-capped,0,0,0.000,0.00118,0.000082,H2O2 + ABTS,0.000,0.0,100.0,0.0200,4.6,25.0,Experiment 3: CuFe₂O₄-DOPA Nanozymes,1361-6463.aa5bf6.md,1361-6463.AA5BF6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1637,AuPt,peroxidase-like,7.0,23.6,23.6,23.6,naked,0,ascorbic acid,0.000,0.00000,0.000000,TMB + H2O2,0.000,0.0,100.0,0.0013,0.0,25.0,Experiment 4,srep40103.md,SREP40103
1638,AuPt,oxidase-like,7.0,23.6,23.6,23.6,naked,0,ascorbic acid,0.000,0.00000,0.000000,TMB,0.000,0.0,0.0,0.0013,0.0,25.0,Experiment 5,srep40103.md,SREP40103
1639,CuO,peroxidase,7.0,50.0,20.0,20.0,naked,0,0,79.545,0.50000,1.200000,H2O2 + TMB,0.100,5.0,1.0,0.0500,7.0,25.0,Experiment 1: CuO Nanozyme,thno.19257.md,THNO.19257
1640,Fe3O4,oxidase,7.0,15.0,15.0,15.0,PEG,PEG,0,231.532,3.50000,3.000000,TMB + NH3,0.010,10.0,0.1,0.1000,8.0,37.0,Experiment 2: Fe3O4 Nanozyme,thno.19257.md,THNO.19257


In [161]:
assistant_df = assistant_df.drop_duplicates(
    subset=[
        "formula",
        "activity",
        "syngony",
        "length",
        "width",
        "depth",
        "surface",
        "polymer",
        "surfactants",
        "molar_mass",
        "km",
        "v_max",
        "reaction_type",
        "c_min",
        "c_max",
        "c_const",
        "ccat",
        "ph",
        "temperature",
        "assistant_answer",
        "article_id",
    ]
).reset_index(drop=True)

In [164]:
assistant_df.to_csv("assistant_df_21_08_2024_v2.csv", index=False)

In [12]:
assistant_df = pd.read_csv("assistant_df_21_08_2024_v2.csv")

In [13]:
assistant_df

Unnamed: 0,formula,activity,syngony,length,width,depth,surface,polymer,surfactants,molar_mass,km,v_max,reaction_type,c_min,c_max,c_const,ccat,ph,temperature,experiment_title,assistant_answer,article_id
0,MnO2,oxidase,0.0,0.0,0.0,0.0,naked,0,0,0.000,0.03000,0.046000,ABTS + H2O2,0.005,1.0,0.0,0.0100,3.8,25.0,MnO2 Nanozyme with ABTS,03067319.2019.1599875.md,03067319.2019.1599875
1,MnO2,oxidase,0.0,0.0,0.0,0.0,naked,0,0,0.000,0.02700,0.113000,TMB + H2O2,0.005,1.0,0.0,0.0100,3.8,25.0,MnO2 Nanozyme with TMB,03067319.2019.1599875.md,03067319.2019.1599875
2,Fe3O4-DOPA,peroxidase,7.0,3.5,3.5,3.5,dopamine-capped,0,0,0.000,0.00275,0.000014,H2O2 + ABTS,0.000,0.0,100.0,0.0200,4.6,25.0,Experiment 1: Fe₃O₄-DOPA Nanozymes,1361-6463.aa5bf6.md,1361-6463.AA5BF6
3,CoFe2O4-DOPA,peroxidase,7.0,2.2,2.2,2.2,dopamine-capped,0,0,0.000,0.00521,0.000239,H2O2 + ABTS,0.000,0.0,100.0,0.0200,4.6,25.0,Experiment 2: CoFe₂O₄-DOPA Nanozymes,1361-6463.aa5bf6.md,1361-6463.AA5BF6
4,CuFe2O4-DOPA,peroxidase,7.0,2.3,2.3,2.3,dopamine-capped,0,0,0.000,0.00118,0.000082,H2O2 + ABTS,0.000,0.0,100.0,0.0200,4.6,25.0,Experiment 3: CuFe₂O₄-DOPA Nanozymes,1361-6463.aa5bf6.md,1361-6463.AA5BF6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1494,AuPt,oxidase-like,7.0,23.6,23.6,23.6,naked,0,ascorbic acid,0.000,0.00000,0.000000,TMB,0.000,0.0,0.0,0.0013,0.0,25.0,Experiment 2,srep40103.md,SREP40103
1495,AuPt,ascorbic acid oxidase-like,7.0,23.6,23.6,23.6,naked,0,ascorbic acid,0.000,0.00000,0.000000,ascorbic acid,0.000,0.0,0.0,0.0013,0.0,25.0,Experiment 3,srep40103.md,SREP40103
1496,CuO,peroxidase,7.0,50.0,20.0,20.0,naked,0,0,79.545,0.50000,1.200000,H2O2 + TMB,0.100,5.0,1.0,0.0500,7.0,25.0,Experiment 1: CuO Nanozyme,thno.19257.md,THNO.19257
1497,Fe3O4,oxidase,7.0,15.0,15.0,15.0,PEG,PEG,0,231.532,3.50000,3.000000,TMB + NH3,0.010,10.0,0.1,0.1000,8.0,37.0,Experiment 2: Fe3O4 Nanozyme,thno.19257.md,THNO.19257


In [17]:
list(set(assistant_df["article_id"]))

['C8CC07800A',
 'C1CY00124H',
 'C8NJ00097B',
 '1361-6528.AADDC2',
 'ACSAMI.0C12593',
 'J.BIOMATERIALS.2010.09.040',
 'C5RA07636F',
 'ANGE.201904751',
 'J.SNB.2016.07.168',
 'SMLL.201903182',
 'J.BIOS.2018.08.004',
 'J.SAA.2019.117412',
 'C4NR04115A',
 'C8TB01948G',
 'J.TALANTA.2020.121680',
 'C6RA00368K',
 'J.APSUSC.2016.12.067',
 'ANIE.200805279',
 'ACS.LANGMUIR.7B03430',
 'J.MSEC.2015.10.046',
 'CHEM.202100567',
 'C6CC00194G',
 'ACSANM.3C01652',
 'C8CC07062H',
 'J.CEJ.2017.08.026',
 'ACS.INORGCHEM.0C03355',
 'D3NJ00136A',
 'J.JTICE.2021.03.029',
 'NANO9020210',
 'IE403554V',
 'J.MOLCATA.2013.05.016',
 'C7AY00750G',
 'CHEM.201001789',
 'ANIE.201909729',
 'J.JALLCOM.2016.04.269',
 'J.ACA.2020.01.035',
 'J.SAA.2017.06.006',
 'J.JCIS.2019.01.061',
 'J.COLSURFB.2017.02.004',
 'J.ACA.2012.11.056',
 'AOC.4465',
 'ACSAMI.0C01789',
 'C5RA11014A',
 'S00604-017-2552-1',
 'J.SNB.2019.04.020',
 'J.TALANTA.2013.01.032',
 'S41467-021-23737-1',
 'ACSANM.8B00945',
 'J.JALLCOM.2019.01.225',
 'SREP4010

In [18]:
def jaccard_index_with_floats(list1, list2, epsilon=0.01):
    a1 = []
    a2 = list1.copy()
    a3 = list2.copy()

    for x in list1:
        matched = False
        for y in a3:
            if np.isclose(x, y, atol=epsilon):
                a1.append(x)
                a2.remove(x)
                a3.remove(y)
                matched = True
                break

    numerator = len(a1)
    denominator = len(a1) + len(a2) + len(a3)

    if denominator == 0:
        return 0.0

    jaccard_coefficient = numerator / denominator

    return jaccard_coefficient, a1, a2, a3

Коэффициент Жаккара: 0.42857142857142855
Сопоставленные элементы (a1): [2.00001, 2.0001, 3.0]
Несопоставленные элементы из list1 (a2): [1.0001, 4.0]
Несопоставленные элементы из list2 (a3): [3.0001, 5.0]


In [32]:
def jaccard_index_with_strings(list1, list2):
    a1 = []
    a2 = list1.copy()
    a3 = list2.copy()

    for x in list1:
        matched = False
        for y in a3:
            if str(x).upper() == str(y).upper():
                a1.append(x)
                a2.remove(x)
                a3.remove(y)
                matched = True
                break

    numerator = len(a1)
    denominator = len(a1) + len(a2) + len(a3)

    if denominator == 0:
        return 0.0

    jaccard_coefficient = numerator / denominator

    return jaccard_coefficient, a1, a2, a3


Коэффициент Жаккара: 0.42857142857142855
Сопоставленные элементы (a1): ['banana', 'banana', 'orange']
Несопоставленные элементы из list1 (a2): ['apple', 'apple']
Несопоставленные элементы из list2 (a3): ['grape', 'banana']


In [35]:
assistant_df["polymer"] = assistant_df["polymer"].astype(str)
assistant_df["surfactants"] = assistant_df["surfactants"].astype(str)

In [37]:
import pandas as pd


def calculate_jaccard_index(df1, df2, epsilon=0.02):
    df1 = df1.copy()
    df2 = df2.copy()

    common_columns = df1.columns.intersection(df2.columns).tolist()
    compare_columns = common_columns

    unmatched_df1 = []
    unmatched_df2 = []

    df1_matched = pd.Series([False] * len(df1))
    df2_matched = pd.Series([False] * len(df2))

    jaccard_indexes = {}

    for article in list(set(df1["article_id"])):
        df1_rows = df1[df1["article_id"] == article]
        df2_rows = df2[df2["article_id"] == article]

        if len(df1_rows) > 0 and len(df2_rows) > 0:
            jaccard_index_article = {}
            for col in compare_columns:
                df1_list = list(set(df1_rows[col]))
                df2_list = list(set(df2_rows[col]))
                if pd.api.types.is_numeric_dtype(df1[col]):
                    jaccard_index_article[col], _, _, _ = jaccard_index_with_floats(
                        df1_list, df2_list, epsilon
                    )
                else:
                    jaccard_index_article[col], _, _, _ = jaccard_index_with_strings(
                        df1_list, df2_list
                    )
            for idx in df2_rows.index:
                df2_matched[idx] = True
            for idx in df1_rows.index:
                df1_matched[idx] = True
            jaccard_indexes[article] = jaccard_index_article

    unmatched_df1 = df1[~df1_matched]
    unmatched_df2 = df2[~df2_matched]

    unmatched_df1 = pd.DataFrame(unmatched_df1)
    unmatched_df2 = pd.DataFrame(unmatched_df2)

    return jaccard_indexes, unmatched_df1, unmatched_df2, common_columns


jaccard_indexes, unmatched_df1, unmatched_df2, common_columns = calculate_jaccard_index(
    assistant_df, nanozymes_df
)

In [38]:
jaccard_indexes_list = list(jaccard_indexes.values())

In [39]:
mean_indexes = {}

for col in common_columns:
    mean_indexes[col] = 0

for indexes in jaccard_indexes_list:
    for col in common_columns:
        mean_indexes[col] += indexes[col]

for col in common_columns:
    mean_indexes[col] = mean_indexes[col] / len(jaccard_indexes_list)

In [40]:
mean_indexes

{'formula': 0.535274060989903,
 'activity': 0.6970899470899472,
 'syngony': 0.5389440035273368,
 'length': 0.44622071050642476,
 'width': 0.45354728311077513,
 'depth': 0.3908929621231209,
 'surface': 0.6679413179413181,
 'polymer': 0.6355379188712522,
 'surfactants': 0.675573192239859,
 'molar_mass': 0.42255605946082136,
 'km': 0.5747270674317294,
 'v_max': 0.5214783072291425,
 'reaction_type': 0.6090828924162255,
 'c_min': 0.44175485008818344,
 'c_max': 0.2047650541698161,
 'c_const': 0.3758818342151676,
 'ccat': 0.5052910052910055,
 'ph': 0.773809523809524,
 'temperature': 0.6014109347442682,
 'article_id': 1.0}

In [68]:
unmatched_df1

Unnamed: 0,formula,activity,syngony,length,width,depth,surface,polymer,surfactants,molar_mass,km,v_max,reaction_type,c_min,c_max,c_const,ccat,ph,temperature,experiment_title,assistant_answer,article_id
2,Fe3O4-DOPA,peroxidase,7.0,3.5,3.5,3.5,dopamine-capped,0,0,0.0,0.00275,0.000014,H2O2 + ABTS,0.00,0.0,100.00,0.02000,4.6,25.0,Experiment 1: Fe₃O₄-DOPA Nanozymes,1361-6463.aa5bf6.md,1361-6463.AA5BF6
3,CoFe2O4-DOPA,peroxidase,7.0,2.2,2.2,2.2,dopamine-capped,0,0,0.0,0.00521,0.000239,H2O2 + ABTS,0.00,0.0,100.00,0.02000,4.6,25.0,Experiment 2: CoFe₂O₄-DOPA Nanozymes,1361-6463.aa5bf6.md,1361-6463.AA5BF6
4,CuFe2O4-DOPA,peroxidase,7.0,2.3,2.3,2.3,dopamine-capped,0,0,0.0,0.00118,0.000082,H2O2 + ABTS,0.00,0.0,100.00,0.02000,4.6,25.0,Experiment 3: CuFe₂O₄-DOPA Nanozymes,1361-6463.aa5bf6.md,1361-6463.AA5BF6
5,MnFe2O4-DOPA,peroxidase,7.0,2.7,2.7,2.7,dopamine-capped,0,0,0.0,0.00034,0.000119,H2O2 + ABTS,0.00,0.0,100.00,0.02000,4.6,25.0,Experiment 4: MnFe₂O₄-DOPA Nanozymes,1361-6463.aa5bf6.md,1361-6463.AA5BF6
6,NiFe2O4-DOPA,peroxidase,7.0,2.6,2.6,2.6,dopamine-capped,0,0,0.0,0.00034,0.000040,H2O2 + ABTS,0.00,0.0,100.00,0.02000,4.6,25.0,Experiment 5: NiFe₂O₄-DOPA Nanozymes,1361-6463.aa5bf6.md,1361-6463.AA5BF6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1415,Pt,laccase,0.0,-1.0,-1.0,-1.0,DNA-stabilized,Oligonucleotide (T10),0,-1.0,-1.00000,-1.000000,"2,4-DCP + 4-AAP",0.05,2.0,0.04,-1.00000,7.0,25.0,T10-Templated Pt Nanozyme,s10562-017-2106-5.md,S10562-017-2106-5
1428,Ru,peroxidase,7.0,6.2,6.2,1.8,naked,poly(vinylpyrrolidone),L-ascorbic acid,0.0,0.06030,0.000134,TMB + H2O2,0.00,0.0,2000.00,0.00068,4.0,22.0,Ruthenium (Ru) Nanoframes: TMB + H2O2,s11434-016-1193-9.md,S11434-016-1193-9
1429,Ru,peroxidase,7.0,6.2,6.2,1.8,naked,poly(vinylpyrrolidone),L-ascorbic acid,0.0,318.00000,0.000074,H2O2 + TMB,0.00,0.0,0.80,0.00068,4.0,22.0,Ruthenium (Ru) Nanoframes: H2O2 + TMB,s11434-016-1193-9.md,S11434-016-1193-9
1448,V2O5,peroxidase,3.0,500.0,0.0,0.0,naked,0,0,0.0,0.73800,0.018500,TMB + H2O2,0.00,0.0,0.00,1.00000,4.0,25.0,Experiment 1: V2O5 Nanozymes with TMB as Subst...,s16040584.md,S16040584


In [71]:
list(set(unmatched_df1["article_id"]))

['D0CC04101G(1)',
 'J.ACA.2015.04.052',
 'S00604-017-2552-1',
 'S10562-017-2106-5',
 'C2AN35700C(1)',
 'J.ACA.2012.11.056',
 'J.BIOS.2016.10.082',
 'S11434-016-1193-9',
 'J.BIOMATERIALS.2010.09.040',
 'J.JCIS.2018.12.093',
 'ANIE.201105121',
 'CHEMOSENSORS1000359',
 'C1CC11943E',
 '1361-6463.AA5BF6',
 'J.SNB.2014.12.052',
 'J.SNB.2021.130266',
 '1361-6528.AADDC2',
 'S16040584',
 'S0039914021005683',
 'J.SNB.2017.07.108',
 'J.ACA.2016.10.013']

In [70]:
list(set(unmatched_df2["article_id"]))

['J.JCIS.2017.11.064 ',
 'C7AY02459B',
 'J.BIOS.2016.10.082 ',
 'C1CC11943E ',
 'S16040584 ',
 'ANIE.201105121 ',
 'S11434-016-1193-9  ',
 'J.ACA.2012.11.056  ',
 'C8TB01132J',
 'C7TB02434G',
 'J.SNB.2021.130266 ',
 'S00604-017-2552-1 ',
 'J.JCIS.2018.12.093 ',
 'AADDC2',
 'J.JIEC.2021.09.034',
 'S10562-017-2106-5    ',
 'J.ACA.2015.04.052 ',
 'J.SNB.2014.12.052\xa0    ',
 'CHEMOSENSORS10090359',
 'D0CC04101G ',
 'C2AN35700C ',
 'J.BIOMATERIALS.2010.09.040 ',
 'J.SNB.2017.07.108  ',
 'J.ACA.2016.10.013 ',
 'C1JM14253D',
 'AA5BF6']