# Post-Processing of LLM raw outputs to enable performance evaluation

In [None]:
import pandas as pd
import ast

In [4]:
# Read the stored raw LLM responses; each column is iterated as one model further down.
raw_output = pd.read_parquet('data/raw/outputs.parquet')
raw_output

Unnamed: 0,google/gemini-2.5-pro-preview-03-25,openai/gpt-4.1
0,"{'input': {'doi': '10.1002/cbdv.200800342', 't...","{'input': {'doi': '10.1002/cbdv.200800342', 't..."
1,"{'input': {'doi': '10.1093/mutage/gep034', 'te...","{'input': {'doi': '10.1093/mutage/gep034', 'te..."
2,{'input': {'doi': '10.1016/j.phytochem.2008.01...,{'input': {'doi': '10.1016/j.phytochem.2008.01...
3,"{'input': {'doi': '10.1021/np9004079', 'text':...","{'input': {'doi': '10.1021/np9004079', 'text':..."
4,"{'input': {'doi': '10.1002/ps.1278', 'text': ""...","{'input': {'doi': '10.1002/ps.1278', 'text': ""..."
...,...,...
5,"{'input': {'doi': '10.1055/s-0029-1185364', 't...","{'input': {'doi': '10.1055/s-0029-1185364', 't..."
6,"{'input': {'doi': '10.1021/np1007476', 'text':...","{'input': {'doi': '10.1021/np1007476', 'text':..."
7,"{'input': {'doi': '10.1021/np800612x', 'text':...","{'input': {'doi': '10.1021/np800612x', 'text':..."
8,{'input': {'doi': '10.1590/S0103-5053200500080...,{'input': {'doi': '10.1590/S0103-5053200500080...


In [8]:
# Text fragments that are stripped from a model's answer before it is parsed
# as a Python literal. They are removed in exactly this order, so '```json'
# must stay ahead of the bare '```' fence.
NOISE_FRAGMENTS = (
    '```json',
    '```',
    'EOF',
    '// Cabraleadiol, Ocotillone',
    '// Carapa guianensis (Gedunin, 7-Deacetoxy-7-oxogedunin, 6-Acetoxygedunin)',
    '// Cedrela odorata, Toona ciliata, Hortia oreadica (exact site not specified, defaulting per known list)',
    'Explanation:',
    '- The compounds isolated and discussed in this paper do not match any of the compound names from the provided list.',
    '- No information on biological activity was reported in the provided text.',
    '- Collection species is "Neoraputia alba (Rutaceae)", matching the available options.',
    '- Collection type is "Plant Isolated".',
    '- Collection site is Espirito Santo, which matches as "N/A/ES".',
    "Here's the structured information extracted from the paper:",
)


def strip_noise(text):
    """Return ``text`` with every entry of ``NOISE_FRAGMENTS`` removed, in order."""
    for fragment in NOISE_FRAGMENTS:
        text = text.replace(fragment, '')
    return text


cleaned_outputs = []
models = raw_output.columns.tolist()
for model in models:
    responses = raw_output[model]
    print(f'Cleaning outputs of model {model}...')
    for position, response in enumerate(responses):
        # Cells holding the '-' placeholder are skipped
        # (presumably: no response was recorded for that paper).
        if response == '-':
            continue

        try:
            # The stored cell is itself a dict literal; its 'output' entry
            # carries the model's answer, which is cleaned and parsed again.
            raw_text = str(ast.literal_eval(response)['output'])
            output_di = ast.literal_eval(strip_noise(raw_text))
        except Exception:
            # Report where parsing broke, then re-raise the original error unchanged.
            print(f'Could not parse response at position {position} for model {model}')
            raise

        if isinstance(output_di, list):
            if not output_di:
                # An empty list has no record to keep; indexing [0] would raise IndexError.
                print(f'Empty output at position {position} for model {model}; skipped')
                continue
            # Only the first element is kept.
            # NOTE(review): confirm that any further elements can be discarded.
            output_di = output_di[0]
        output_di['model'] = model
        cleaned_outputs.append(output_di)

results = pd.DataFrame(cleaned_outputs)
results

Cleaning outputs of model google/gemini-2.5-pro-preview-03-25...
Cleaning outputs of model openai/gpt-4.1...


Unnamed: 0,doi,name,biological_activity,collection_specie,collection_type,collection_site,model
0,10.1002@cbdv.200800342,"[Casearin X; (1R*,3S*,5S*,6aR*,7S*,8S*,10R*,10...",[Cytotoxic],[Casearia sylvestris (Flacourtiaceae)],[Plant Isolated],[Sao Miguel Arcanjo/SP],google/gemini-2.5-pro-preview-03-25
1,10.1093/mutage/gep034,[Caseargrewiin F],"[Mutagenic, Genotoxic]",[Casearia sylvestris (Flacourtiaceae)],[Plant Isolated],[Araraquara/SP],google/gemini-2.5-pro-preview-03-25
2,10.1016@j.phytochem.2008.01.006,[Pterogynoside; Kaempferol-3-O-(3''-O-4'''-met...,"[Inhibition of Myeloperoxodase, Antioxidant]",[Pterogyne nitens (Fabaceae)],Plant Isolated,Sao Paulo/SP,google/gemini-2.5-pro-preview-03-25
3,10.1021/np9004079,"[Caseobliquin A, Rel-6β-hydroxyzuelanin-2β-ben...",[Cytotoxic],[Casearia obliqua (Flacourtiaceae)],[Plant Isolated],[Campinas/SP],google/gemini-2.5-pro-preview-03-25
4,10.1002/ps.1278,"[2,4,6-trimethoxybenzoic acid, 20-hydroxyecdys...",[Insecticidal],"[Vitex polygama (Verbenaceae), Siphoneugena de...",[Plant Isolated],[Pocos De Caldas/MG],google/gemini-2.5-pro-preview-03-25
...,...,...,...,...,...,...,...
281,10.1055/s-0029-1185364,"[3,4-dihydro-5-hydroxy-2,7-dimethyl-8-(2""-meth...","[Antitrypanosomal, Cytotoxic]",[Peperomia obtusifolia (Piperaceae)],[Plant Isolated],[Pocos De Caldas/MG],openai/gpt-4.1
282,10.1021/np1007476,[],[],[Chimarrhis turbinata (Rubiaceae)],[Plant Isolated],[Belem/PA],openai/gpt-4.1
283,10.1021/np800612x,"[Nitensidine D, Nitensidine E]","[Cytotoxic, Anticancer]",[Pterogyne nitens (Fabaceae)],[Plant Isolated],[Sao Paulo/SP],openai/gpt-4.1
284,10.1590/S0103-50532005000800008,"[Quercetin-3-O-rutinoside, Kaempferol-3-O-ruti...",[Antioxidant],[Chimarrhis turbinata (Rubiaceae)],[Plant Isolated],[Belem/PA],openai/gpt-4.1


# We explode the dataset to prepare for hits@k evaluation

In [11]:
# Explode every list-valued column in turn. Because the explodes are chained,
# each input row expands to all combinations of its list entries.
# Missing values are first replaced by the literal string '[]'; explode leaves
# such scalars untouched, and rows whose doi is '[]' are dropped below.
expl_results = (
    results
    [['doi', 'name', 'biological_activity', 'collection_specie', 'collection_type', 'collection_site', 'model']]
    .fillna('[]')
    .explode('name')
    .explode('biological_activity')
    .explode('collection_specie')
    .explode('collection_type')
    .explode('collection_site')
)
# Normalise DOIs: '@' -> '/', lower-case, and drop a literal '.pdf'.
# regex=False is passed explicitly because the default differs between pandas
# versions; as a regex, the '.' in '.pdf' would match any character.
expl_results['doi'] = (
    expl_results.doi.astype(str)
    .str.replace('@', '/', regex=False)
    .str.lower()
    .str.replace('.pdf', '', regex=False)
)
expl_results = expl_results[expl_results.doi != '[]']
expl_results

Unnamed: 0,doi,name,biological_activity,collection_specie,collection_type,collection_site,model
0,10.1002/cbdv.200800342,"Casearin X; (1R*,3S*,5S*,6aR*,7S*,8S*,10R*,10a...",Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,google/gemini-2.5-pro-preview-03-25
0,10.1002/cbdv.200800342,"Casearin D; (1R*,3S*,5S*,6aR*,7R*,8R*,9S*,10S*...",Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,google/gemini-2.5-pro-preview-03-25
0,10.1002/cbdv.200800342,"Casearin L; (1R*,3S*,5S*,6aR*,7R*,8R*,9S*,10S*...",Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,google/gemini-2.5-pro-preview-03-25
0,10.1002/cbdv.200800342,"Casearin O; (1R*,3S*,5S*,6aR*,7R*,8R*,9S*,10S*...",Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,google/gemini-2.5-pro-preview-03-25
0,10.1002/cbdv.200800342,"Caseargrewiin F; (1R*,3S*,5S*,6aR*,7S*,8R*,10R...",Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,google/gemini-2.5-pro-preview-03-25
...,...,...,...,...,...,...,...
285,10.1016/j.cbi.2010.08.008,Casearin L,Anticancer,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,openai/gpt-4.1
285,10.1016/j.cbi.2010.08.008,Casearin O,Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,openai/gpt-4.1
285,10.1016/j.cbi.2010.08.008,Casearin O,Anticancer,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,openai/gpt-4.1
285,10.1016/j.cbi.2010.08.008,(-)-Hardwickiic acid,Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,openai/gpt-4.1


# We define an ensemble to be the concatenation of both models' outputs

In [12]:
# The ensemble reuses every exploded row, relabelled as model 'ensemble' and
# given a fresh 0..n-1 index, then is stacked below the per-model rows.
ensemble = expl_results.assign(model='ensemble').reset_index(drop=True)
final_results = pd.concat([expl_results, ensemble])
final_results

Unnamed: 0,doi,name,biological_activity,collection_specie,collection_type,collection_site,model
0,10.1002/cbdv.200800342,"Casearin X; (1R*,3S*,5S*,6aR*,7S*,8S*,10R*,10a...",Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,google/gemini-2.5-pro-preview-03-25
0,10.1002/cbdv.200800342,"Casearin D; (1R*,3S*,5S*,6aR*,7R*,8R*,9S*,10S*...",Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,google/gemini-2.5-pro-preview-03-25
0,10.1002/cbdv.200800342,"Casearin L; (1R*,3S*,5S*,6aR*,7R*,8R*,9S*,10S*...",Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,google/gemini-2.5-pro-preview-03-25
0,10.1002/cbdv.200800342,"Casearin O; (1R*,3S*,5S*,6aR*,7R*,8R*,9S*,10S*...",Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,google/gemini-2.5-pro-preview-03-25
0,10.1002/cbdv.200800342,"Caseargrewiin F; (1R*,3S*,5S*,6aR*,7S*,8R*,10R...",Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,google/gemini-2.5-pro-preview-03-25
...,...,...,...,...,...,...,...
3908,10.1016/j.cbi.2010.08.008,Casearin L,Anticancer,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,ensemble
3909,10.1016/j.cbi.2010.08.008,Casearin O,Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,ensemble
3910,10.1016/j.cbi.2010.08.008,Casearin O,Anticancer,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,ensemble
3911,10.1016/j.cbi.2010.08.008,(-)-Hardwickiic acid,Cytotoxic,Casearia sylvestris (Flacourtiaceae),Plant Isolated,Sao Miguel Arcanjo/SP,ensemble


In [14]:
# Persist the per-model and ensemble rows.
final_results.to_parquet('data/processed/final_results.parquet')