To generate a sample of the "cleaned" dataset (no metadata, duplicate observations, outliers, etc.), I write it to a tab-separated .txt file.

In [14]:
import pandas as pd
import numpy as np

from llama_cpp import Llama

In [2]:
# Load the trait IDs of interest: the support file holds one ID per line.
with open('./support_files/trait_id_list.txt', 'r') as trait_file:
    raw_trait_ids = trait_file.read().splitlines()

print(f"There are {len(raw_trait_ids)} traits in total\n")

# The lines come back as strings, so convert each one to int.
trait_ids = [int(raw_id) for raw_id in raw_trait_ids]

print(trait_ids)

There are 16 traits in total

[3117, 13, 55, 47, 163, 50, 14, 403, 42, 3106, 33, 1111, 2809, 2807, 2808, 159]


In [3]:
# Read the raw export. encoding='latin' must be given explicitly, otherwise read_csv raises an error.
top3_species_top_traits = pd.read_csv(
    './support_files/top3_species_top_traits.txt',
    sep='\t',
    encoding='latin',
)

print("Total:", len(top3_species_top_traits))

# --- Outliers -------------------------------------------------------------
# See the documentation delivered with the data request for the details.
# A measurement is kept only when its ErrorRisk is below 4 (i.e. the value lies
# within 4 standard deviations); anything beyond is likely an outlier or wrong.
# Rows with no ErrorRisk are kept as well, because they carry metadata.
# This drops 500 measurements.
error_risk = top3_species_top_traits['ErrorRisk']
within_range = error_risk < 4.0
has_no_error_risk = error_risk.isna()
top3_species_top_traits = top3_species_top_traits[within_range | has_no_error_risk]

print("After removing outliers:", len(top3_species_top_traits))

# --- Duplicates -----------------------------------------------------------
# Again, see the documentation: a row with OrigObsDataID filled in is a duplicate,
# so only rows where it is empty are kept. This removes 4032 observations.
is_not_duplicate = top3_species_top_traits['OrigObsDataID'].isna()
top3_species_top_traits = top3_species_top_traits[is_not_duplicate]

# Duplicates are flagged on trait measurements only. Dropping them can therefore
# leave behind rows that describe the observation itself (e.g. location).
# Such rows have no TraitName; their information sits in the "DataName" column.
# To explore them, look up the "ObservationID" in the original dataframe.

print("After removing duplicates:", len(top3_species_top_traits))

# --- Trait measurements only ----------------------------------------------
# Following from the note above: keep just the rows whose TraitID is one of the
# selected traits, which discards the metadata rows.
is_selected_trait = top3_species_top_traits['TraitID'].isin(trait_ids)
trait_measurements = top3_species_top_traits[is_selected_trait]

trait_measurements.head(5)

  top3_species_top_traits = pd.read_csv('./support_files/top3_species_top_traits.txt', sep='\t', encoding='latin')


Total: 116532
After removing outliers: 116032
After removing duplicates: 112000


Unnamed: 0,LastName,FirstName,DatasetID,Dataset,SpeciesName,AccSpeciesID,AccSpeciesName,ObservationID,ObsDataID,TraitID,...,Replicates,StdValue,UnitName,RelUncertaintyPercent,OrigObsDataID,ErrorRisk,Reference,Comment,StdValueStr,Unnamed: 28
9,Craine,Joseph,10,Roots Of the World (ROW) Database,Dactylis glomerata,16700,Dactylis glomerata,19150,482854,13.0,...,,458.5,mg/g,,,0.776812,"Craine, J. M., W. G. Lee, W. J. Bond, R. J. Wi...",C concentration,,
15,Craine,Joseph,10,Roots Of the World (ROW) Database,Dactylis glomerata,16700,Dactylis glomerata,19194,483824,13.0,...,,416.2,mg/g,,,1.26795,"Craine, J. M., W. G. Lee, W. J. Bond, R. J. Wi...",C concentration,,
20,Craine,Joseph,10,Roots Of the World (ROW) Database,Dactylis glomerata,16700,Dactylis glomerata,19216,484308,14.0,...,,27.8,mg/g,,,1.17714,"Craine, J. M., W. G. Lee, W. J. Bond, R. J. Wi...",unadjusted N concentration,,
28,Craine,Joseph,130,Global 15N Database,Dactylis glomerata,16700,Dactylis glomerata,19514,491120,42.0,...,,,,,,,"Craine, J. M., A. J. Elmore, M. P. M. Aidar, M...",Growth form,,
29,Craine,Joseph,130,Global 15N Database,Dactylis glomerata,16700,Dactylis glomerata,19514,491121,14.0,...,,15.91335,mg/g,,,2.15244,"Craine, J. M., A. J. Elmore, M. P. M. Aidar, M...",% dry mass,,


In [4]:
# Work on an independent copy restricted to the columns needed for unification.
unify_frame = trait_measurements[[
    'AccSpeciesName', 'ObservationID', 'TraitID', 'TraitName',
    'DataName', 'OrigValueStr', 'StdValue', 'UnitName',
]].copy()

# 'Value' starts as StdValue cast to object, so it can hold numbers and text side by side.
unify_frame['Value'] = unify_frame['StdValue'].astype(object)

# Wherever there is no standardised value, fall back to the original value string.
std_missing = unify_frame['Value'].isna()
unify_frame.loc[std_missing, 'Value'] = unify_frame.loc[std_missing, 'OrigValueStr']

# A value that to_numeric cannot parse (coerced to NaN) is non-numerical,
# so its UnitName is blanked out.
value_not_numeric = pd.to_numeric(unify_frame['Value'], errors='coerce').isna()
unify_frame.loc[value_not_numeric, 'UnitName'] = np.nan

# Final layout: the combined column takes over the original 'StdValue' name,
# and TraitID is converted to int.
trait_measurements_unified = (
    unify_frame[[
        'AccSpeciesName', 'ObservationID', 'TraitID',
        'TraitName', 'DataName', 'Value', 'UnitName',
    ]]
    .rename(columns={'Value': 'StdValue'})
    .assign(TraitID=lambda frame: frame['TraitID'].map(int))
)

trait_measurements_unified

Unnamed: 0,AccSpeciesName,ObservationID,TraitID,TraitName,DataName,StdValue,UnitName
9,Dactylis glomerata,19150,13,Leaf carbon (C) content per leaf dry mass,Leaf carbon content per dry mass,458.5,mg/g
15,Dactylis glomerata,19194,13,Leaf carbon (C) content per leaf dry mass,Leaf carbon content per dry mass,416.2,mg/g
20,Dactylis glomerata,19216,14,Leaf nitrogen (N) content per leaf dry mass,Leaf nitrogen content per dry mass (Nmass),27.8,mg/g
28,Dactylis glomerata,19514,42,Plant growth form,Plant growth form,Herbaceous Monocot,
29,Dactylis glomerata,19514,14,Leaf nitrogen (N) content per leaf dry mass,Leaf nitrogen content per dry mass (Nmass),15.91335,mg/g
...,...,...,...,...,...,...,...
116527,Dactylis glomerata,6487798,50,Leaf nitrogen (N) content per leaf area,Leaf nitrogen content per area (Narea),1.152584,g m-2
116528,Dactylis glomerata,6487799,50,Leaf nitrogen (N) content per leaf area,Leaf nitrogen content per area (Narea),0.718069,g m-2
116529,Dactylis glomerata,6487800,50,Leaf nitrogen (N) content per leaf area,Leaf nitrogen content per area (Narea),0.836561,g m-2
116530,Dactylis glomerata,6487801,50,Leaf nitrogen (N) content per leaf area,Leaf nitrogen content per area (Narea),0.971015,g m-2


In [5]:
# Write the unified measurements to a tab-separated file, leaving out the index column.
trait_measurements_unified.to_csv(
    './support_files/cleaned_top3_species_top_traits.txt',
    sep='\t',
    index=False,
)

# Next step (done by hand): pick a subset of significant entries from that file...

In [26]:
# Load the hand-picked sample.
# The path uses forward slashes like every other cell: the previous back-slash form
# ('.\\support_files\\...') only resolves on Windows, whereas this one works on
# Windows, Linux and macOS alike.
sample_measurements = pd.read_csv('./support_files/SAMPLE_top3_species_top_traits.txt', sep='\t', encoding='latin')

# DataFrame.duplicated builds a boolean mask of repeated rows. Only ObservationID and
# TraitID are compared, and keep=False marks every member of a repeated pair as True
# (not just the second and later occurrences).
duplicates = sample_measurements[sample_measurements.duplicated(subset=['ObservationID', 'TraitID'], keep=False)]

# One group per (ObservationID, TraitID) pair that occurs more than once.
grouped = duplicates.groupby(["ObservationID", "TraitID"])

print(len(grouped))
print(grouped.head())

# Show a sample group
for (obs_id, trait), group in grouped:
    print(f"ObservationID: {obs_id}, Trait: {trait}")
    print(group[["DataName", "StdValue"]])
    break  # just show one example for now

3
        AccSpeciesName  ObservationID  TraitID  \
5   Dactylis glomerata        1043810       42   
6   Dactylis glomerata        1043810       42   
7   Dactylis glomerata        1043810       42   
8   Dactylis glomerata        1043819       42   
9   Dactylis glomerata        1043819       42   
11  Trifolium pratense         941113      403   
12  Trifolium pratense         941113      403   

                                            TraitName  \
5                                   Plant growth form   
6                                   Plant growth form   
7                                   Plant growth form   
8                                   Plant growth form   
9                                   Plant growth form   
11  Plant biomass and allometry: Shoot dry mass (p...   
12  Plant biomass and allometry: Shoot dry mass (p...   

                                             DataName StdValue UnitName  
5                                   Plant growth form    grass    

In [None]:
# Location of the GGUF model file handed to llama_cpp.
MODEL_PATH = "./models/Mistral-7B-Instruct.Q4_K_M.gguf"

llm = Llama(model_path=MODEL_PATH)

# Smoke test: ask one trivial question and print only the generated text
# of the first returned choice.
response = llm("Q: What is the capital of France?\nA:", max_tokens=32)
first_choice = response["choices"][0]

print(first_choice["text"])


llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (2048) -- the full capacity of the model will not be utilized


 France's capital is Paris.


In [39]:
def build_prompt(trait_name, data_names):
    """Compose the prompt used to pick one data name for a trait.

    Used when a single observation carries several measurements of the same
    trait: the candidate data names (context or data descriptions) are listed,
    numbered from 1, and the model is asked which number best matches
    ``trait_name``.

    Args:
        trait_name: Name of the trait the candidates should be compared against.
        data_names: Iterable of candidate data names, in the order they are numbered.

    Returns:
        The prompt text as a single string.
    """
    option_lines = []
    for position, candidate in enumerate(data_names, start=1):
        option_lines.append(f"{position}. {candidate}")
    options_block = "\n".join(option_lines)

    header = f"Given the following versions of the same trait:\n{options_block}\n\n"
    question = f"Which one best represents the trait name: \"{trait_name}\"?\n"
    instruction = "Respond only with the number."
    return header + question + instruction

In [42]:
def _parse_choice(answer, n_options):
    """Turn the model's reply into a 0-based position in the option list.

    The prompt asks for the 1-based number of the best data name. A trailing
    '.' or ')' after the number (e.g. "2." or "2)") is tolerated.

    Args:
        answer: Text generated by the model.
        n_options: How many options were listed in the prompt.

    Returns:
        The 0-based index of the chosen option, or None when the reply is not a
        number or falls outside 1..n_options.
    """
    try:
        choice = int(answer.strip().rstrip('.)'))
    except ValueError:
        return None
    # Explicit range check: without it a reply of 0 or a negative number would
    # wrap around through negative indexing and silently pick the wrong option.
    if 1 <= choice <= n_options:
        return choice - 1
    return None


selected_rows = []

# `grouped` yields (key, group) pairs: `group` is a dataframe with all the rows
# that share the same grouping key.
for group_key, group in grouped:
    data_names = group['DataName'].to_list()
    trait_name = group['TraitName'].iloc[0]  # they are all the same, the first is fine

    if len(data_names) == 1:
        # a single candidate needs no arbitration
        index = 0
    else:
        prompt = build_prompt(trait_name, data_names)
        print("\nPROMPT:", prompt)
        output = llm(prompt)
        print(output)
        answer = output['choices'][0]['text'].strip()
        print("ANSWER: ", answer)

        # None when the answer is invalid or cannot be parsed (e.g. it is free text)
        index = _parse_choice(answer, len(data_names))

    # Keep the full row with all original columns. The row is taken by position,
    # so the chosen option is the one kept even when two options share a DataName.
    if index is not None:
        selected_rows.append(group.iloc[index])


final_df = pd.DataFrame(selected_rows)


PROMPT: Given the following versions of the same trait:
1. Whole plant aboveground vegetative dry mass per individual plant
2. Shoot dry mass per plant

Which one best represents the trait name: "Plant biomass and allometry: Shoot dry mass (plant aboveground dry mass) per plant"?
Respond only with the number.
{'id': 'cmpl-890056d4-6b3f-465f-a365-dfba414007df', 'object': 'text_completion', 'created': 1746782647, 'model': './models/TinyLlama-1.1B-Chat-v1.0.Q4_K_M.gguf', 'choices': [{'text': '', 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 77, 'completion_tokens': 0, 'total_tokens': 77}}
ANSWER:  

PROMPT: Given the following versions of the same trait:
1. Plant growth form
2. growth Form
3. Plant growth form 2

Which one best represents the trait name: "Plant growth form"?
Respond only with the number.
{'id': 'cmpl-e9b52faf-a5cd-4a64-a920-e605f990dbd6', 'object': 'text_completion', 'created': 1746782648, 'model': './models/TinyLlama-1.1B-Chat-v1.0.

In [33]:
# Display the dataframe assembled from the selected rows.
final_df