# Evaluation of LLM-generated metadata
Models in question:
- sonar-pro 
- llama-3.1-sonar-small-128k-online
- mistral-small

In [19]:
import pandas as pd

# Load data
# header=None: no row of the file is treated as column names, so the columns
# get integer labels until they are named explicitly.
df = pd.read_csv("llm_output.csv", header=None)

In [20]:
# Add header
df.columns = ["lodAIC", "model", "prompt", "response", "dateTime", "stop reason"]

In [21]:
df[14:19]

Unnamed: 0,lodAIC,model,prompt,response,dateTime,stop reason
14,pid_graph:APA44A99E4,mistral-small,**Instruction**: You are ...,"Bitbucket: Community Governance, 0.9 Explanat...",2025-03-03 14:34:37,stop
15,pid_graph:AP47CC0B91,mistral-small,**Instruction**: You are a...,"Book ID: Metadata Schema per Entity, 0.8 Expl...",2025-03-03 14:34:39,stop
16,pid_graph:APD064DE2D,mistral-small,**Instruction**: You are a...,"""no response """,2025-03-03 14:34:41,stop
17,pid_graph:AP66BC0F2F,mistral-small,**Instruction**: You are ...,"Book ID: Unknown, 0.6 I was unable to find sp...",2025-03-03 14:34:43,stop
18,pid_graph:AP685862BE,mistral-small,**Instruction**: You are a...,"BusinessID: Unknown, 0.8 Explanation: After ...",2025-03-03 14:34:45,stop


In [22]:
# Shorten the long llama model name. DataFrame.replace with a plain string matches
# whole cell values in every column, not substrings.
df = df.replace("llama-3.1-sonar-small-128k-online", "llama-online")

In [23]:
df['prompt'] = df['prompt'].str.strip()  # removes leading/trailing whitespace



### Add property variable 

In [24]:
props = ["metadata", "structure", "governance"]

In [25]:
# Assign the property values to the new column
# Labels are assigned purely by row position, cycling through `props`
# (row 0 -> props[0], row 1 -> props[1], row 2 -> props[2], row 3 -> props[0], ...).
# NOTE(review): this presumes the CSV rows repeat in exactly the order of `props`
# for every PID and model, with no missing or extra rows — TODO confirm against
# the script that produced llm_output.csv.
df['property'] = [props[i % len(props)] for i in range(len(df))]
 

In [26]:
df

Unnamed: 0,lodAIC,model,prompt,response,dateTime,stop reason,property
0,pid_graph:AP0AEB083B,mistral-small,**Instruction**: You are an informat...,"AID: No Metadata Schema Prescribed, confidence...",2025-03-03 14:34:02,stop,metadata
1,pid_graph:AP7D4E5ACB,mistral-small,**Instruction**: You are an informat...,"AID: Unknown, 0.5 Explanation: The Persisten...",2025-03-03 14:34:04,stop,structure
2,pid_graph:APE39D4FF0,mistral-small,**Instruction**: You are an informat...,"AID: Unknown, 0.9 Explanation: The Persisten...",2025-03-03 14:34:06,stop,governance
3,pid_graph:AP9364E21B,mistral-small,**Instruction**: You are an informat...,"ARK: Metadata Schema per Entity, <confidence: ...",2025-03-03 14:34:11,stop,metadata
4,pid_graph:AP9DEF623A,mistral-small,**Instruction**: You are an informat...,"ARK: Allows Managed Prefix, High Confidence E...",2025-03-03 14:34:13,stop,structure
...,...,...,...,...,...,...,...
1183,pid_graph:AP8AC7F009,llama-online,**Instruction**: You are an informat...,"**Z39.88-2004: No Prefix, 100%** The identifi...",2025-03-04 13:39:08,stop,structure
1184,pid_graph:APCF11C735,llama-online,**Instruction**: You are an informat...,"Z39.88-2004: **Community Governance**, **High*...",2025-03-04 13:39:11,stop,governance
1185,pid_graph:APDB82E23B,llama-online,**Instruction**: You are an informat...,**zbMath: 2. Common Metadata Schema for Identi...,2025-03-04 13:39:14,stop,metadata
1186,pid_graph:AP4A7794ED,llama-online,**Instruction**: You are an informat...,Given the Persistent Identifier (PID) structur...,2025-03-04 13:39:15,stop,structure


In [27]:
# Remove nan values: drops every row that has a missing value in any column.
# The index is not reset, so the remaining rows keep their original labels.
df = df.dropna()

In [28]:
df[14:19]

Unnamed: 0,lodAIC,model,prompt,response,dateTime,stop reason,property
14,pid_graph:APA44A99E4,mistral-small,**Instruction**: You are an informat...,"Bitbucket: Community Governance, 0.9 Explanat...",2025-03-03 14:34:37,stop,governance
15,pid_graph:AP47CC0B91,mistral-small,**Instruction**: You are an informat...,"Book ID: Metadata Schema per Entity, 0.8 Expl...",2025-03-03 14:34:39,stop,metadata
16,pid_graph:APD064DE2D,mistral-small,**Instruction**: You are an informat...,"""no response """,2025-03-03 14:34:41,stop,structure
17,pid_graph:AP66BC0F2F,mistral-small,**Instruction**: You are an informat...,"Book ID: Unknown, 0.6 I was unable to find sp...",2025-03-03 14:34:43,stop,governance
18,pid_graph:AP685862BE,mistral-small,**Instruction**: You are an informat...,"BusinessID: Unknown, 0.8 Explanation: After ...",2025-03-03 14:34:45,stop,metadata


## Extract PIDs from prompt text 

In [29]:
prompts = df.prompt.tolist()

In [30]:

# Sanity check: report every prompt that is not a plain string
for prompt in prompts:
    if type(prompt) is str:
        continue
    print(prompt, type(prompt), "not a string", "", sep="\n")

In [31]:
def get_pid(prompt):
    """Return the text between the prompt's input and output markers.

    The result starts right after the last " **Input**:" marker and ends just
    before the first " **Output**:" marker that follows it. A missing input
    marker means the search starts at the beginning of the prompt; a missing
    output marker means everything after the input marker is returned.
    """
    _, _, after_input = prompt.rpartition(" **Input**:")
    pid, _, _ = after_input.partition(" **Output**:")
    return pid

In [32]:
# Extract all pids from the prompts
pids = [get_pid(prompt) for prompt in prompts]

In [33]:
# Add them to new dataframe column
df["pid"] = pids

In [34]:
df

Unnamed: 0,lodAIC,model,prompt,response,dateTime,stop reason,property,pid
0,pid_graph:AP0AEB083B,mistral-small,**Instruction**: You are an informat...,"AID: No Metadata Schema Prescribed, confidence...",2025-03-03 14:34:02,stop,metadata,AID
1,pid_graph:AP7D4E5ACB,mistral-small,**Instruction**: You are an informat...,"AID: Unknown, 0.5 Explanation: The Persisten...",2025-03-03 14:34:04,stop,structure,AID
2,pid_graph:APE39D4FF0,mistral-small,**Instruction**: You are an informat...,"AID: Unknown, 0.9 Explanation: The Persisten...",2025-03-03 14:34:06,stop,governance,AID
3,pid_graph:AP9364E21B,mistral-small,**Instruction**: You are an informat...,"ARK: Metadata Schema per Entity, <confidence: ...",2025-03-03 14:34:11,stop,metadata,ARK
4,pid_graph:AP9DEF623A,mistral-small,**Instruction**: You are an informat...,"ARK: Allows Managed Prefix, High Confidence E...",2025-03-03 14:34:13,stop,structure,ARK
...,...,...,...,...,...,...,...,...
1183,pid_graph:AP8AC7F009,llama-online,**Instruction**: You are an informat...,"**Z39.88-2004: No Prefix, 100%** The identifi...",2025-03-04 13:39:08,stop,structure,Z39.88-2004
1184,pid_graph:APCF11C735,llama-online,**Instruction**: You are an informat...,"Z39.88-2004: **Community Governance**, **High*...",2025-03-04 13:39:11,stop,governance,Z39.88-2004
1185,pid_graph:APDB82E23B,llama-online,**Instruction**: You are an informat...,**zbMath: 2. Common Metadata Schema for Identi...,2025-03-04 13:39:14,stop,metadata,zbMath
1186,pid_graph:AP4A7794ED,llama-online,**Instruction**: You are an informat...,Given the Persistent Identifier (PID) structur...,2025-03-04 13:39:15,stop,structure,zbMath


In [293]:
#df.to_csv("llm_output_clean.csv")

## Extract all responses

In [35]:
# Extract all answers from the responses
def get_answer(row):
    """Return the first recognised answer option mentioned in a model response.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide ``row["property"]`` (one of ``"governance"``, ``"metadata"``
        or ``"structure"``) and ``row["response"]`` (the raw response text).

    Returns
    -------
    str
        The option for that property whose exact, case-sensitive text occurs
        earliest in the response. ``"Not found"`` is returned when no option
        occurs in the response or when the response is not a string.

    Raises
    ------
    ValueError
        If ``row["property"]`` is not one of the three known properties.
    """
    options_by_property = {
        "governance": ["Community Governance", "Membership Governance", "Closed Governance", "Unknown"],
        "metadata": ["No Metadata Schema Prescribed", "Common Metadata Schema for Identifier", "Metadata Schema per Entity", "Custom/Non-Standard Metadata Schema", "Unknown"],
        "structure": ["Allows User Semantics", "Allows Managed Prefix", "No Prefix", "Predefined Identifier Structure", "Unknown"],
    }

    prop = row["property"]
    response = row["response"]

    # Fail with a clear message instead of hitting an unbound `options` variable.
    try:
        options = options_by_property[prop]
    except KeyError:
        raise ValueError(f"Unknown property: {prop!r}") from None

    # Missing responses (e.g. NaN) cannot be searched; treat them as no answer.
    if not isinstance(response, str):
        return "Not found"

    # Collect (position, option) for every option that occurs in the response.
    hits = [(response.find(option), option) for option in options if option in response]
    if not hits:
        return "Not found"

    # The option mentioned earliest wins; on equal positions min() keeps the
    # option that is listed first.
    return min(hits, key=lambda hit: hit[0])[1]


In [36]:
responses = df.response.tolist()
print(len(responses))

1188


In [37]:

# Sanity check: report every response that is not a plain string
for response in responses:
    if type(response) != str:
        print(response)
        print(type(response))
        print("not a string")
        print("")

In [38]:
filtered_answers = df.apply(get_answer, axis=1)

In [39]:
# Preview the first five extracted answers, numbered from 2.
# NOTE(review): the start value 2 presumably mirrors spreadsheet row numbers
# (row 1 being the header) — confirm.
for c, ans in enumerate(filtered_answers[:5], start=2):
    print(c, ans)

2 No Metadata Schema Prescribed
3 Unknown
4 Unknown
5 Metadata Schema per Entity
6 Allows Managed Prefix


In [40]:
df["short_response"] = filtered_answers

In [41]:
df

Unnamed: 0,lodAIC,model,prompt,response,dateTime,stop reason,property,pid,short_response
0,pid_graph:AP0AEB083B,mistral-small,**Instruction**: You are an informat...,"AID: No Metadata Schema Prescribed, confidence...",2025-03-03 14:34:02,stop,metadata,AID,No Metadata Schema Prescribed
1,pid_graph:AP7D4E5ACB,mistral-small,**Instruction**: You are an informat...,"AID: Unknown, 0.5 Explanation: The Persisten...",2025-03-03 14:34:04,stop,structure,AID,Unknown
2,pid_graph:APE39D4FF0,mistral-small,**Instruction**: You are an informat...,"AID: Unknown, 0.9 Explanation: The Persisten...",2025-03-03 14:34:06,stop,governance,AID,Unknown
3,pid_graph:AP9364E21B,mistral-small,**Instruction**: You are an informat...,"ARK: Metadata Schema per Entity, <confidence: ...",2025-03-03 14:34:11,stop,metadata,ARK,Metadata Schema per Entity
4,pid_graph:AP9DEF623A,mistral-small,**Instruction**: You are an informat...,"ARK: Allows Managed Prefix, High Confidence E...",2025-03-03 14:34:13,stop,structure,ARK,Allows Managed Prefix
...,...,...,...,...,...,...,...,...,...
1183,pid_graph:AP8AC7F009,llama-online,**Instruction**: You are an informat...,"**Z39.88-2004: No Prefix, 100%** The identifi...",2025-03-04 13:39:08,stop,structure,Z39.88-2004,No Prefix
1184,pid_graph:APCF11C735,llama-online,**Instruction**: You are an informat...,"Z39.88-2004: **Community Governance**, **High*...",2025-03-04 13:39:11,stop,governance,Z39.88-2004,Community Governance
1185,pid_graph:APDB82E23B,llama-online,**Instruction**: You are an informat...,**zbMath: 2. Common Metadata Schema for Identi...,2025-03-04 13:39:14,stop,metadata,zbMath,Common Metadata Schema for Identifier
1186,pid_graph:AP4A7794ED,llama-online,**Instruction**: You are an informat...,Given the Persistent Identifier (PID) structur...,2025-03-04 13:39:15,stop,structure,zbMath,No Prefix


In [42]:
df = df.drop(columns=["prompt", "response", "dateTime", "stop reason"])
df.to_csv("llm_output_small.csv")

In [43]:
#df.to_csv("llm_output_rich.csv")

### Pivot table 

In [100]:
# Pivot the dataframe to have each model's response as a separate column.
# Each (property, pid) pair becomes one row; pivot raises a ValueError if a
# (property, pid, model) combination occurs more than once.
df_pivot = df.pivot(index=["property", "pid"], columns="model", values="short_response").reset_index()

# Rename columns for clarity (optional)
df_pivot.columns.name = None  # remove the "model" label left on the column axis by pivot
df_pivot = df_pivot.rename(columns={
    'mistral-small': 'mistral-small-response',
    'llama-online': 'llama-online-response',
    'sonar-pro': 'sonar-pro-response'
})


In [101]:
df_pivot

Unnamed: 0,property,pid,llama-online-response,mistrall-small-response,sonar-pro-response
0,governance,AID,Unknown,Unknown,Unknown
1,governance,ARK,Community Governance,Community Governance,Community Governance
2,governance,BibCode,Community Governance,Community Governance,Closed Governance
3,governance,Bitbucket,Unknown,Community Governance,Closed Governance
4,governance,Book ID,Community Governance,Unknown,Unknown
...,...,...,...,...,...
391,structure,ePIC,Allows Managed Prefix,Allows Managed Prefix,Allows Managed Prefix
392,structure,re3data,Predefined Identifier Structure,Not found,Allows Managed Prefix
393,structure,rs number,Allows User Semantics,Allows Managed Prefix,Predefined Identifier Structure
394,structure,swMath,Predefined Identifier Structure,Allows Managed Prefix,Unknown


### Agreement scores 
The agreement score takes a value from 1 to 3, with the following meaning:
- `3`: All three models give the same response
- `2`: Exactly two models give the same response
- `1`: All three models give different responses

An additional value, `3: Unknown`, is used when all three models give the response `Unknown`, i.e. they agree that the property information is unknown.

In [147]:
def get_agreement(row):
    """Score how many of the three model responses in a pivot row coincide.

    The responses are read by position (columns 2, 3 and 4 of the row).

    Returns
    -------
    "3: Unknown" when all three responses are "Unknown", 3 when all three are
    equal to some other value, 2 when exactly two are equal, and 1 when all
    three differ.
    """
    distinct = {row.iloc[position] for position in (2, 3, 4)}
    n_distinct = len(distinct)

    if n_distinct == 1:
        return "3: Unknown" if "Unknown" in distinct else 3
    # Two distinct values -> score 2, three distinct values -> score 1.
    return {2: 2, 3: 1}.get(n_distinct)

In [148]:
df_pivot["agreement"] = df_pivot.apply(get_agreement, axis=1) 

In [149]:
df_pivot

Unnamed: 0,property,pid,llama-online-response,mistrall-small-response,sonar-pro-response,agreement
0,governance,AID,Unknown,Unknown,Unknown,3: Unknown
1,governance,ARK,Community Governance,Community Governance,Community Governance,3
2,governance,BibCode,Community Governance,Community Governance,Closed Governance,2
3,governance,Bitbucket,Unknown,Community Governance,Closed Governance,1
4,governance,Book ID,Community Governance,Unknown,Unknown,2
...,...,...,...,...,...,...
391,structure,ePIC,Allows Managed Prefix,Allows Managed Prefix,Allows Managed Prefix,3
392,structure,re3data,Predefined Identifier Structure,Not found,Allows Managed Prefix,1
393,structure,rs number,Allows User Semantics,Allows Managed Prefix,Predefined Identifier Structure,1
394,structure,swMath,Predefined Identifier Structure,Allows Managed Prefix,Unknown,1


In [150]:
#df_pivot.to_csv("llm_evaluation.csv")

#### Agreement count

In [151]:
# Agreement count of all properties
counts = df_pivot["agreement"].value_counts() 

counts

agreement
2             218
1             142
3              28
3: Unknown      8
Name: count, dtype: int64

In [152]:
# Build one sub-frame per property, then tally the agreement values in each
df_governance, df_metadata, df_structure = (
    df_pivot.loc[df_pivot["property"].eq(property_name)]
    for property_name in ("governance", "metadata", "structure")
)

counts_governance, counts_metadata, counts_structure = (
    frame["agreement"].value_counts()
    for frame in (df_governance, df_metadata, df_structure)
)

In [156]:
# Agreement count for governance property  

counts_governance

agreement
2             73
1             36
3             16
3: Unknown     7
Name: count, dtype: int64

In [157]:
# Agreement count for metadata property

counts_metadata

agreement
2    77
1    50
3     5
Name: count, dtype: int64

In [155]:
# Agreement count for structure property

counts_structure

agreement
2             68
1             56
3              7
3: Unknown     1
Name: count, dtype: int64