In [None]:
import pandas as pd
import time
import re
from huggingface_hub import InferenceClient
from termcolor import colored
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import textwrap
import threading
from tqdm.notebook import tqdm
import numpy as np
import itertools
from collections import Counter

In [None]:
# Load the sample of papers to be screened and preview its first two rows
sample_csv_path = '../1_data/sample1000_papers.csv'
df_sample = pd.read_csv(sample_csv_path)
df_sample.head(n=2)

In [None]:
# Tally how often each value occurs in the "selected" column
selected_column = df_sample["selected"]
selected_column.value_counts()

In [None]:
def correct_llama_parallel(words, client, y, responses, progress_bar, system, instruct, indexes_analized, indexes_errors,
                           max_consecutive_errors=50, max_invalid_responses=5):
    """Ask the LLM for an include/exclude verdict on every item of ``words`` (one worker's share).

    Results are written in place into the shared lists at slot ``y`` so the
    function can be used as a ``threading.Thread`` target.

    Parameters
    ----------
    words : iterable of (index, text) pairs
        ``text`` is substituted for ``{word}`` in ``instruct``.
    client : object exposing ``text_generation(prompt, **kwargs)`` that returns the reply text.
    y : int
        Worker number; slot used in ``responses``, ``indexes_analized`` and ``indexes_errors``.
    responses : list
        ``responses[y]`` is set to ``[y, replies]`` where ``replies`` holds one string per
        processed item, in input order. A reply that never contained a parseable answer
        is stored prefixed with ``"ERROR: "``.
    progress_bar : object exposing ``update(n)`` and ``close()``.
    system, instruct : str
        System message and user-message template (must contain ``{word}``).
    indexes_analized : list
        ``indexes_analized[y]`` is set to the indexes that were actually sent to the model.
    indexes_errors : list
        ``indexes_errors[y]`` is set to the indexes whose reply was stored as an error.
    max_consecutive_errors : int, default 50
        The worker stops once more than this many items in a row have failed.
    max_invalid_responses : int, default 5
        An item is given up on once more than this many replies lacked a parseable answer.

    Notes
    -----
    Exceptions raised by the client are retried without limit (after sleeping 10 minutes
    for rate-limit/connection errors, 5 seconds otherwise).
    """
    # Chat-style prompt with system / user / assistant header tokens.
    template = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system}<|eot_id|><|start_header_id|>user<|end_header_id|>{user}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    # Accepts e.g. "Answer=*included*", "answer: excluded", "**Answer**\n= included".
    answer_pattern = re.compile(r'answer\**\s*\n*[=|:]*\s*\**(included|excluded)\**')

    response = []
    consecutives_errors = 0

    indexes_viewed = []
    indexes_errors_core = []

    try:
        for index, word in words:
            # Stop when the model keeps producing unusable replies (it sometimes starts
            # to hallucinate); the remaining items are left unprocessed.
            if consecutives_errors > max_consecutive_errors:
                print("Too many consecutive errors. Interrupting...")
                break

            # Recorded only now, so the index list matches the stored replies one-to-one.
            indexes_viewed.append(index)

            prompt = template.format(system=system, user=instruct.format(word=word))

            out = None
            failed = False
            error_sum = 0

            # Repeat until a reply with a parseable answer arrives or the item is given up on.
            while out is None:
                try:
                    out = client.text_generation(prompt, max_new_tokens=500, temperature=0.001, do_sample=False, top_p=0.01, top_k=1)
                    match = answer_pattern.findall(out.lower())
                except Exception as e:
                    # Rate limits / connection exhaustion need a long pause; anything else a short one.
                    if "Rate limit reached." in str(e) or "Max retries exceeded with url" in str(e):
                        print(str(e))
                        print("Sleeping 10 min at", time.strftime("%Y-%m-%d %H:%M:%S"))
                        time.sleep(600)
                    else:
                        print("\n\nSleeping 5 seconds\n\n")
                        print(str(e))
                        time.sleep(5)
                    out = None
                    continue

                if not match:
                    error_sum += 1

                    if error_sum > max_invalid_responses:
                        print("Too many errors, skipping")
                        out = "ERROR: " + out
                        print(out)
                        failed = True
                        break

                    # Ask again.
                    out = None

            # Track failures with an explicit flag: a valid reply may legitimately
            # contain the word "ERROR" and must not count towards the break above.
            if failed:
                consecutives_errors += 1
                indexes_errors_core.append(index)
            else:
                consecutives_errors = 0

            response.append(out)
            progress_bar.update(1)
    finally:
        # Always publish what was collected (also on early stop or unexpected failure).
        indexes_errors[y] = indexes_errors_core
        indexes_analized[y] = indexes_viewed
        responses[y] = [y, response]
        progress_bar.close()

# System message used for every request: frames the model as an assistant that
# screens scientific articles on losses in decision-making.
system = """You are a helpful assistant with the task of deciding whether or not to select scientific articles in the context of losses in decision-making \n\n""" 

# User-message template. It is a str.format template: the only placeholder is
# {word}, which receives the article text. The "Response Format" step asks for
# "Answer=*included*" or "Answer=*excluded*", which is what the answer regex
# in this notebook looks for.
instruct = """Based on the following definitions, criteria, rules, and reasoning instructions, decide whether to include or exclude the following article.


*Definitions*
- Losses: A negative outcome or detrimental event perceived or experienced by an individual. Whether an outcome or event is negative or detrimental and represents a loss ultimately depends on the authors' exposition. 
- Decision-Making: The process by which an individual evaluates options and makes choices among them. This includes but is not limited to: 
    a) Deciding between options
    b) Evaluating or judging options
    c) Assessing risks and benefits
    d) Weighing options

    
*Criteria*
- Loss Focus: The article explicitly discusses losses and their role in decision-making as an important theme.
- Individual Level: The article discusses losses related to individual/s' perceptions and experiences.
- Irrelevant Contexts: The article mentions "loss" in contexts unrelated to individual decision-making, such as technical discussions on loss functions in statistical models. 
- Secondary Treatment of Losses: The article discusses losses but they are tangential or unimportant for the article's subject matter.
- Secondary Treatment of Decision-Making: The article mentions decision-making but does NOT explicitly analyze it.

*Rules*

- Inclusion rule: Include the article if the definition of Loss Focus and Individual Level are met.
- Exclusion rule: Exclude the article if 1) it does not meet Loss Focus and Individual Level criteria, OR 2) if it meets ANY of the following criteria: Irrelevant Contexts, Secondary Treatment of Losses, or Secondary Treatment of Decision-Making.

*Reasoning steps*

Step 1. Evaluate Definitions: Assess whether the article meets the definitions.
Step 2. Apply Criteria: Analyze inclusion and exclusion criteria in the context of Step 1.
Step 3. Determine Outcome: Decide whether to include or exclude the article based on Step 2.
Step 4. Response Format: Return your response in the *exact* following format: Answer=*included* or *excluded*

Scientific article:
{word}\n\n"""

In [None]:
def split_list(lst, n_parts):
    """Cut ``lst`` into ``n_parts`` consecutive slices whose lengths differ by at most one.

    The first ``len(lst) % n_parts`` slices receive the extra element. When
    there are fewer elements than parts, the trailing slices are empty.

    Returns a list of ``n_parts`` slices of ``lst``, in original order.
    """
    base_size, leftover = divmod(len(lst), n_parts)

    chunks = []
    cursor = 0
    for part_no in range(n_parts):
        # Early parts absorb the remainder, one extra element each.
        size = base_size + 1 if part_no < leftover else base_size
        chunks.append(lst[cursor:cursor + size])
        cursor += size
    return chunks

In [None]:
# First pass: classify every paper, spreading the work over one thread per API token.
threads = []
workers = 5
n_responses = workers

# Per-worker output slots, filled in place by correct_llama_parallel.
llama_resp = [[] for _ in range(workers)]
indexes_analized = [[] for _ in range(workers)]
indexes_errors = [[] for _ in range(workers)]

# (row index, prompt text) for each paper, then one contiguous slice per worker.
candidates_total = [(i, f"Title: {row['Title']}\nAbstract: {row['Abstract']}") for i, row in df_sample.iterrows()]
candidates = split_list(candidates_total, workers)

# Create as many clients as workers, and as many workers as tokens you have
# (or as many different runs as you want).
clients = [
    InferenceClient(token=token,
                    model="meta-llama/Meta-Llama-3.1-70B-Instruct",
                    headers={"X-use-cache": "false"})
    for token in (TOKEN1, TOKEN2, TOKEN3, TOKEN4, TOKEN5)
]
# The individual names are kept because the retry cell looks the clients up by name.
client1, client2, client3, client4, client5 = clients

progress_bars = [tqdm(total=len(candidates[j]), desc=f"Progress {j}", leave=True) for j in range(len(candidates))]


for y in range(workers):
    # Index the client list directly instead of eval()-ing a variable name.
    thread = threading.Thread(target=correct_llama_parallel, args=(candidates[y], clients[y], y, llama_resp, progress_bars[y], system, instruct, indexes_analized, indexes_errors))
    thread.start()
    threads.append(thread)


for thread in threads:
    thread.join()


# Each slot holds [worker number, replies]; concatenate the replies in worker order,
# which is also the row order of df_sample because the slices are contiguous.
llama_resp_no_core = list(itertools.chain.from_iterable(resp[1] for resp in llama_resp))

Let's extract the include/exclude answer from each response and record which responses are errors.

In [None]:
# Reduce every raw reply to a label: "included", "excluded" or "error".
# Exactly one label is appended per reply so that llama_resp_clean stays aligned
# with llama_resp_no_core (and therefore with the rows of df_sample).
answer_pattern = re.compile(r'answer\**\s*\n*[=|:]*\s*\**(included|excluded)\**')

llama_resp_clean = []
errors = []  # positions in llama_resp_no_core that need to be retried

for i, resp in enumerate(llama_resp_no_core):
    # correct_llama_parallel marks a failed item with an "ERROR: " prefix; the
    # "LAST ERROR" marker is still honoured for replies produced elsewhere.
    flagged = resp.startswith("ERROR: ") or "LAST ERROR" in resp
    match = [] if flagged else answer_pattern.findall(resp.lower())

    if match:
        llama_resp_clean.append(match[0])
    else:
        # A reply without a parseable answer is an error too, never a silent skip.
        llama_resp_clean.append("error")
        errors.append(i)


print(Counter(llama_resp_clean))
print(errors)
print(len(errors))

Correct the errors: re-run the model on the papers whose responses were flagged as errors.

In [None]:
# Second pass: re-run only the papers whose first reply was flagged as an error.
# NOTE(review): this prompt has a space before "Abstract:" that the first pass
# does not have; left untouched in case the difference is intentional.
error_positions = set(errors)
candidates_errors = split_list([(index, f"Title: {row['Title']}\n Abstract: {row['Abstract']}") for index, row in df_sample.iterrows() if index in error_positions], workers)
corrected_errors = [[] for _ in range(workers)]
indexes_analized = [[] for _ in range(workers)]
indexes_errors = [[] for _ in range(workers)]

progress_bars = [tqdm(total=len(candidates_errors[j]), desc=f"Progress {j}", leave=True) for j in range(len(candidates_errors))]

# Look the clients up through a list rather than eval()-ing a variable name.
retry_clients = [client1, client2, client3, client4, client5]

# Start from an empty list so only the threads of this pass are joined.
threads = []
for y in range(workers):
    thread = threading.Thread(target=correct_llama_parallel, args=(candidates_errors[y], retry_clients[y], y, corrected_errors, progress_bars[y], system, instruct, indexes_analized, indexes_errors))
    thread.start()
    threads.append(thread)


for thread in threads:
    thread.join()

# Flatten the list of retried replies (each slot is [worker number, replies])
corrected_errors_no_core = list(itertools.chain.from_iterable(worker_out[1] for worker_out in corrected_errors))

# Write every retried reply back to the position of its paper. Pairing each worker's
# replies with that worker's own candidates keeps the mapping right even if a
# worker stopped early and returned fewer replies than it was given.
for worker_candidates, worker_out in zip(candidates_errors, corrected_errors):
    for (index, _), resp in zip(worker_candidates, worker_out[1]):
        llama_resp_no_core[index] = resp

# Re-derive the labels from the updated replies; otherwise the corrections would
# never reach llama_resp_clean. One label per reply keeps it aligned with df_sample.
answer_pattern = re.compile(r'answer\**\s*\n*[=|:]*\s*\**(included|excluded)\**')
llama_resp_clean = []
for resp in llama_resp_no_core:
    flagged = resp.startswith("ERROR: ") or "LAST ERROR" in resp
    match = [] if flagged else answer_pattern.findall(resp.lower())
    llama_resp_clean.append(match[0] if match else "error")

In [None]:
# Attach the cleaned LLM decisions as the "selected_llm" column (one value per row)
df_sample = df_sample.assign(selected_llm=llama_resp_clean)
df_sample

In [None]:
# Persist the labelled sample; this is the same path the sample was read from, so the file is overwritten.
df_sample.to_csv("../1_data/sample1000_papers.csv", index=False)

# Persist the raw model output.
# NOTE(review): llama_resp is the per-worker structure ([worker number, replies]), so this
# writes one row per worker rather than one row per paper — confirm that is intended.
reasons_frame = pd.DataFrame(llama_resp)
reasons_frame.to_csv("../1_data/reasons_normalization_papers.csv", sep=',', index=False)

In [None]:
# Integer-encode the labels of the human-reviewed rows. The encoder is fitted on the
# human labels and then reused for the LLM labels, so both share the same codes.
# NOTE(review): presumably the two classes end up as 0 (negative) and 1 (positive) —
# confirm against the values actually stored in "selected" and "selected_llm".
le = LabelEncoder()
is_human_labeled = df_sample["human_labeled"] == 1
human_labels = le.fit_transform(df_sample.loc[is_human_labeled, "selected"])
llm_labels = le.transform(df_sample.loc[is_human_labeled, "selected_llm"])

In [None]:
# Agreement between the human labels (reference) and the LLM labels (prediction).
precision = precision_score(human_labels, llm_labels)
recall = recall_score(human_labels, llm_labels)
# NOTE(review): F1 is class-weighted while precision and recall above use the default
# binary averaging, so it is not the harmonic mean of the two printed values — confirm intent.
f1 = f1_score(human_labels, llm_labels, average='weighted')
accuracy = accuracy_score(human_labels, llm_labels)

# Pin the label set so the matrix is always 2x2, even if one class never occurs;
# otherwise the fixed row/column names below would not fit.
conf_matrix = confusion_matrix(human_labels, llm_labels, labels=[0, 1])
conf_matrix_df = pd.DataFrame(conf_matrix, index=["Human: no", "Human: yes"], columns=["LLM: no", "LLM: yes"])


# Output the results
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix_df)

In [None]:
# Print every human-labelled paper on which the LLM disagrees with the human,
# together with the model's full reply.
error = 0
for position, (i, row) in enumerate(df_sample.iterrows()):
    if row["human_labeled"] != 1 or row["selected"] == row["selected_llm"]:
        continue

    print(f"Error {error + 1}:")
    print(f"Index: {i}")
    # Single quotes inside the f-strings: reusing the outer quote is a syntax error before Python 3.12.
    print(f"Title: {row['Title']}")
    # str() guards against a missing abstract (NaN), which textwrap cannot wrap.
    print(f"Abstract: {textwrap.fill(str(row['Abstract']), width=150)}")
    print()
    print("Human: ", row["selected"], "LLM: ", row["selected_llm"])
    # The per-paper replies live in the flat list (row order); llama_resp only holds
    # one [worker number, replies] entry per worker.
    print(f"Reason LLM: {textwrap.fill(llama_resp_no_core[position], width=150)}\n")
    print("-----------------------------")
    error += 1

In [None]:
# For rows without a human label, rename the LLM decisions: "included" -> "yes", "excluded" -> "no"
not_human_labeled = df_sample["human_labeled"] == 0
renamed_decisions = df_sample.loc[not_human_labeled, "selected_llm"].replace({"included": "yes", "excluded": "no"})
df_sample.loc[not_human_labeled, "selected_llm"] = renamed_decisions

In [None]:
# Tally how often each value occurs in the "selected_llm" column
llm_decisions = df_sample["selected_llm"]
llm_decisions.value_counts()

In [None]:
# Write the final frame back to the sample CSV, without the row index
final_csv_path = "../1_data/sample1000_papers.csv"
df_sample.to_csv(final_csv_path, index=False)