In [1]:
import pandas as pd
import time
import re
from huggingface_hub import InferenceClient
from termcolor import colored
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import textwrap

In [37]:
# Read data
df_sample = pd.read_csv('../1_data/sample1000_papers.csv')
df_sample.head(2)

Unnamed: 0,ID,Title,Source title,Abstract,Author Keywords,selected,human_labeled,selected_llm
0,72037,How Germany is phasing out lignite: insights f...,"Energy, Sustainability and Society",Background: This article asks the following qu...,Coal Commission; Energy justice; Energy transi...,no,1,no
1,40296,Adam Smith’s Theory of Prudence Updated with N...,Neuroethics,"Other-perspective taking (OPT), distancing, ti...",Distancing; Episodic future thinking; Loss ave...,yes,1,yes


In [3]:
# Count the instances of selected
df_sample["selected"].value_counts()

selected
no     265
yes     36
Name: count, dtype: int64

In [12]:
client = InferenceClient(token="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",
                         model="meta-llama/Meta-Llama-3.1-70B-Instruct",
                         headers={"X-use-cache": "false"})

def correct_llama(words, sleep=0.5, system="", instruct=""):
    template = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system}<|eot_id|><|start_header_id|>user<|end_header_id|>{user}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    
    responses = []
    for i, word in enumerate(words):
        time.sleep(sleep)
        user = instruct.format(word=word)
        prompt = template.format(system=system, user=user)
        
        out = None
        error_sum = 0
        while out is None:
            try:
                out = client.text_generation(prompt, max_new_tokens=250, temperature = 0.001, do_sample = False)
            except Exception as e:
                print("\n\nsleeping 5\n\n")
                print (str(e))
                time.sleep(5)
                if "You reached PRO hourly usage limit" in str(e):
                    print("Sleeping at", time.strftime("%Y-%m-%d %H:%M:%S"))
                    time.sleep(60 * 60)
                out = None
            if out is not None and not re.findall(r'([A-Za-z]+.*)\*(yes|no*?)\*', out.lower()):
                error_sum += 1
                if error_sum > 10:
                    print("Too many errors, skipping")
                    out = "ERROR: " + out
                    break
                out = None
            
        
        responses.append(out)
        print(colored(f"{i} - {word}", "red"), ":\n\n", re.sub(r"\n", "", out), "\n\n\n")
    
    return responses


system = """You are a helpful assistant with the task of selecting (yes/no) scientific articles in the context of losses in decision-making"""
instruct = """Based on the next definitions, select articles that study losses in the context of decision-making.
Definition Losses: A negative outcome or detrimental event perceived or experienced by an individual. Whether an outcome or event is negative or detrimental and represents a loss ultimately depends on the exposition of the authors. 
Decision-making: The process by which an individual evaluates options and makes choices among them.
Decision-making includes but is not limited to:
- Deciding between options
- Evaluating or judging options
- Assessing risks and benefits
- Weighing options

Inclusion Criteria:
- Loss Focus: The article must explicitly discuss losses and their role in decision-making as an important theme.
- Individual Level: The losses considered must relate to individuals’ perceptions and experiences.

Exclusion Criteria:
- Irrelevant Contexts: Articles that mention "loss" in contexts unrelated to individual decision-making, such as technical discussions on loss functions in statistical models.
- Secondary treatment of losses: Articles in which losses are a tangential or unimportant subject matter.

Please, for the next title and abstract, reason why the article should or should not be selected following the criteria. Follow this format: Reason: *Reason*\nAnswer selected: *yes* or *no*. \n\n

{word}"""

# Create the list with the desired format only if 'selected' is 'yes'
candidates = [f"Title: {row['Title']}\n Abstract: {row['Abstract']}" 
               for index, row in df_sample.iterrows() if index < 301]
llama_resp = correct_llama(candidates, sleep = .1, system = system, instruct = instruct)

[31m0 - Title: How Germany is phasing out lignite: insights from the Coal Commission and local communities
 Abstract: Background: This article asks the following question: how well are coal regions, affected by phase-out plans, represented in mediating commissions, to what extent do local communities participate in the decision-making process and how are the political negotiations perceived by the communities? We look at the case of the German lignite phase-out from a procedural justice perspective. Informed by literature on sociotechnical decline and procedural justice in energy transitions, we focus first on aspects of representation, participation and recognition within the German Commission on Growth, Structural Change and Employment (“Coal Commission”). Second, we analyze how to exnovate coal in two regions closely tied to the coal- and lignite-based energy history in Germany: Lusatia and the Rhenish Mining District. Results: Based on interview series in both regions, we connect 

In [13]:
from collections import Counter
llama_resp_clean = []
for i, resp in enumerate(llama_resp):
    match = re.findall(r'\*(yes|no)\*', resp.lower())
    if match:
        llama_resp_clean.append(match[0].lower())
    else:
        llama_resp_clean.append("error")
        print(f"Error in {i + 1}")
        print(f"Title: {df_sample.loc[i, "Title"]}")
        print(f"Abstract: {textwrap.fill(df_sample.loc[i, "Abstract"], width=150)}")
        print(llama_resp[i])
        print("-----------------------------")

print(len(llama_resp_clean), len(llama_resp))
Counter(llama_resp_clean)


Error in 279
Title: Comparison of angular frequency contrast sensitivity in young and older adults
Abstract: The aim of the present study was to measure contrast sensitivity curves for angular frequencies in the range between 2 and 96 cycles/360° in older
human adult volunteers and to compare these measurements with the more usual contrast sensitivity functions for sine-wave gratings. All subjects were
free of identifiable ocular disease and had normal acuity. We measured the contrast thresholds for young adults (N = 6; age range, 20-26 years) and
older adults (N = 6; age range, 60-67 years) using the psychophysical forced-choice method. In this paradigm the volunteers had to choose the stimulus
containing a test frequency at low contrast (e.g., either a sine-wave grating or an angular frequency stimulus), or another neutral stimulus at mean
luminance (without any contrast). Older adults presented a loss in contrast sensitivity at high and medium angular frequencies compared to the you

Counter({'no': 263, 'yes': 37, 'error': 1})

In [25]:
df_sample.loc[df_sample["human_labeled"] == 1, "selected_llm"] = llama_resp_clean
df_sample.loc[278, "selected_llm"] = "no"
Counter(df_sample.selected_llm)


Counter({'no': 892, 'yes': 108})

In [45]:
# Convert "yes"/"no" to 1/0 using LabelEncoder
le = LabelEncoder()
human_labels = le.fit_transform(df_sample.loc[df_sample["human_labeled"] == 1, "selected"])  # Encodes 'yes' as 1 and 'no' as 0
llm_labels = le.transform(df_sample.loc[df_sample["human_labeled"] == 1, "selected_llm"])

# Calculate the metrics
precision = precision_score(human_labels, llm_labels)
recall = recall_score(human_labels, llm_labels)
f1 = f1_score(human_labels, llm_labels, average='weighted')
accuracy = accuracy_score(human_labels, llm_labels)
conf_matrix = confusion_matrix(human_labels, llm_labels)

# Output the results
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)

Precision: 0.89
Recall: 0.87
F1-Score: 0.97
Accuracy: 0.97
Confusion Matrix:
[[259   4]
 [  5  33]]


In [39]:
for i in range(len(df_sample)):
    if df_sample.loc[i, "selected"] != df_sample.loc[i, "selected_llm"] and df_sample.loc[i, "human_labeled"] == 1:
        print(f"Index: {i}")
        print(f"Title: {df_sample.loc[i, "Title"]}")
        print(f"Abstract: {textwrap.fill(df_sample.loc[i, "Abstract"], width=150)}")
        print()
        print("Human: ", df_sample.loc[i, "selected"], "LLM: ", df_sample.loc[i, "selected_llm"])
        print(f"Reason: {textwrap.fill(llama_resp[i], width=150)}")
        print("-----------------------------")

Index: 104
Title: Maximum-loss, minimum-win and the Esscher pricing principle
Abstract: Maximum-loss (Max-loss) was recently introduced as a valuation functional in the context of systematic stress testing. The basic idea is to value a
(financial) random variable by its worst case expectation, where the most unfavourable probability measure - the 'worst case distribution' - lies
within a given Kullback-Leibler radius around a previously estimated distribution. The article gives an overview of the properties of this measure and
analyses relations to other risk and acceptability measures and to the well-known Esscher pricing principle, used in insurance mathematics and option
pricing. The main part of the article focuses then on optimal decision-making - in particular related to portfolio optimization - with Max-loss as the
objective function to be minimized. A simple algorithm for dealing with the resulting saddle point problem is introduced and analysed. The authors
2011. Published by 

In [28]:
pd.DataFrame(llama_resp).to_csv("../1_data/reasons_normalization_papers.csv", sep=',',index=False)

In [40]:
# Predict for the remaining articles
candidates = [f"Title: {row['Title']}\n Abstract: {row['Abstract']}" 
               for index, row in df_sample.iterrows() if row['human_labeled'] == 0]
llama_resp_rest = correct_llama(candidates, sleep = .1, system = system, instruct = instruct)

[31m0 - Title: Thermoplastics show their true mettle as shafts for golf clubs
 Abstract: Perfecting one's swing has long been the golfer's dream. Thermo-Plastic carbon composite golf club shafts now meet the player halfway-placing a larger range of swing styles in the "perfect" category. When selecting a club from among the woods and irons, today's golfer has a new option- thermoplastics. Manufacture of the Thermo-Composite™ golf club shaft begins at the Taunton, Massachusetts, facility of Phoenixx TPC with a prepreg tape that has the graphite:resin ratio of 65:35. Phoenixx uses either linear polyphenylene sulphide (PPS) - Fortron® 0214B1 (by Ticona of Summit, New Jersey) - or polyamide 6 as the matrix resin. Various suppliers, including Hexcel and Toray, provide the carbon fibre reinforcement. Phoenixx, in turn, provides the thermoplastic prepreg tape to makers of industrial components-blades and vanes, bicycle components, and other products. When it began making the shafts in 1998, 

In [41]:
llama_resp_rest_clean = []
for i, resp in enumerate(llama_resp_rest):
    match = re.findall(r'\*(yes|no)\*', resp.lower())
    if match:
        llama_resp_rest_clean.append(match[0].lower())
    else:
        llama_resp_rest_clean.append("error")
        print(f"Error in {i + 1}")
        print(f"Title: {df_sample.loc[i, "Title"]}")
        print(f"Abstract: {textwrap.fill(df_sample.loc[i, "Abstract"], width=150)}")
        print(llama_resp[i])
        print("-----------------------------")

print(len(llama_resp_rest_clean), len(llama_resp_rest))
Counter(llama_resp_rest_clean)

699 699


Counter({'no': 611, 'yes': 88})

In [42]:
df_sample.loc[df_sample["human_labeled"] == 0, "selected_llm"] = llama_resp_rest_clean

In [43]:
# Count the instances of selected
df_sample["selected_llm"].value_counts()

selected_llm
no     875
yes    125
Name: count, dtype: int64

In [44]:
df_sample.to_csv('../1_data/sample1000_papers.csv', index=False)