In [31]:
import ast
import os
import textwrap
import time

import numpy as np
import pandas as pd
import requests
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split, cross_validate
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the sample file that already carries an `embeddings` column, then preview it
df_sample = pd.read_csv(filepath_or_buffer='../1_data/sample1000_papers_embeddings.csv')
df_sample.head(n=5)

Unnamed: 0,ID,Title,Source title,Abstract,Author Keywords,selected,human_labeled,selected_llm,text,embeddings
0,72037,How Germany is phasing out lignite: insights f...,"Energy, Sustainability and Society",Background: This article asks the following qu...,Coal Commission; Energy justice; Energy transi...,no,1,no,Title: How Germany is phasing out lignite: ins...,"[-0.005838972516357899, -0.08225536346435547, ..."
1,40296,Adam Smith’s Theory of Prudence Updated with N...,Neuroethics,"Other-perspective taking (OPT), distancing, ti...",Distancing; Episodic future thinking; Loss ave...,yes,1,yes,Title: Adam Smith’s Theory of Prudence Updated...,"[-0.05494213104248047, -0.24082590639591217, 0..."
2,18690,The effects of personal experience on choice-b...,International Journal of Wildland Fire,"In this paper, we investigate homeowner prefer...",expected utility; heuristics; natural disaster...,yes,1,yes,Title: The effects of personal experience on c...,"[0.09921591728925705, 0.5046013593673706, -0.0..."
3,21665,"Multi-class HingeBoost*, method and applicatio...",Methods of Information in Medicine,Background: Multi-class molecular cancer class...,Boosting; Classification; Regression trees; Sm...,no,1,no,"Title: Multi-class HingeBoost*, method and app...","[0.10821573436260223, 0.10324213653802872, 0.1..."
4,43401,Oesophageal biopsies are insufficient to predi...,United European Gastroenterology Journal,Background: Endoscopic resection (ER) with or ...,Accuracy; Barrett; biopsy; histology,no,1,no,Title: Oesophageal biopsies are insufficient t...,"[0.21208856999874115, -1.1365407705307007, 0.2..."


In [6]:
# Create a new column concatenating the title and abstract.
# A missing Title/Abstract is NaN, and NaN propagates through string `+`, which would
# turn the whole text into NaN (and NaN would then be sent to the embedding API).
# Missing parts are therefore replaced by an empty string before concatenating.
df_sample['text'] = (
    "Title: " + df_sample['Title'].fillna('')
    + '\nAbstract: ' + df_sample['Abstract'].fillna('')
)

In [None]:
# Inference endpoint used by get_embedding()
API_URL = "https://api-inference.huggingface.co/models/WhereIsAI/UAE-Large-V1"

# The access token is read from the environment so that it is never stored in the notebook.
# SECURITY: the tokens that were previously hardcoded in this cell have been exposed and
# should be revoked. Set the variable before starting Jupyter, e.g. `export HF_TOKEN=hf_...`.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    # Fail here, with a clear message, rather than later inside the request loop
    raise RuntimeError("Set the HF_TOKEN environment variable to your Hugging Face access token.")
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

def get_embedding(text):
    """Request the embedding of ``text`` from the endpoint at ``API_URL``.

    The request is repeated for as long as the service answers with status 503 or 429,
    or returns a body that cannot be decoded as JSON.

    Args:
        text: Value sent as the ``inputs`` field of the JSON request body.

    Returns:
        The decoded JSON body of the first successful response.

    Raises:
        requests.exceptions.HTTPError: If the service answers with an error status other
            than 503 or 429. Retrying such a request would loop forever without progress.
        Exception: Anything raised by ``requests.post`` itself (no response was received)
            is propagated unchanged.
    """
    while True:
        # Kept outside the try blocks: if this call raises there is no `response`
        # object whose status code could be inspected.
        response = requests.post(API_URL, headers=headers, json={"inputs": text})

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            if response.status_code == 503:
                print(f"Request error: {e}\nRetrying in 5 seconds...")
                time.sleep(5)
            elif response.status_code == 429:
                print(f"Request error: {e}\nSleeping 1hour...")
                time.sleep(60*60)
            else:
                # Not a retryable status: surface it instead of spinning silently
                raise
            continue

        try:
            return response.json()
        except ValueError as e:
            # Successful status but undecodable body: wait as announced, then retry
            print(f"JSON decode error: {e}. Retrying in 5 seconds...")
            time.sleep(5)

In [58]:
# Embed every sample text through the API, then write the enriched sample back to disk
sample_texts = df_sample["text"]
df_sample['embeddings'] = sample_texts.apply(get_embedding)
df_sample.to_csv(path_or_buf='../1_data/sample1000_papers_embeddings.csv', index=False)

In [None]:
# Read the whole dataset
df = pd.read_csv('../1_data/all_papers_2024-10-02.csv')

# Drop the columns that are not needed
df = df.drop(['Source title', 'Author Keywords'], axis=1)
# Create the text. Missing titles/abstracts are NaN and NaN propagates through string `+`,
# so they are replaced by '' to keep the remaining part of the text usable.
df['text'] = "Title: " + df['Title'].fillna('') + '\nAbstract: ' + df['Abstract'].fillna('')
print(df.shape)

# Keep only the papers that are not in the sample.
# .copy() makes this an independent frame, so later cells can add and fill columns
# without writing into a slice of `df` (SettingWithCopyWarning).
df_nosample = df[~df.ID.isin(df_sample.ID.values)].copy()

In [None]:
# Get embeddings for the whole dataset in batches. The complete frame is saved after
# every batch so that an interrupted run can be resumed from `starting_batch`.
batch_size = 300
starting_batch = 118  # index of the first batch to process; use 0 for a fresh run

if 'embeddings' not in df_nosample.columns:
    if starting_batch > 0:
        # Creating an empty column here and saving would overwrite the embeddings of the
        # batches that were already processed in an earlier run.
        raise RuntimeError(
            "Resuming from a batch > 0 requires df_nosample to already contain the "
            "'embeddings' column: load '../1_data/all_papers_2024-10-02_embeddings.csv' "
            "first, or set starting_batch = 0."
        )
    # Fresh run: object column that will receive one list per row
    df_nosample['embeddings'] = None

# Column position is loop-invariant, so look it up once
embeddings_col = df_nosample.columns.get_loc('embeddings')

for i in range(starting_batch*batch_size, len(df_nosample), batch_size):
    print(f"Processing batch {i//batch_size} from {i} to {i+batch_size}")
    batch_embeddings = df_nosample["text"].iloc[i:i+batch_size].apply(get_embedding).tolist()
    # Wrap in a Series carrying the original index labels so the values align row by row
    df_nosample.iloc[i:i+batch_size, embeddings_col] = pd.Series(batch_embeddings, index=df_nosample.index[i:i+batch_size])
    df_nosample.to_csv('../1_data/all_papers_2024-10-02_embeddings.csv', index=False)


Processing batch 118 from 35400 to 35700
Request error: 503 Server Error: Service Unavailable for url: https://api-inference.huggingface.co/models/WhereIsAI/UAE-Large-V1
Retrying in 20 seconds...
Request error: 503 Server Error: Service Unavailable for url: https://api-inference.huggingface.co/models/WhereIsAI/UAE-Large-V1
Retrying in 20 seconds...
Request error: 503 Server Error: Service Unavailable for url: https://api-inference.huggingface.co/models/WhereIsAI/UAE-Large-V1
Retrying in 20 seconds...
Request error: 503 Server Error: Service Unavailable for url: https://api-inference.huggingface.co/models/WhereIsAI/UAE-Large-V1
Retrying in 20 seconds...
Request error: 503 Server Error: Service Unavailable for url: https://api-inference.huggingface.co/models/WhereIsAI/UAE-Large-V1
Retrying in 20 seconds...
Request error: 503 Server Error: Service Unavailable for url: https://api-inference.huggingface.co/models/WhereIsAI/UAE-Large-V1
Retrying in 20 seconds...
Request error: 503 Server Err

# Read the already created dataset
If you have already computed the embeddings, you only need to load them from disk:

In [5]:
# Papers outside the sample: the CSV stores each embedding as text, so parse it back
# into a Python object; missing entries are kept as they are.
df_nosample = pd.read_csv('../1_data/all_papers_2024-10-02_embeddings.csv')
df_nosample['embeddings'] = df_nosample['embeddings'].apply(
    lambda raw: raw if pd.isna(raw) else ast.literal_eval(raw)
)

# Sample papers: read them, then attach their parsed embeddings by paper ID
df_sample = pd.read_csv('../1_data/sample1000_papers.csv')
df_sample2 = pd.read_csv('../1_data/sample1000_papers_embeddings.csv', usecols=['ID', 'embeddings'])
df_sample2['embeddings'] = df_sample2['embeddings'].apply(ast.literal_eval)
df_sample = df_sample.merge(df_sample2, on='ID')
df_sample2 = None  # the helper frame is no longer needed

# Print how many rows have an embedding that is not NaN
print(sum(not np.isnan(emb).all() for emb in df_nosample['embeddings']))

90047


In [25]:
# RIDGE CLASSIFIER
# Nested cross-validation: the inner loop tunes alpha, the outer loop scores the tuned model.
X = np.array(df_sample['embeddings'].tolist())
# Encode the `selected_llm` labels as integers (presumably 'no' -> 0, 'yes' -> 1 — confirm)
y = LabelEncoder().fit_transform(df_sample['selected_llm'])

# Define the cross-validation strategy (fixed seeds make the splits repeatable)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Set up the Ridge Classifier and hyperparameter grid
ridge_clf = RidgeClassifier()
param_grid = {'alpha': np.logspace(-3, 3, 20)}

# Run the nested cross-validation once. return_estimator=True keeps the GridSearchCV
# fitted on each outer training fold, so the fold scores and the selected alphas come
# from the same pass instead of repeating the whole search in a second loop.
grid_search = GridSearchCV(estimator=ridge_clf, param_grid=param_grid, cv=inner_cv, scoring='accuracy')
nested_cv = cross_validate(grid_search, X, y, cv=outer_cv, scoring='accuracy', return_estimator=True)
nested_scores = nested_cv['test_score']

print("Nested CV Mean Accuracy:", nested_scores.mean())
print("Nested CV Accuracy Std Dev:", nested_scores.std())

# Best alpha selected by the inner search on each outer fold
best_alphas = [fold_search.best_params_['alpha'] for fold_search in nested_cv['estimator']]

# Use the median of the per-fold best alphas as the final value
final_alpha = np.median(best_alphas)  # or use any preferred selection method

print("Best alpha found from nested CV:", final_alpha)


Nested CV Mean Accuracy: 0.913
Nested CV Accuracy Std Dev: 0.02315167380558044
Best alpha found from nested CV: 233.57214690901213


In [35]:
# Ridge classifier fixed at the alpha chosen from the nested CV
final_model = RidgeClassifier(alpha=final_alpha)

# Result name -> scikit-learn scorer string
scoring = dict(
    accuracy='accuracy',
    precision='precision_weighted',
    recall='recall_weighted',
    f1='f1_weighted',
)

# Score the fixed model on the outer folds
cv_results = cross_validate(final_model, X, y, cv=outer_cv, scoring=scoring)

# Report the mean of each metric across folds
print("\nCross-Validation Results:")
for label, metric in (
    ("Precision: ", "precision"),
    ("Recall: ", "recall"),
    ("F1 Score: ", "f1"),
    ("Accuracy: ", "accuracy"),
):
    print(label, cv_results[f"test_{metric}"].mean())



Cross-Validation Results:
Precision:  0.9079968082055544
Recall:  0.9110000000000001
F1 Score:  0.8962915805234374
Accuracy:  0.9110000000000001


In [37]:
# Fit the final model on the full sample
final_model.fit(X, y)

# Rows of the remaining papers that have an embedding
has_embedding = df_nosample['embeddings'].notna()

# Predict for those rows only
X_all = np.array(df_nosample.loc[has_embedding, 'embeddings'].tolist())
y_all = final_model.predict(X_all)

# Store the predictions next to the rows they belong to
df_nosample.loc[has_embedding, 'prediction'] = y_all

In [38]:
# How many papers fall into each predicted class
df_nosample.loc[:, 'prediction'].value_counts()

prediction
0.0    84354
1.0     5693
Name: count, dtype: int64

In [39]:
# Check a random sample of Abstracts from selected papers.
# Missing abstracts are dropped because textwrap.fill() only accepts strings.
selected_abstracts = df_nosample.loc[df_nosample['prediction'] == 1, "Abstract"].dropna()
# Seeded so the same abstracts are shown on every run; capped so that fewer than
# five selected papers does not raise.
sample_abstracts = selected_abstracts.sample(min(5, len(selected_abstracts)), random_state=42)

for i, abstract in enumerate(sample_abstracts):
    print(f"Abstract {i+1}:\n")
    print(textwrap.fill(abstract, width=150))
    print("\n\n")

Abstract 1:

Claims that a choice cannot be repeated or that its consequences may involve an irreparable loss draw upon powerful topical and ontological
assumptions. This essay identifies this commonplace with certain loci of "Quality" and with a view of time similar to Heidegger’s general
interpretation of human existence as Ek-sistenz- "the standing beyond oneself." Such an understanding reveals strategic and ethical implications of
the irreparable for human decision-making. © 1982 Taylor & Francis Group, LLC.



Abstract 2:

One construct validation study and four experiments showed that the relationship between hope and financial risk seeking depended on whether or not
the possibility of a hoped-for outcome was threatened. Whereas high (vs. low) hope decreased financial risk seeking when the possibility of a hoped-
for outcome was not threatened, high (vs. low) hope increased financial risk seeking when the outcome's possibility was threatened. These effects were
observed in differ

In [40]:
# Stack the sample and the remaining papers into one frame
all_df = pd.concat([df_sample, df_nosample], ignore_index=True)

# A paper is kept when any of the three indicators marks it as selected
predicted_yes = all_df["prediction"] == 1
selected_yes = all_df["selected"] == 'yes'
llm_yes = all_df["selected_llm"] == 'yes'
df_selected = all_df[predicted_yes | selected_yes | llm_yes]

# Write the selection to disk
df_selected.to_csv('../1_data/selected_papers.csv', index=False)