In [None]:
import ast
import os
import textwrap
import time

import numpy as np
import pandas as pd
import requests
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the 1000-paper sample file and preview its first rows.
# Note: a later cell writes the computed embeddings back to this same path.
df_sample = pd.read_csv(filepath_or_buffer='../1_data/sample1000_papers_embeddings.csv')
df_sample.head(n=5)

In [None]:
# Build the text that gets embedded: the title and the abstract of each paper,
# each introduced by its own label, joined into a single string per row.
df_sample['text'] = (
    ("Title: " + df_sample['Title'])
    + ('\nAbstract: ' + df_sample['Abstract'])
)

In [None]:
# Hugging Face Inference API endpoint of the model used to embed the papers.
API_URL = "https://api-inference.huggingface.co/models/WhereIsAI/UAE-Large-V1"
# Take the access token from the HF_TOKEN environment variable so that it does
# not have to be stored in the notebook; when the variable is not set, the
# original "hf_" placeholder is used.
headers = {"Authorization": "Bearer " + os.environ.get("HF_TOKEN", "hf_")}


def get_embedding(text):
    """Request the embedding of ``text`` from the inference API, retrying on transient failures.

    Retry policy:
      * HTTP 503 -> wait 20 seconds and retry (presumably the model is still
        being loaded on the server side - TODO confirm).
      * HTTP 429 (too many requests) -> wait one hour and retry.
      * Connection errors and timeouts -> wait 5 seconds and retry.
      * A response body that is not valid JSON -> wait 5 seconds and retry.

    Args:
        text: Value sent as the ``"inputs"`` field of the JSON request body.

    Returns:
        The decoded JSON body of the first successful response.

    Raises:
        requests.exceptions.HTTPError: For any HTTP error status other than
            503 and 429. Such errors (bad token, bad request, ...) are not
            expected to disappear by retrying, so they are surfaced instead of
            being retried forever.
    """
    while True:
        try:
            # The timeout prevents a stalled connection from blocking a long run forever.
            response = requests.post(API_URL, headers=headers, json={"inputs": text}, timeout=120)
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            # raise_for_status() is the only source of HTTPError here, so
            # `response` is guaranteed to be bound in this handler.
            if response.status_code == 503:
                print(f"Request error: {e}\nRetrying in 20 seconds...")
                time.sleep(20)
            elif response.status_code == 429:
                print(f"Request error: {e}\nSleeping 1hour...")
                time.sleep(60*60)
            else:
                raise
            continue
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            # No response object exists in this case, so only the exception is reported.
            print(f"Connection error: {e}\nRetrying in 5 seconds...")
            time.sleep(5)
            continue

        try:
            return response.json()
        except ValueError as e:
            print(f"JSON decode error: {e}. Retrying in 5 seconds...")
            time.sleep(5)

In [None]:
# Embed the text of every sample paper (one get_embedding call per row),
# then persist the sample together with its embeddings.
df_sample['embeddings'] = df_sample["text"].apply(get_embedding)
df_sample.to_csv(path_or_buf='../1_data/sample1000_papers_embeddings.csv', index=False)

In [None]:
# Read the whole dataset
df = pd.read_csv('../1_data/all_papers_2024-10-02.csv')

# Drop the columns that are not needed
df = df.drop(columns=['Source title', 'Author Keywords'])
# Create the text that is sent to the embedding model (title followed by abstract)
df['text'] = "Title: " + df['Title'] + '\nAbstract: ' + df['Abstract']
print(df.shape)

# Keep only the papers that are not in the sample.
# The explicit copy() makes df_nosample an independent frame: it is written to
# later on, and writing into a boolean-mask slice of df triggers pandas'
# SettingWithCopyWarning.
df_nosample = df[~df['ID'].isin(df_sample['ID'])].copy()

In [None]:
# Get embeddings for the whole dataset in batches and save them
batch_size = 300
# Index of the first batch to process; earlier batches are skipped, which lets
# an interrupted run be resumed. Set to 0 to process everything.
starting_batch = 118

# The loop writes into the 'embeddings' column by position, so the column has
# to exist beforehand: columns.get_loc raises KeyError otherwise. Create it
# empty (object dtype) when df_nosample does not have it yet.
if 'embeddings' not in df_nosample.columns:
    df_nosample['embeddings'] = None
embeddings_col = df_nosample.columns.get_loc('embeddings')

for i in range(starting_batch*batch_size, len(df_nosample), batch_size):
    print(f"Processing batch {i//batch_size} from {i} to {i+batch_size}")
    batch_index = df_nosample.index[i:i+batch_size]
    batch_embeddings = df_nosample["text"].iloc[i:i+batch_size].apply(get_embedding).tolist()
    # Wrap the results in a Series carrying the batch's own index labels so the
    # assignment lines up row by row.
    df_nosample.iloc[i:i+batch_size, embeddings_col] = pd.Series(batch_embeddings, index=batch_index)
    # Write the whole frame after every batch so finished work survives an interruption.
    df_nosample.to_csv('../1_data/all_papers_2024-10-02_embeddings.csv', index=False)


# Read the already created dataset
If you have already computed the embeddings, you only need to load them from disk:

In [None]:
# Load the saved embeddings. In the CSV each embedding is stored as text, so
# every non-missing cell is parsed back into a Python object; missing cells
# are left as they are.
df_nosample = pd.read_csv(filepath_or_buffer='../1_data/all_papers_2024-10-02_embeddings.csv')
df_nosample['embeddings'] = df_nosample['embeddings'].apply(
    lambda cell: cell if pd.isna(cell) else ast.literal_eval(cell)
)

# Print how many embeddings are not nan
embedding_present = df_nosample['embeddings'].apply(lambda emb: not np.isnan(emb).all())
print(embedding_present.sum())
df_nosample.head(n=2)

In [None]:
# Load the 1000-paper sample file, report its dimensions and preview two rows.
df_sample = pd.read_csv(filepath_or_buffer='../1_data/sample1000_papers.csv')
print(df_sample.shape)
df_sample.head(n=2)

In [None]:
# Reload the sample and attach one embedding per paper.
df_sample = pd.read_csv('../1_data/sample1000_papers.csv')
print(df_sample.shape)

# Embeddings saved for the sample, parsed from their text representation.
df_sample2 = pd.read_csv('../1_data/sample1000_papers_embeddings.csv', usecols=['ID', 'embeddings'])
df_sample2['embeddings'] = df_sample2['embeddings'].apply(ast.literal_eval)

# Join both embedding sources on the paper ID. After the second merge the two
# embedding columns carry the _x (sample file) and _y (full dataset) suffixes;
# the sample-file value is used and the full-dataset value fills its gaps.
df_sample = (
    df_sample
    .merge(df_sample2, on='ID', how='left')
    .merge(df_nosample[['ID', 'embeddings']], on='ID', how='left')
    .assign(embeddings=lambda merged: merged['embeddings_x'].fillna(merged['embeddings_y']))
    .drop(columns=['embeddings_x', 'embeddings_y'])
)
df_sample

In [None]:
# Remove the rows that have same ID as the sample.
# copy() turns the filtered result into an independent frame, so adding the
# 'prediction' column to it later does not trigger SettingWithCopyWarning.
df_nosample = df_nosample[~df_nosample['ID'].isin(df_sample['ID'])].copy()
# Drop the reference to the helper frame; it is not needed any more.
df_sample2 = None

In [None]:
# RIDGE REGRESSION
# Nested cross-validation training
# Feature matrix: one row per sample paper, built from its embedding.
X = np.array(df_sample['embeddings'].tolist())
# LabelEncoder assigns integer codes following the sorted order of the label
# values (presumably 'no' -> 0 and 'yes' -> 1 - verify, since later cells
# treat a prediction of 1 as "selected").
y = LabelEncoder().fit_transform(df_sample['selected_llm'])

# Define the cross-validation strategy: the outer folds estimate performance,
# the inner folds choose alpha.
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Set up the Ridge Classifier and hyperparameter grid
ridge_clf = RidgeClassifier()
param_grid = {'alpha': np.logspace(-2, 2, 50)}

# Nested cross-validation for hyperparameter tuning and model evaluation.
# return_estimator=True keeps the grid search fitted on each outer fold, so the
# scores and the selected alphas both come from a single pass instead of
# running the whole nested search a second time.
grid_search = GridSearchCV(estimator=ridge_clf, param_grid=param_grid, cv=inner_cv, scoring='accuracy')
nested_results = cross_validate(grid_search, X, y, cv=outer_cv, scoring='accuracy', return_estimator=True)
nested_scores = nested_results['test_score']

print("Nested CV Mean Accuracy:", nested_scores.mean())
print("Nested CV Accuracy Std Dev:", nested_scores.std())

# Best alpha selected by the inner search on each outer training fold
best_alphas = [fold_search.best_params_['alpha'] for fold_search in nested_results['estimator']]

# Use the median of the per-fold best alphas as the final value
final_alpha = np.median(best_alphas)  # or use any preferred selection method

print("Best alpha found from nested CV:", final_alpha)


In [None]:
# Ridge classifier configured with the alpha chosen by the nested search
final_model = RidgeClassifier(alpha=final_alpha)

# Metrics to report; the precision/recall/F1 variants are the class-weighted ones
scoring = dict(
    accuracy='accuracy',
    precision='precision_weighted',
    recall='recall_weighted',
    f1='f1_weighted',
)

# Cross-validate the final model on the same outer folds
cv_results = cross_validate(estimator=final_model, X=X, y=y, scoring=scoring, cv=outer_cv)

# Report the mean of every metric across the folds
print("\nCross-Validation Results:")
for metric_label, result_key in (
    ("Precision: ", 'test_precision'),
    ("Recall: ", 'test_recall'),
    ("F1 Score: ", 'test_f1'),
    ("Accuracy: ", 'test_accuracy'),
):
    print(metric_label, cv_results[result_key].mean())


In [None]:
# Fit the final model on the complete sample
final_model.fit(X, y)

# Only rows that actually have an embedding can be scored
with_embedding = df_nosample['embeddings'].notna()
X_all = np.array(df_nosample.loc[with_embedding, 'embeddings'].tolist())
y_all = final_model.predict(X_all)

# Store the predictions on those rows; rows without an embedding get no prediction
df_nosample.loc[with_embedding, 'prediction'] = y_all

In [None]:
# Number of papers per predicted class (rows without a prediction are not counted)
df_nosample['prediction'].value_counts(dropna=True)

In [None]:
# Check a random sample of Abstracts from selected papers
# Missing abstracts are dropped first because textwrap.fill only accepts strings.
selected_abstracts = df_nosample.loc[df_nosample['prediction'] == 1, "Abstract"].dropna()
# Series.sample raises ValueError when asked for more rows than are available,
# so cap the sample size at the number of selected abstracts.
sample_abstracts = selected_abstracts.sample(min(5, len(selected_abstracts)))

for i, abstract in enumerate(sample_abstracts):
    print(f"Abstract {i+1}:\n")
    print(textwrap.fill(abstract, width=150))
    print("\n\n")

In [None]:
# Stack the sample and the remaining papers into a single frame with a fresh index
all_df = pd.concat(objs=[df_sample, df_nosample], axis=0, ignore_index=True)
all_df

In [None]:
# A paper is kept when one of these holds:
#   - it was labelled by a human and marked as selected,
#   - it was not labelled by a human and the LLM label marks it as selected,
#   - the classifier predicted class 1 for it.
human_selected = (all_df['human_labeled'] == 1) & (all_df['selected'] == 'yes')
llm_selected = (all_df['human_labeled'] == 0) & (all_df['selected_llm'] == 'yes')
model_selected = all_df['prediction'] == 1

# Concatenate the three groups in that order, report the count and save them
total = pd.concat([all_df.loc[mask] for mask in (human_selected, llm_selected, model_selected)])
print(len(total))
total.to_csv('../1_data/selected_papers.csv', index=False)