In [3]:
import pandas as pd
import numpy as np

# Load metadata
file_path = "./goldstandardPapers/reclassified_papers_checkpoint.csv"
df = pd.read_csv(file_path)

# Load embeddings
embeddings_path = "specter2_reclassified.npy"
embeddings = np.load(embeddings_path)

# Check label categories
columns_to_check = ["new_catalysis_type", "new_application_theme", "standard_class", "cited_by_patent"]

for col in columns_to_check:
    print(f"\nUnique values in {col}:")
    print(df[col].unique())


✅ Unique values in new_catalysis_type:
['bio' 'not_catalysis' 'electro' 'organo' 'hetero' 'photo' 'homo'
 'unknown']

✅ Unique values in new_application_theme:
['co2 utilisation' 'biomass' 'unknown' 'water' 'ammonium' 'enzyme'
 'methane']

✅ Unique values in standard_class:
['hard_negative' 'hard_positive' 'soft_positive' 'soft_negative'
 'background']

✅ Unique values in cited_by_patent:
['unknown' 'TRUE' 'FALSE']


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode categorical labels
label_encoders = {}
for col in columns_to_check:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # Convert to numeric
    label_encoders[col] = le  # Store encoder for later

# Define input (X) and output (y)
X = embeddings
y = df[columns_to_check].values  # Multi-label target

# Split into train/test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nData split: {X_train.shape[0]} train samples, {X_test.shape[0]} test samples")


Data split: 3083 train samples, 771 test samples


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Initialize base model
base_model = RandomForestClassifier(n_estimators=200, random_state=42)

# Multi-output classifier
multi_clf = MultiOutputClassifier(base_model)
multi_clf.fit(X_train, y_train)

print("\n Model training complete!")


✅ Model training complete!


In [6]:
from sklearn.metrics import classification_report

# Predict on test data
y_pred = multi_clf.predict(X_test)

# Evaluate each output separately
for i, col in enumerate(columns_to_check):
    print(f"\n Classification report for {col}:")
    print(classification_report(y_test[:, i], y_pred[:, i]))


 Classification report for new_catalysis_type:
              precision    recall  f1-score   support

           0       0.83      0.94      0.88       107
           1       0.87      0.75      0.80       110
           2       0.69      0.84      0.75       209
           3       0.63      0.68      0.65       106
           4       0.64      0.53      0.58        78
           5       0.58      0.38      0.46        47
           6       0.90      0.74      0.81       111
           7       0.00      0.00      0.00         3

    accuracy                           0.74       771
   macro avg       0.64      0.61      0.62       771
weighted avg       0.74      0.74      0.74       771


 Classification report for new_application_theme:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        23
           1       0.66      0.52      0.58        93
           2       0.67      0.40      0.50        87
           3       0.83      0.86

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
def classify_new_papers(new_embeddings):
    """
    Takes new embeddings as input and predicts the 4 classification labels.
    """
    predictions = multi_clf.predict(new_embeddings)

    # Convert numerical labels back to categorical values
    decoded_predictions = {}
    for i, col in enumerate(columns_to_check):
        decoded_predictions[col] = label_encoders[col].inverse_transform(predictions[:, i])

    return decoded_predictions

In [8]:
# Example: New abstract embeddings (assuming they are already computed)
new_paper_embeddings = np.random.rand(3, X.shape[1])  # Dummy data, replace with real embeddings

# Predict classification labels
results = classify_new_papers(new_paper_embeddings)

# Display predictions
for col, preds in results.items():
    print(f"\n📌 Predicted {col}: {preds}")


📌 Predicted new_catalysis_type: ['not_catalysis' 'not_catalysis' 'not_catalysis']

📌 Predicted new_application_theme: ['unknown' 'unknown' 'unknown']

📌 Predicted standard_class: ['soft_positive' 'soft_positive' 'soft_positive']

📌 Predicted cited_by_patent: ['FALSE' 'FALSE' 'FALSE']
