# Rule prediction

In [19]:
import pandas as pd
import numpy as np
import os
from data_processing import merge_title_abstract, generate_label_and_id_mappings, apply_new_id_mapping
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, hamming_loss, accuracy_score,precision_score

### Load the dataset

In [20]:
base_dir = os.path.dirname('train.parquet')
file_path = os.path.join(base_dir, 'data', 'train.parquet')
df = pd.read_parquet(file_path)


In [21]:
df.head(2)

Unnamed: 0,bibcode,title,abstract,verified_uat_ids,verified_uat_labels
0,2020ApJ...891..100S,Dynamic Potential Sputtering of Lunar Analog M...,"Pyroxenes ((Ca, Mg, Fe, Mn)<SUB>2</SUB>Si<SUB>...","[1534, 499, 1692, 948, 1024, 2004]","[solar wind, exosphere, the moon, lunar compos..."
1,2024ApJ...966L...8B,"Generation of Low-inclination, Neptune-crossin...",The solar system's distant reaches exhibit a w...,"[1705, 1184, 2293]","[trans-neptunian objects, orbits, solar system..."


Merge title and abstract into one single column called 'text'

In [22]:
df = merge_title_abstract(df)

Generate a set of label-ID pairs and the old-new IDs pairs for the modified IDs

In [23]:
label_new_id, old_new_ids = generate_label_and_id_mappings(df)

Applies ID mappings to the DataFrame to create a new column with remapped IDs

In [24]:
df = apply_new_id_mapping(df,old_new_ids)

Result :

In [25]:
df.head(2)

Unnamed: 0,bibcode,title,abstract,verified_uat_ids,verified_uat_labels,text,new_ids
0,2020ApJ...891..100S,Dynamic Potential Sputtering of Lunar Analog M...,"Pyroxenes ((Ca, Mg, Fe, Mn)<SUB>2</SUB>Si<SUB>...","[1534, 499, 1692, 948, 1024, 2004]","[solar wind, exosphere, the moon, lunar compos...",Dynamic Potential Sputtering of Lunar Analog M...,"[1189, 371, 1316, 714, 772, 1575]"
1,2024ApJ...966L...8B,"Generation of Low-inclination, Neptune-crossin...",The solar system's distant reaches exhibit a w...,"[1705, 1184, 2293]","[trans-neptunian objects, orbits, solar system...","Generation of Low-inclination, Neptune-crossin...","[1329, 909, 1827]"


### Define the rule :

In [26]:
def rule_prediction1(df,label_new_id):
    df_copy = df.copy()
    nb_sample, _ = df_copy.shape
    nb_id = len(label_new_id)
    predictions = np.zeros((nb_sample,nb_id),dtype=int)
    for k in range(nb_sample):
        text = df_copy['text'].iloc[k]
        text_lowercase = text.lower()
        labels = list(label_new_id.keys())
        for label in labels:
            if label in text_lowercase:
                id = label_new_id.get(label)
                predictions[k][id] = 1
    return predictions

### Predict

In [27]:
predictions = rule_prediction1(df,label_new_id)

In [28]:
mlb = MultiLabelBinarizer()
y_test_rule = df['verified_uat_labels']
y_test_rule_matrix = mlb.fit_transform(y_test_rule)

## Results :

F1 score

In [29]:
f1 = f1_score(y_test_rule_matrix, predictions,average='samples')
print("Exact Match f1_score :", f1)

Exact Match f1_score : 0.012079949689174066


Hamming loss

In [30]:
loss = hamming_loss(y_test_rule_matrix, predictions)
print("Exact Match hamming_loss :", loss)

Exact Match hamming_loss : 0.005164829432633973


Accuracy_score

In [31]:
accuracy = accuracy_score(y_test_rule_matrix, predictions)
print("Exact Match accuracy :", accuracy)

Exact Match accuracy : 0.0


Precision_score

In [32]:
precision = precision_score(y_test_rule_matrix, predictions,average='samples')
print("Exact Match precision_score :", precision)

Exact Match precision_score : 0.011787910088296022


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
