# Hybrid model : rule and prediction with logistic regression

In [1]:
import pandas as pd
import numpy as np
import os
import joblib
from data_processing import merge_title_abstract,preprocess_text_column, generate_label_and_id_mappings, apply_new_id_mapping
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import hamming_loss,f1_score, precision_score

In [2]:
train_base_dir = os.path.dirname('train.parquet')
val_base_dir = os.path.dirname('val.parquet')
train_file_path = os.path.join(train_base_dir, 'data', 'train.parquet')
val_file_path = os.path.join(train_base_dir, 'data', 'val.parquet')
df_train = pd.read_parquet(train_file_path)
df_val = pd.read_parquet(val_file_path)

### Set up for the rule

Define the rule

In [3]:
def rule_prediction1(df,label_new_id):
    df_copy = df.copy()
    nb_sample, _ = df_copy.shape
    nb_id = len(label_new_id)
    predictions = np.zeros((nb_sample,nb_id),dtype=int)
    for k in range(nb_sample):
        text = df_copy['text'].iloc[k]
        text_lowercase = text.lower()
        labels = list(label_new_id.keys())
        for label in labels:
            if label in text_lowercase:
                id = label_new_id.get(label)
                predictions[k][id] = 1
    return predictions

Merge title and abstract into one single column called 'text'

In [4]:
df_val = merge_title_abstract(df_val)


Generate a set of label-ID pairs and the old-new IDs pairs for the modified IDs

In [5]:
label_new_id, old_new_ids = generate_label_and_id_mappings(df_train)

Applies ID mappings to the DataFrame to create a new column with remapped IDs

In [6]:
df_val = apply_new_id_mapping(df_val,old_new_ids)

Result :

In [7]:
df_val.head(2)

Unnamed: 0,bibcode,title,abstract,verified_uat_ids,verified_uat_labels,text,new_ids
0,2020RNAAS...4..137D,Recommendations for Teaching Introductory Astr...,Colleges and universities around the world wer...,"[1529, 1583, 563, 486, 1145, 74]","[solar system astronomy, stellar astronomy, ga...",Recommendations for Teaching Introductory Astr...,"[1184, 1231, 416, 358, 874, 53]"
1,2023ApJ...949..109L,The ALMA Survey of 70 μm Dark High-mass Clumps...,We present dynamical properties of 294 cores e...,"[787, 1565, 1569, 732, 1302, 844, 847, 1297]","[infrared dark clouds, star forming regions, s...",The ALMA Survey of 70 μm Dark High-mass Clumps...,"[587, 1215, 1219, 545, 1002, 640, 643, 997]"


Predict

In [8]:
predictions_rule = rule_prediction1(df_val,label_new_id)

## Set up for the model

Merge title and abstract into one single column called 'text'

In [9]:
df_train = merge_title_abstract(df_train)

Apply preprocess on text such as :  
- Converts to lowercase
- Removes special characters
- Removes stop words
- Lemmatizes the text

In [10]:
df_train = preprocess_text_column(df_train)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\reali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\reali\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\reali\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Applies ID mappings to the DataFrame to create a new column with remapped IDs

In [11]:
df_train = apply_new_id_mapping(df_train,old_new_ids)

Result :

In [12]:
df_train.head(2)

Unnamed: 0,bibcode,title,abstract,verified_uat_ids,verified_uat_labels,text,new_ids
0,2020ApJ...891..100S,Dynamic Potential Sputtering of Lunar Analog M...,"Pyroxenes ((Ca, Mg, Fe, Mn)<SUB>2</SUB>Si<SUB>...","[1534, 499, 1692, 948, 1024, 2004]","[solar wind, exosphere, the moon, lunar compos...",ynamic otential puttering unar nalog aterial o...,"[1189, 371, 1316, 714, 772, 1575]"
1,2024ApJ...966L...8B,"Generation of Low-inclination, Neptune-crossin...",The solar system's distant reaches exhibit a w...,"[1705, 1184, 2293]","[trans-neptunian objects, orbits, solar system...",eneration ow-inclination eptune-crossing rans-...,"[1329, 909, 1827]"


Vectorize with the best model obtained

In [13]:
# Case 3 : TFIDF with only 10 000 max feature and excludes terms present in more than 80% / terms present in less than 1% of samples
vectorizer3 = TfidfVectorizer(
    max_features=10000,
    max_df=0.8,
    min_df=0.01
)
TFIDF3 = vectorizer3.fit_transform(df_train['text'])

Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(TFIDF3, df_train['new_ids'], test_size=0.2, random_state=42)


Convert y_train and y_test into binary matrix of size n x m such as :  
- n : is the number of sample
- m : is the number of label

In [15]:
all_classes = sorted({id_ for ids in df_train['new_ids'] for id_ in ids})
mlb = MultiLabelBinarizer(classes=all_classes)
y_train_matrix = mlb.fit_transform(y_train)
y_test_matrix = mlb.transform(y_test)

In [16]:
model = OneVsRestClassifier(LogisticRegression())
model.fit(X_train, y_train_matrix)



In [None]:
# joblib.dump(model, 'models\\model3_tfidf_for_hybrid.pkl')

['models\\model3_tfidf_for_hybrid.pkl']

#### Adapt the val dataset to make the prediction for the model

In [17]:
df_val = merge_title_abstract(df_val)

Merge title and abstract into one single column called 'text'

In [18]:
df_val = preprocess_text_column(df_val)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\reali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\reali\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\reali\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Applies ID mappings to the DataFrame to create a new column with remapped IDs

In [19]:
df_val = apply_new_id_mapping(df_val,old_new_ids)

Vectorize

In [20]:
TFIDF_val = vectorizer3.transform(df_val['text'])

Predict

In [21]:
predictions_model = model.predict(TFIDF_val)

### HYBRID

In [22]:
predictions_union = np.logical_or(predictions_model, predictions_rule).astype(int)

# RESULTS

### Independant rule result

In [26]:
y_test_rule = df_val['verified_uat_labels']
all_classes = sorted({id_ for ids in df_train['new_ids'] for id_ in ids})
mlb = MultiLabelBinarizer(classes=all_classes)
y_test_rule_matrix = mlb.fit_transform(y_test_rule)



f1 score

In [27]:
f1 = f1_score(y_test_rule_matrix, predictions_rule,average='samples')
print("Exact Match f1_score for rule :", f1)

Exact Match f1_score for rule : 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


hamming loss

In [None]:
loss = hamming_loss(y_test_rule_matrix, predictions_rule)
print("Exact Match hamming_loss for rule:", loss)

Exact Match hamming_loss for rule: 0.0029766254034689463


precision

In [None]:
precision = precision_score(y_test_rule_matrix, predictions_rule,average='samples')
print("Exact Match precision_score for rule :", precision)

Exact Match precision_score for rule : 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Independant model result

In [33]:
y_test_model = df_val['new_ids']
all_classes = sorted({id_ for ids in df_train['new_ids'] for id_ in ids})
mlb = MultiLabelBinarizer(classes=all_classes)
y_test_model_matrix = mlb.fit_transform(y_test_model)

f1 score

In [34]:
f1 = f1_score(y_test_model_matrix, predictions_model,average='samples')
print("Exact Match f1_score for model :", f1)

Exact Match f1_score for model : 0.12873463359413773


hamming loss

In [35]:
loss = hamming_loss(y_test_model_matrix, predictions_model)
print("Exact Match hamming_loss for model :", loss)

Exact Match hamming_loss for model : 0.0022425779448799346


precision

In [36]:
precision = precision_score(y_test_model_matrix, predictions_model,average='samples')
print("Exact Match precision_score for model :", precision)

Exact Match precision_score for model : 0.26910192837465563


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Hybrid (rule + model) result

In [37]:
y_test_val = df_val['new_ids']
all_classes = sorted({id_ for ids in df_train['new_ids'] for id_ in ids})
mlb = MultiLabelBinarizer(classes=all_classes)
y_test_val_matrix = mlb.fit_transform(y_test_val)

f1 score

In [38]:
f1 = f1_score(y_test_val_matrix, predictions_union,average='samples')
print("Exact Match f1_score for hybrid:", f1)

Exact Match f1_score for hybrid: 0.1977049205307416


hamming loss

In [39]:
loss = hamming_loss(y_test_val_matrix, predictions_union)
print("Exact Match hamming_loss for hybrid :", loss)

Exact Match hamming_loss for hybrid : 0.0043478877735608126


precision

In [40]:
precision = precision_score(y_test_val_matrix, predictions_union,average='samples')
print("Exact Match precision_score for ybrid :", precision)

Exact Match precision_score for ybrid : 0.18360071653137694


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
