In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch

In [23]:
# this dataset is downloaded from kaggle
# https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset
df = pd.read_csv(r'../../data/Resume.csv')
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [24]:
category_list = list(dict.fromkeys(df['Category'].tolist()))
print(category_list)
print(len(category_list))

['HR', 'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE', 'BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE', 'BPO', 'SALES', 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE', 'CHEF', 'FINANCE', 'APPAREL', 'ENGINEERING', 'ACCOUNTANT', 'CONSTRUCTION', 'PUBLIC-RELATIONS', 'BANKING', 'ARTS', 'AVIATION']
24


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           2484 non-null   int64 
 1   Resume_str   2484 non-null   object
 2   Resume_html  2484 non-null   object
 3   Category     2484 non-null   object
dtypes: int64(1), object(3)
memory usage: 77.8+ KB


In [26]:
df['Category'].head()

0    HR
1    HR
2    HR
3    HR
4    HR
Name: Category, dtype: object

In [27]:
X = df['Resume_str'].tolist()
y = df["Category"].astype("category").cat.codes
label_map = dict(enumerate(df["Category"].astype("category").cat.categories))
label_map

{0: 'ACCOUNTANT',
 1: 'ADVOCATE',
 2: 'AGRICULTURE',
 3: 'APPAREL',
 4: 'ARTS',
 5: 'AUTOMOBILE',
 6: 'AVIATION',
 7: 'BANKING',
 8: 'BPO',
 9: 'BUSINESS-DEVELOPMENT',
 10: 'CHEF',
 11: 'CONSTRUCTION',
 12: 'CONSULTANT',
 13: 'DESIGNER',
 14: 'DIGITAL-MEDIA',
 15: 'ENGINEERING',
 16: 'FINANCE',
 17: 'FITNESS',
 18: 'HEALTHCARE',
 19: 'HR',
 20: 'INFORMATION-TECHNOLOGY',
 21: 'PUBLIC-RELATIONS',
 22: 'SALES',
 23: 'TEACHER'}

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


In [29]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def embed_with_e5(texts, model, tokenizer):
    texts = [f"passage: {t}" for t in texts]
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        model_output = model(**inputs)
    return mean_pooling(model_output, inputs["attention_mask"]).numpy()

def embed_with_sentence_transformer(texts, model):
    return model.encode(texts, batch_size=32, show_progress_bar=True)


In [31]:
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

# E5 Model
e5_tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-base-v2")
e5_model = AutoModel.from_pretrained("intfloat/e5-base-v2")

# MPNet Model
mpnet_model = SentenceTransformer("all-mpnet-base-v2")

# MiniLM Model
minilm_model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed for E5 (Unified)
# X_train_e5 = embed_with_e5(X_train, e5_model, e5_tokenizer)
# X_test_e5 = embed_with_e5(X_test, e5_model, e5_tokenizer)

# Embed for MPNet (Split)
X_train_mpnet = embed_with_sentence_transformer(X_train, mpnet_model)
X_test_mpnet = embed_with_sentence_transformer(X_test, mpnet_model)

# Embed for MiniLM (Split)
X_train_minilm = embed_with_sentence_transformer(X_train, minilm_model)
X_test_minilm = embed_with_sentence_transformer(X_test, minilm_model)


Batches: 100%|██████████| 63/63 [12:36<00:00, 12.01s/it]
Batches: 100%|██████████| 16/16 [03:10<00:00, 11.92s/it]
Batches: 100%|██████████| 63/63 [01:06<00:00,  1.06s/it]
Batches: 100%|██████████| 16/16 [00:16<00:00,  1.04s/it]


In [32]:
def evaluate_embeddings(X_train, y_train, X_test, y_test, label_map, name="Model"):
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print(f"\n--- {name} ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1 (macro):", f1_score(y_test, y_pred, average="macro"))
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=label_map.values()))

# Evaluate both
#evaluate_embeddings(X_train_e5, y_train, X_test_e5, y_test, label_map, name="Unified (E5)")
evaluate_embeddings(X_train_mpnet, y_train, X_test_mpnet, y_test, label_map, name="Split (MPNet)")
evaluate_embeddings(X_train_minilm, y_train, X_test_minilm, y_test, label_map, name="Split (MiniLM)")



--- Split (MPNet) ---
Accuracy: 0.7424547283702213
F1 (macro): 0.6748533940736242
Classification Report:
                        precision    recall  f1-score   support

            ACCOUNTANT       0.82      0.96      0.88        24
              ADVOCATE       0.64      0.67      0.65        24
           AGRICULTURE       1.00      0.62      0.76        13
               APPAREL       0.57      0.21      0.31        19
                  ARTS       0.60      0.29      0.39        21
            AUTOMOBILE       0.00      0.00      0.00         7
              AVIATION       0.79      0.79      0.79        24
               BANKING       0.63      0.74      0.68        23
                   BPO       0.00      0.00      0.00         4
  BUSINESS-DEVELOPMENT       0.92      0.92      0.92        24
                  CHEF       0.86      0.79      0.83        24
          CONSTRUCTION       0.84      0.73      0.78        22
            CONSULTANT       0.71      0.65      0.68        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Conclusion
First, e5 embedding model needs a lot of memory and we can not afford it to run. Second, there is no need for hybrid embedding models since the f1 scores are relatively close though the mpnet model is better.