In [4]:
import sys
# Add the parent directory to the import search path.
# NOTE(review): presumably this is what makes the local `utils` package
# (imported further down) resolvable from this notebook's folder — confirm.
sys.path.append('..')

In [5]:
import warnings
import multiprocessing
# Silence ResourceWarning inside this kernel process.
warnings.filterwarnings("ignore", category=ResourceWarning)

# Also export the same filter through the PYTHONWARNINGS environment variable.
# NOTE(review): presumably so that worker processes started later inherit the
# filter — confirm this is still needed.
import sys
import os
os.environ['PYTHONWARNINGS'] = 'ignore::ResourceWarning'

# Data Loading

In [6]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets

In [7]:
from typing import Tuple, Literal
import pandas as pd

def load_split(
    preprocessing_type: Literal["cleaned_only", "full_process"],
    sampling_method: Literal["undersampled", "oversampled"],
    classification_type: Literal["binary", "multiclass"]
) -> Tuple[
    Tuple[pd.DataFrame, pd.Series],  # train: (X_train, y_train)
    Tuple[pd.DataFrame, pd.Series],  # val: (X_val, y_val)
    Tuple[pd.DataFrame, pd.Series]   # test: (X_test, y_test)
]:
    """
    Fetch the train/val/test data for one preprocessing + sampling + task combination.

    The first two arguments are interpolated into the directory
    ``../data/<preprocessing_type>/<sampling_method>``, which is handed to
    ``load_datasets``; ``classification_type`` then selects the task from the
    result. The arguments are not validated here, so an unexpected value
    surfaces as an error from the file read or the dict lookup.

    Args:
        preprocessing_type: "cleaned_only" or "full_process"
        sampling_method: "undersampled" or "oversampled"
        classification_type: "binary" or "multiclass"

    Returns:
        A 3-tuple ordered (train, val, test); every element is an ``(X, y)`` pair.
    """
    data_dir = f"../data/{preprocessing_type}/{sampling_method}"
    by_task = load_datasets(data_dir)
    chosen = by_task[classification_type]

    return tuple(
        (chosen[split_name]["X"], chosen[split_name]["y"])
        for split_name in ("train", "val", "test")
    )

# Experiments

In [8]:
def combine_text(X):
    """Join each row's resume and job description into a single string.

    Args:
        X: Table with ``resume_text`` and ``job_description_text`` columns.
            Values are converted with ``astype(str)``, so non-string entries
            are stringified rather than rejected.

    Returns:
        Array with one ``"<resume> [SEP] <job description>"`` string per row.
        ``X`` itself is not modified.
    """
    # Nothing below mutates X, so no defensive copy of the frame is needed;
    # astype(str) already produces new Series.
    resumes = X["resume_text"].astype(str)
    job_descriptions = X["job_description_text"].astype(str)

    return (resumes + " [SEP] " + job_descriptions).values

In [9]:
# Data used by the experiments below: cleaned-only text, undersampled classes,
# binary task. Layout is ((X_train, y_train), (X_val, y_val), (X_test, y_test)).
splits = load_split(
    preprocessing_type="cleaned_only",
    sampling_method="undersampled",
    classification_type="binary",
)

In [10]:
# Fixed value passed as `random_state` to the estimators built in the pipeline
# factories below.
SEED = 42

## Experiment 4: Feature Engineering

In [11]:
from utils import ExperimentManager, Experiment

# NOTE(review): the two positional arguments look like the run output directory
# and the class label names — confirm against utils.ExperimentManager.
feature_engineering_manager = ExperimentManager(
    "../runs/ensemble/feature_engineering",
    ["Fit", "Not Fit"],
)

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import FeatureUnion, FunctionTransformer, Pipeline
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler
from interpret.glassbox import ExplainableBoostingClassifier

def compute_cosine_similarity(X):
    """
    Compute the TF-IDF cosine similarity between resume_text and
    job_description_text for each row.

    Args:
        X: DataFrame containing 'resume_text' and 'job_description_text'
            columns, or a 2D ndarray whose two columns are in that order.

    Returns:
        2D NumPy float array of shape (n_samples, 1). An empty input yields
        an array of shape (0, 1).

    Raises:
        ValueError: if X is neither a DataFrame nor an ndarray, or if the
            expected columns are missing.

    Note:
        The TF-IDF vocabulary and IDF weights are fitted on the rows passed in
        on *this call*. The function keeps no state, so the score of a given
        row depends on the other rows in the same batch.
    """

    # Defensive: ensure X is a DataFrame with expected columns
    if isinstance(X, np.ndarray):
        # A bare ndarray carries no column names, so the order is assumed.
        X = pd.DataFrame(X, columns=["resume_text", "job_description_text"])
    elif not isinstance(X, pd.DataFrame):
        raise ValueError("Input X must be a DataFrame or 2D ndarray.")

    if "resume_text" not in X.columns or "job_description_text" not in X.columns:
        raise ValueError("Expected columns 'resume_text' and 'job_description_text' not found.")

    # Fitting a vectorizer on zero documents raises; there is nothing to score.
    if len(X) == 0:
        return np.empty((0, 1), dtype=np.float64)

    resume_texts = X["resume_text"].astype(str).tolist()
    job_texts = X["job_description_text"].astype(str).tolist()

    # One shared vocabulary for both text fields. norm="l2" is the library
    # default; it is spelled out because the dot product below relies on it.
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', norm="l2")
    vectorizer.fit(resume_texts + job_texts)

    # Vectorise each side once for the whole batch instead of once per row.
    resume_vecs = vectorizer.transform(resume_texts)
    job_vecs = vectorizer.transform(job_texts)

    # Rows are unit length (or all-zero), so the row-wise dot product is the
    # cosine similarity; an all-zero row gives 0.
    row_dots = resume_vecs.multiply(job_vecs).sum(axis=1)

    return np.asarray(row_dots, dtype=np.float64).reshape(-1, 1)

def pipeline_factory(params):
    """Build the pipeline for the "FE 1" experiment.

    Features are the union of (a) the 100 chi2-selected TF-IDF terms of the
    joined resume/job text and (b) the single cosine-similarity column. They
    feed a stacking classifier whose final estimator is an
    ExplainableBoostingClassifier.

    Args:
        params: Accepted for the factory interface; not used by this factory.

    Returns:
        An unfitted sklearn ``Pipeline``.
    """
    # Base learners: library defaults, with the shared seed where one is accepted.
    base_estimators = [
        ('lr', LogisticRegression(random_state=SEED)),
        ('nb', BernoulliNB()),
        ('rf', RandomForestClassifier(random_state=SEED)),
    ]

    stack = StackingClassifier(
        estimators=base_estimators,
        final_estimator=ExplainableBoostingClassifier(random_state=SEED),
        cv=20,
        n_jobs=1,  # run in a single process
    )

    # Branch 1: joined text -> TF-IDF -> keep the 100 best terms by chi2.
    tfidf_branch = Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),
        ('selector', SelectKBest(chi2, k=100)),
    ])

    # Branch 2: the raw cosine-similarity column (no scaler is applied).
    cosine_branch = Pipeline([
        ('extract', FunctionTransformer(compute_cosine_similarity, validate=False)),
    ])

    feature_union = FeatureUnion([
        ('tfidf_features', tfidf_branch),
        ('cosine_sim', cosine_branch),
    ])

    return Pipeline([
        ('features', feature_union),
        ('clf', stack),
    ])

# Register the FE 1 configuration and run it on the loaded splits.
experiment = Experiment(
    name="FE 1 cosine sim EBM stack",
    description="Stack ensemble with EBM and cosine sim feature engineering",
    pipeline_factory=pipeline_factory,
)

feature_engineering_manager.run_experiment(experiment, splits=splits)



=== Running Experiment: FE 1 cosine sim EBM stack ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6832

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6821
   Micro F1:     0.6832
   Weighted F1:  0.6821

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6857  R: 0.6832
   Micro    - P: 0.6832  R: 0.6832
   Weighted - P: 0.6857  R: 0.6832

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6642     0.7410     0.7005        857
   Not Fit              0.7071     0.6254     0.6638        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6857     0.6832     0.6821       1714
   weighted avg         0.6857     0.6832     0.6821       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fit  

<utils.ExperimentManger.Experiment at 0x7fccde5ebf20>

In [34]:
import spacy
import numpy as np
from spacy.matcher import PhraseMatcher
from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor
from sklearn.base import BaseEstimator, TransformerMixin
from scipy import sparse
import pandas as pd
import re

class SkillNERCountTransformer(BaseEstimator, TransformerMixin):
    """Return two columns: [resume_skill_cnt, jd_skill_cnt].

    Each count is the number of SkillNER "full_matches" plus "ngram_scored"
    hits found in the (cleaned) text. The spaCy model and the SkillNER
    extractor are created on first use and are left out of the pickled state;
    they are rebuilt when next needed.

    Args:
        model: Name of the spaCy model passed to ``spacy.load``.
    """

    def __init__(self, model="en_core_web_sm"):
        self.model = model

    def __getstate__(self):
        """Pickle state without the spaCy pipeline and the SkillNER extractor."""
        state = self.__dict__.copy()
        state.pop("nlp", None)
        state.pop("skill_extractor", None)
        return state

    def __setstate__(self, state):
        """Restore state; heavy resources are rebuilt on first use, not here.

        Loading the model at unpickle time would defeat the lazy design and
        make merely loading a saved pipeline slow.
        """
        self.__dict__.update(state)

    def _lazy_init(self):
        """Create ``self.nlp`` and ``self.skill_extractor`` if not yet present."""
        if not hasattr(self, "skill_extractor"):
            # Imported here so the names resolve in whichever process first
            # needs the extractor.
            import spacy
            from spacy.matcher import PhraseMatcher
            from skillNer.general_params import SKILL_DB
            from skillNer.skill_extractor_class import SkillExtractor

            self.nlp = spacy.load(self.model)
            self.skill_extractor = SkillExtractor(
                self.nlp, SKILL_DB, PhraseMatcher
            )

    def _clean_text(self, text):
        """Clean and preprocess text to avoid SkillNER issues.

        Non-string input is stringified, characters outside the allowed set
        become spaces, whitespace runs collapse to one space, and the result
        is capped at 10000 characters. Text that ends up empty is replaced by
        the placeholder "No content available".
        """
        if not isinstance(text, str):
            text = str(text)

        # Remove or replace problematic characters
        text = re.sub(r'[^\w\s\-\.\,\;\:\!\?\(\)\[\]\/\@\#\$\%\&\*\+\=\<\>\'\"]', ' ', text)

        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # After strip() an all-whitespace string is empty, so one check suffices.
        if not text:
            return "No content available"

        # Limit text length to avoid memory issues
        return text[:10000]

    def _safe_skill_extraction(self, text):
        """Return the skill count for ``text``; 0 if SkillNER fails on it.

        Extraction is best-effort: any exception from annotation is reported
        with ``print`` and counted as zero skills.
        """
        # Ensure the extractor exists *outside* the try block. Otherwise a
        # missing extractor would raise AttributeError below and be silently
        # reported as "0 skills" for every text.
        self._lazy_init()

        try:
            cleaned_text = self._clean_text(text)
            result = self.skill_extractor.annotate(cleaned_text)

            # Extract counts safely
            matches = result.get("results", {})
            full_matches = len(matches.get("full_matches", []))
            ngram_scored = len(matches.get("ngram_scored", []))

            return full_matches + ngram_scored

        except (IndexError, KeyError, AttributeError, ValueError) as e:
            print(f"Warning: SkillNER extraction failed: {e}")
            return 0
        except Exception as e:
            print(f"Unexpected error in skill extraction: {e}")
            return 0

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data."""
        return self

    def transform(self, X):
        """Count skills in every resume and job description.

        Args:
            X: DataFrame with 'resume_text' and 'job_description_text'
                columns, or a 2D ndarray with the two columns in that order.

        Returns:
            CSR matrix of shape (n_samples, 2), dtype float32:
            column 0 is the resume skill count, column 1 the job-description
            skill count.
        """
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=["resume_text", "job_description_text"])

        self._lazy_init()

        # The same text annotates to the same count, so each distinct text is
        # annotated once per call instead of once per occurrence.
        counts_by_text = {}

        def skill_count(raw):
            key = raw if isinstance(raw, str) else str(raw)
            if key not in counts_by_text:
                counts_by_text[key] = self._safe_skill_extraction(key)
            return counts_by_text[key]

        res_cnt, jd_cnt = [], []

        for resume, job_description in zip(X["resume_text"], X["job_description_text"]):
            res_cnt.append(skill_count(resume))
            jd_cnt.append(skill_count(job_description))

        # Return 2-column CSR matrix
        dense = np.c_[res_cnt, jd_cnt]
        return sparse.csr_matrix(dense, dtype=np.float32)

In [None]:
def pipeline_factory(params):
    """Build the pipeline for the "FE 2" experiment.

    Same stacked ensemble as the earlier factory of the same name (which this
    definition replaces once the cell runs), with two differences: the TF-IDF
    vocabulary is capped at 3000 terms, and SkillNER skill counts are added as
    a third feature branch.

    Args:
        params: Accepted for the factory interface; not used by this factory.

    Returns:
        An unfitted sklearn ``Pipeline``.
    """
    # Base learners: library defaults, with the shared seed where one is accepted.
    base_estimators = [
        ('lr', LogisticRegression(random_state=SEED)),
        ('nb', BernoulliNB()),
        ('rf', RandomForestClassifier(random_state=SEED,)),
    ]

    stack = StackingClassifier(
        estimators=base_estimators,
        final_estimator=ExplainableBoostingClassifier(random_state=SEED,),
        cv=20,
        n_jobs=1,
    )

    # Branch 1: joined text -> TF-IDF (<= 3000 terms) -> 100 best terms by chi2.
    tfidf_branch = Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer(max_features=3000)),
        ('selector', SelectKBest(chi2, k=100)),
    ])

    # Branch 2: cosine-similarity column, passed through unscaled.
    cosine_branch = FunctionTransformer(compute_cosine_similarity, validate=False)

    # Branch 3: resume / job-description skill counts from SkillNER.
    skill_branch = SkillNERCountTransformer()

    feature_union = FeatureUnion([
        ('tfidf_features', tfidf_branch),
        ('cosine_sim', cosine_branch),
        ('skillner_features', skill_branch),
    ])

    return Pipeline([
        ('features', feature_union),
        ('clf', stack),
    ])

# FE 2: same ensemble, with the SkillNER counts added to the feature set.
experiment = Experiment(
    name="FE 2 cosine sim + skillNER EBM stack",
    description="Stack ensemble with EBM, cosine sim, and SkillNER feature engineering",
    pipeline_factory=pipeline_factory,
)

feature_engineering_manager.run_experiment(experiment, splits=splits)


=== Running Experiment: FE 2 cosine sim + skillNER EBM stack ===
loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


  vec_similarity = token1.similarity(token2)


