In [3]:
import sys, os
sys.path.append('..') 

In [4]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import pandas as pd
from utils.TextPreprocessor import TextPreprocessor

# Data Loading

In [6]:
# Load dataset from hugging face 
# cnamuangtoun/resume-job-description-fit

ds = load_dataset("cnamuangtoun/resume-job-description-fit")
train_df = ds['train'].to_pandas()
test_df = ds['test'].to_pandas()

# Create train/validation split
train_df, val_df = train_test_split(train_df, test_size=0.30,
                                   stratify=train_df["label"], random_state=42)

# Create label mapping
label_to_id = {"Good Fit": 0, "No Fit": 1, "Potential Fit": 2}

train_df["labels"] = train_df["label"].map(label_to_id)
val_df["labels"] = val_df["label"].map(label_to_id)
test_df["labels"] = test_df["label"].map(label_to_id)

print(f"Data loaded and split:")
print(f"Training: {len(train_df)} samples")
print(f"Validation: {len(val_df)} samples")
print(f"Test: {len(test_df)} samples")
print(f"Label distribution in training:")
print(train_df["label"].value_counts())

Data loaded and split:
Training: 4368 samples
Validation: 1873 samples
Test: 1759 samples
Label distribution in training:
label
No Fit           2200
Potential Fit    1089
Good Fit         1079
Name: count, dtype: int64


In [7]:
print("Applying text preprocessing...")

"""
Fixes common formatting issues like missing spaces, malformed sections,
and inconsistent punctuation using regex patterns. 
"""
preprocessor = TextPreprocessor()

# Clean all datasets
train_df = preprocessor.process_dataset(train_df)
val_df = preprocessor.process_dataset(val_df)
test_df = preprocessor.process_dataset(test_df)

Applying text preprocessing...
Processing resume texts with comprehensive cleaning...
Processing job description texts with comprehensive cleaning...
Processing resume texts with comprehensive cleaning...
Processing job description texts with comprehensive cleaning...
Processing resume texts with comprehensive cleaning...
Processing job description texts with comprehensive cleaning...


In [36]:
# Step 1: Combine resume and job description texts
def combine_texts(resume_text, job_desc_text):
    """Combine resume and job description for feature extraction"""
    return resume_text + " [SEP] " + job_desc_text

print("Combining resume and job description texts...")

# Create combined texts for training and validation
train_combined = [combine_texts(row['resume_text'], row['job_description_text']) 
                  for _, row in train_df.iterrows()]
val_combined = [combine_texts(row['resume_text'], row['job_description_text']) 
                for _, row in val_df.iterrows()]

print(f"Combined texts created:")
print(f"Training: {len(train_combined)} texts")
print(f"Validation: {len(val_combined)} texts")

print(f"\nExample combined text (first 300 chars):")
print(train_combined[0])

Combining resume and job description texts...
Combined texts created:
Training: 4368 texts
Validation: 1873 texts

Example combined text (first 300 chars):
Professional Summary Obtain a position in a professional organization where I can apply my skills and loyalty in exchange for career guidance, training and opportunity for advancement. Core Qualifications Microsoft Office (Word, Excel, Power Point, Access). FCR: Online Application for financial transactions. Experience Project Accountant,07/2012 to 12/2013 Jacobs Engineering Group Inc.-Tucker,, Afghanistan The project funded by USACE- United State Army Corps of Engineers Performed weekly Cash Counts and monthly Bank account reconciliations and reports back to the MTN / DC home office project accountant. Entered all transactions into the WEBFCR and uploaded backup to the WEBFCR on a daily basis Prepared cash flow projects for upcoming months (Cash forecast) and submitted the budget request every month. Uploaded all vendor/ contractor

In [9]:
def create_balanced_subset(df, target_size, label_col='labels', random_state=42):
    """
    Create a balanced subset of specified size
    
    Args:
        df: DataFrame to subset
        target_size: Total target size for subset
        label_col: Column containing labels
        random_state: Random seed
    
    Returns:
        Balanced subset DataFrame
    """
    
    # Get class distribution
    class_counts = Counter(df[label_col])
    n_classes = len(class_counts)
    
    print(f"Original size: {len(df)}")
    print(f"Original distribution: {dict(class_counts)}")
    
    # Calculate samples per class for balanced subset
    samples_per_class = target_size // n_classes
    
    print(f"Target subset size: {target_size}")
    print(f"Samples per class: {samples_per_class}")
    
    # Check if we have enough samples in each class
    min_available = min(class_counts.values())
    if samples_per_class > min_available:
        samples_per_class = min_available
        actual_size = samples_per_class * n_classes
        print(f"⚠️  Adjusted to {samples_per_class} per class (total: {actual_size})")
    
    # Sample from each class
    subset_dfs = []
    for label in sorted(class_counts.keys()):
        class_df = df[df[label_col] == label]
        sampled_df = class_df.sample(n=samples_per_class, random_state=random_state)
        subset_dfs.append(sampled_df)
    
    # Combine and shuffle
    balanced_subset = pd.concat(subset_dfs, ignore_index=True)
    balanced_subset = balanced_subset.sample(frac=1, random_state=random_state).reset_index(drop=True)
    
    # Verify balance
    final_counts = Counter(balanced_subset[label_col])
    print(f"Final distribution: {dict(final_counts)}")
    print(f"Final size: {len(balanced_subset)}")
    
    return balanced_subset

In [10]:
# Create balanced subsets
print("Creating balanced training subset...")
train_subset = create_balanced_subset(train_df, target_size=3000, random_state=42)

print(f"\nCreating balanced validation subset...")
val_subset = create_balanced_subset(val_df, target_size=1200, random_state=42)

print(f"\nCreating balanced test subset...")  
test_subset = create_balanced_subset(test_df, target_size=300, random_state=42)

Creating balanced training subset...
Original size: 4368
Original distribution: {1: 2200, 2: 1089, 0: 1079}
Target subset size: 3000
Samples per class: 1000
Final distribution: {1: 1000, 0: 1000, 2: 1000}
Final size: 3000

Creating balanced validation subset...
Original size: 1873
Original distribution: {0: 463, 1: 943, 2: 467}
Target subset size: 1200
Samples per class: 400
Final distribution: {2: 400, 0: 400, 1: 400}
Final size: 1200

Creating balanced test subset...
Original size: 1759
Original distribution: {1: 857, 2: 444, 0: 458}
Target subset size: 300
Samples per class: 100
Final distribution: {2: 100, 1: 100, 0: 100}
Final size: 300


In [13]:
# Set feature extractors - allows for easy addition of other extraction techniques

from feature_extractors.TfidfFeatureExtractor import TfidfFeatureExtractor
from feature_extractors.FeatureExtractionPipeline import FeatureExtractionPipeline

# Create pipeline
pipeline = FeatureExtractionPipeline()

# Add extractors
print("🔧 Adding feature extractors...")

# TF-IDF extractor
tfidf_extractor = TfidfFeatureExtractor(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.85
)

pipeline.add_extractor(tfidf_extractor)

🔧 Adding feature extractors...
✅ Added TF-IDF to pipeline


<feature_extractors.FeatureExtractionPipeline.FeatureExtractionPipeline at 0x7f95480f8f50>

In [14]:
# Extract features using the pipeline
try:
    results = pipeline.extract_features(train_subset, val_subset)
    test_results = pipeline.transform_all(test_subset)
    
    print(f"✅ Feature extraction completed!")
    print(f"📊 Results summary:")
    for name, result in results.items():
        train_shape = result['train']['shape']
        val_shape = result['val']['shape']
        density = result['train']['density']
        print(f"   {name}: Train{train_shape}, Val{val_shape}, Density:{density:.4f}")
        
except Exception as e:
    print(f"❌ Pipeline extraction failed: {e}")
    print("Let's debug step by step...")

FEATURE EXTRACTION PIPELINE
🔧 Preparing training texts...
✅ Prepared 3000 training texts

🚀 Fitting 1 extractors...
Fitting TF-IDF on 3000 documents...
✅ TF-IDF fitted in 1.91 seconds
   ✓ TF-IDF fitted successfully

--- TRANSFORMING TRAINING DATA ---
🔄 Transforming 3000 texts with 1 extractors...
   ✓ TF-IDF: (3000, 5000), density=0.0964, 1.34s

--- TRANSFORMING VALIDATION DATA ---
🔄 Transforming 1200 texts with 1 extractors...
   ✓ TF-IDF: (1200, 5000), density=0.0960, 0.53s

✅ Feature extraction completed for 1 extractors
🔄 Transforming 300 texts with 1 extractors...
   ✓ TF-IDF: (300, 5000), density=0.0969, 0.14s
✅ Feature extraction completed!
📊 Results summary:
   TF-IDF: Train(3000, 5000), Val(1200, 5000), Density:0.0964


In [15]:
# Get features directly (you only have TF-IDF anyway)
X_train = results['TF-IDF']['train']['features']
X_val = results['TF-IDF']['val']['features']
X_test = test_results['TF-IDF']['features']

# Get labels
y_train = train_subset['labels'].values
y_val = val_subset['labels'].values
y_test = test_subset['labels'].values

In [16]:
import pickle

# Create output directory
output_dir = "../data/processed/"
os.makedirs(output_dir, exist_ok=True)

# Export features
with open(f"{output_dir}/X_train.pkl", 'wb') as f:
    pickle.dump(X_train, f)

with open(f"{output_dir}/X_val.pkl", 'wb') as f:
    pickle.dump(X_val, f)

with open(f"{output_dir}/X_test.pkl", 'wb') as f:
    pickle.dump(X_test, f)

# Export labels
with open(f"{output_dir}/y_train.pkl", 'wb') as f:
    pickle.dump(y_train, f)

with open(f"{output_dir}/y_val.pkl", 'wb') as f:
    pickle.dump(y_val, f)

with open(f"{output_dir}/y_test.pkl", 'wb') as f:
    pickle.dump(y_test, f)