In [1]:
# Cell 1: Import all required libraries
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import TextPreprocessor as tp

# Data Preparation

In [2]:
ds = load_dataset("cnamuangtoun/resume-job-description-fit")
train_df = ds['train'].to_pandas()
test_df = ds['test'].to_pandas()

# Create train/validation split
train_df, val_df = train_test_split(train_df, test_size=0.15,
                                   stratify=train_df["label"], random_state=42)

# Create label mapping
label_to_id = {"Good Fit": 0, "No Fit": 1, "Potential Fit": 2}
id_to_label = {0: "Good Fit", 1: "No Fit", 2: "Potential Fit"}

train_df["labels"] = train_df["label"].map(label_to_id)
val_df["labels"] = val_df["label"].map(label_to_id)
test_df["labels"] = test_df["label"].map(label_to_id)

print(f"Data loaded and split:")
print(f"Training: {len(train_df)} samples")
print(f"Validation: {len(val_df)} samples")
print(f"Test: {len(test_df)} samples")
print(f"Label distribution in training:")
print(train_df["label"].value_counts())

Data loaded and split:
Training: 5304 samples
Validation: 937 samples
Test: 1759 samples
Label distribution in training:
label
No Fit           2671
Potential Fit    1322
Good Fit         1311
Name: count, dtype: int64


# Text Preprocessing

In [3]:
print("Applying text preprocessing...")
preprocessor = tp.TextPreprocessor()

# Clean all datasets
train_df = preprocessor.process_dataset(train_df)
val_df = preprocessor.process_dataset(val_df)
test_df = preprocessor.process_dataset(test_df)

print("Text preprocessing completed!")

# Show a before/after example
print("\nExample of cleaned text:")
print("Resume (first 200 chars):", train_df.iloc[0]['resume_text'][:200])
print("Job Description (first 200 chars):", train_df.iloc[0]['job_description_text'][:200])

Applying text preprocessing...
Processing resume texts with comprehensive cleaning...
Processing job description texts with comprehensive cleaning...
Processing resume texts with comprehensive cleaning...
Processing job description texts with comprehensive cleaning...
Processing resume texts with comprehensive cleaning...
Processing job description texts with comprehensive cleaning...
Text preprocessing completed!

Example of cleaned text:
Resume (first 200 chars): Professional Summary With the attitude of learning I am looking for an internship from summer 2017 to gain as much as knowledge as I can and contribute to the organizations success. Core Qualification
Job Description (first 200 chars): Immediate need 3-Month Contract to Hire- no C 2 C consultants Must be eligible for hire without sponsorship 100% remote working EST hours Must Haves: 5 years of experience AWS, Java with Spring boot, 


# Feature Extraction

In [4]:
# Step 1: Combine resume and job description texts
def combine_texts(resume_text, job_desc_text):
    """Combine resume and job description for feature extraction"""
    return resume_text + " [SEP] " + job_desc_text

print("Combining resume and job description texts...")

# Create combined texts for training and validation
train_combined = [combine_texts(row['resume_text'], row['job_description_text']) 
                  for _, row in train_df.iterrows()]
val_combined = [combine_texts(row['resume_text'], row['job_description_text']) 
                for _, row in val_df.iterrows()]

print(f"Combined texts created:")
print(f"Training: {len(train_combined)} texts")
print(f"Validation: {len(val_combined)} texts")

print(f"\nExample combined text (first 300 chars):")
print(train_combined[0])

Combining resume and job description texts...
Combined texts created:
Training: 5304 texts
Validation: 937 texts

Example combined text (first 300 chars):
Professional Summary With the attitude of learning I am looking for an internship from summer 2017 to gain as much as knowledge as I can and contribute to the organizations success. Core Qualifications C, C++, C#, SQL pl / sql Operating Systems: Windows, Linux, unix HDL / HVL: Verilog, System Verilog Scripting Language: Unix Shell Scripting, PERL, Python, TCLS oftware proficiency: Cadence (Layout, Virtuoso, Spectre), Synopsys (DC Compiler), Modelsim, Questasim, TFS for version controller, Microsoft sql server 2012, Visual studio 2008 Experience Software Engineer,12/2015-08/2016 Torch Technologies, Inc.-Corpus Christi, TX, India Analyze internal processes and recommend and implement procedural changes to improve operations in database. Analyze the requirement and make the necessary changes in the existing modules. Support and maintena

In [5]:
from feature_extractors.TfidfFeatureExtractor import TfidfFeatureExtractor
from feature_extractors.FeatureExtractionPipeline import FeatureExtractionPipeline

# Create pipeline
pipeline = FeatureExtractionPipeline()

# Add extractors
print("🔧 Adding feature extractors...")

# TF-IDF extractor
tfidf_extractor = TfidfFeatureExtractor(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.85
)
pipeline.add_extractor(tfidf_extractor)

🔧 Adding feature extractors...
✅ Added TF-IDF to pipeline


<feature_extractors.FeatureExtractionPipeline.FeatureExtractionPipeline at 0x7ff208442270>

In [6]:
print("=== EXTRACTING FEATURES ===")

# Use smaller subset for initial testing
train_subset = train_df.head(1000)  # Start with 1000 samples
val_subset = val_df.head(200)       # Start with 200 samples

print(f"📊 Using subset for testing:")
print(f"   Training: {len(train_subset)} samples")
print(f"   Validation: {len(val_subset)} samples")

# Extract features using the pipeline
try:
    results = pipeline.extract_features(train_subset, val_subset)
    
    print(f"✅ Feature extraction completed!")
    print(f"📊 Results summary:")
    for name, result in results.items():
        train_shape = result['train']['shape']
        val_shape = result['val']['shape']
        density = result['train']['density']
        print(f"   {name}: Train{train_shape}, Val{val_shape}, Density:{density:.4f}")
        
except Exception as e:
    print(f"❌ Pipeline extraction failed: {e}")
    print("Let's debug step by step...")

=== EXTRACTING FEATURES ===
📊 Using subset for testing:
   Training: 1000 samples
   Validation: 200 samples
FEATURE EXTRACTION PIPELINE
🔧 Preparing training texts...
✅ Prepared 1000 training texts

🚀 Fitting 1 extractors...
Fitting TF-IDF on 1000 documents...
✅ TF-IDF fitted in 0.70 seconds
   ✓ TF-IDF fitted successfully

--- TRANSFORMING TRAINING DATA ---
🔄 Transforming 1000 texts with 1 extractors...
   ✓ TF-IDF: (1000, 5000), density=0.0970, 0.45s

--- TRANSFORMING VALIDATION DATA ---
🔄 Transforming 200 texts with 1 extractors...
   ✓ TF-IDF: (200, 5000), density=0.0931, 0.09s

✅ Feature extraction completed for 1 extractors
✅ Feature extraction completed!
📊 Results summary:
   TF-IDF: Train(1000, 5000), Val(200, 5000), Density:0.0970


In [7]:
# Check what extractors you have
print(f"📊 Available extractors: {list(results.keys())}")

# Get summary statistics
summary = pipeline.get_summary()
print(f"\n📈 Feature Extraction Summary:")
print(summary.round(4))

# Analyze each extractor's results
for extractor_name, result in results.items():
    print(f"\n🔍 {extractor_name} Analysis:")
    train_data = result['train']
    val_data = result['val']
    extractor = result['extractor']
    
    print(f"   Training shape: {train_data['shape']}")
    print(f"   Validation shape: {val_data['shape']}")
    print(f"   Matrix density: {train_data['density']:.4f}")
    print(f"   Memory usage: {train_data['memory_mb']:.1f} MB")
    print(f"   Fit time: {extractor.fit_time:.2f}s")
    print(f"   Transform time: {train_data['transform_time']:.2f}s")

📊 Available extractors: ['TF-IDF']

📈 Feature Extraction Summary:
  Extractor   Train_Shape  Features  Density  Memory_MB  Fit_Time_s  \
0    TF-IDF  (1000, 5000)      5000    0.097     3.7019      0.7025   

   Transform_Time_s  
0            0.4502  

🔍 TF-IDF Analysis:
   Training shape: (1000, 5000)
   Validation shape: (200, 5000)
   Matrix density: 0.0970
   Memory usage: 3.7 MB
   Fit time: 0.70s
   Transform time: 0.45s


In [9]:
# Extract features and labels for ML training
ml_ready_data = {}

for extractor_name, result in results.items():
    X_train_ext = result['train']['features']
    X_val_ext = result['val']['features']
    
    ml_ready_data[extractor_name] = {
        'X_train': X_train_ext,
        'X_val': X_val_ext,
        'train_shape': X_train_ext.shape,
        'val_shape': X_val_ext.shape
    }
    
    print(f"✅ {extractor_name} ready for ML:")
    print(f"   Training features: {X_train_ext.shape}")
    print(f"   Validation features: {X_val_ext.shape}")

# Get labels (same for all extractors)
y_train = train_subset['labels'].values
y_val = val_subset['labels'].values

print(f"\n✅ Labels ready:")
print(f"   Training labels: {len(y_train)}")
print(f"   Validation labels: {len(y_val)}")
print(f"   Label distribution: {np.bincount(y_train)}")

# Choose best extractor for initial ML training
summary = pipeline.get_summary()
best_extractor = summary.loc[summary['Density'].idxmax(), 'Extractor']
print(f"\n🏆 Best extractor (by density): {best_extractor}")

# Set up final variables for ML training (NO GLOBALS NEEDED!)
if best_extractor in ml_ready_data:
    X_train = ml_ready_data[best_extractor]['X_train']
    X_val = ml_ready_data[best_extractor]['X_val']
    
    print(f"✅ Final ML variables created:")
    print(f"   X_train: {X_train.shape}")
    print(f"   X_val: {X_val.shape}")
    print(f"   y_train: {len(y_train)}")
    print(f"   y_val: {len(y_val)}")

=== PREPARING DATA FOR MACHINE LEARNING ===
✅ TF-IDF ready for ML:
   Training features: (1000, 5000)
   Validation features: (200, 5000)

✅ Labels ready:
   Training labels: 1000
   Validation labels: 200
   Label distribution: [226 527 247]

🏆 Best extractor (by density): TF-IDF
✅ Final ML variables created:
   X_train: (1000, 5000)
   X_val: (200, 5000)
   y_train: 1000
   y_val: 200
