In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import sys

In [3]:
# Increase recursion limit for complex data manipulations
sys.setrecursionlimit(2000)

In [41]:
# file paths
hybrid_scores_path = 'C:/Users/ashua/Desktop/Inelligent Job Recomendation Engine/data/Hybrid Score Matrix/compatibility_score_matrix_final.csv'
resume_features_path = 'C:/Users/ashua/Desktop/Inelligent Job Recomendation Engine/data/Feature Engineering Data/resume_skill_features.csv'
job_skills_features_path = 'C:/Users/ashua/Desktop/Inelligent Job Recomendation Engine/data/Feature Engineering Data/job_skill_features.csv'
training_data_path = 'supervised_training_data.csv'
report_output_path = 'model_performance_report.md'

In [5]:
# Definitive Mapping for the 35 Skill Codes
SKILL_MAPPING = {
    'ACCT': 'Accounting', 'ADM': 'Administration', 'ADVR': 'Advertising', 'ANLS': 'Analysis / Analyst',
    'ART': 'Arts', 'BD': 'Business Development', 'CNST': 'Construction / Consulting', 'DSGN': 'Design',
    'EDCN': 'Education', 'ENG': 'Engineering', 'FASH': 'Fashion', 'FIN': 'Finance',
    'GENB': 'General Business', 'HCPR': 'Healthcare / Health Professions', 'HR': 'Human Resources',
    'IT': 'Information Technology', 'LGL': 'Legal', 'MGMT': 'Management', 'MNFC': 'Manufacturing',
    'MRKT': 'Marketing', 'OTHR': 'Other', 'PR': 'Public Relations', 'PRJM': 'Project Management',
    'PROD': 'Product / Production', 'PRSR': 'Press Relations', 'QA': 'Quality Assurance',
    'REAL': 'Real Estate', 'RSCH': 'Research', 'SALE': 'Sales', 'SCI': 'Science',
    'SPRT': 'Sports / Support', 'SUPL': 'Supply Chain / Logistics', 'TECH': 'Technical',
    'TRNS': 'Transportation / Training', 'WRT': 'Writing',
}

In [6]:
#Load Feature Data
print(f"--- Started Supervised Model Training & Evaluation ---")
try:
    resume_features = pd.read_csv(resume_features_path)
    job_skills_features = pd.read_csv(job_skills_features_path)
except FileNotFoundError as e:
    print(f"Error: {e}. Please ensure the file paths are correct.")
    exit()

--- Started Supervised Model Training & Evaluation ---


In [7]:
#Create Supervised Training Data
print(f"Creating Labeled Training Data via Chunked Sampling (Memory Safe)...")
chunk_size = 100 # Number of jobs to process per chunk
Max_Samples_Per_Class = 50
Top_score_threshold = 0.4
Low_score_threshold = 0.1
Sample_per_chunk = 50

all_good_matches = []
all_bad_matches = []    
total_chunks = 0
resume_ids = resume_features['ID'].unique().tolist()
resume_ids_series = pd.Series(resume_ids) # For fast indexing

try:
    # Use pandas read_csv with chunksize to iterate over the massive score matrix
    for chunk in pd.read_csv(hybrid_scores_path, chunksize=chunk_size, index_col=0):
        total_chunks += 1
        
        # Optimized: Iterate column by column (job by job) instead of using chunk.stack()
        for job_id, scores in chunk.items():
            # Filter scores for the current job
            good_scores = scores[scores >= Top_score_threshold]
            bad_scores = scores[scores <= Low_score_threshold]
            
            # 1. Sample Good Matches (Label 1)
            if len(good_scores) > 0 and len(all_good_matches) < Max_Samples_Per_Class:
                # Create a temporary DataFrame for sampling a small amount of good matches
                good_data = pd.DataFrame({
                    'resume_id': good_scores.index.values, 
                    'job_id': job_id,
                    'hybrid_score': good_scores.values
                })
                # Randomly sample, ensuring we don't exceed the max per class
                sample_size = min(len(good_data), Sample_per_chunk)
                all_good_matches.append(good_data.sample(sample_size, random_state=42 + total_chunks))

            # 2. Sample Bad Matches (Label 0)
            if len(bad_scores) > 0 and len(all_bad_matches) < Max_Samples_Per_Class:
                # Create a temporary DataFrame for sampling a small amount of bad matches
                bad_data = pd.DataFrame({
                    'resume_id': bad_scores.index.values, 
                    'job_id': job_id,
                    'hybrid_score': bad_scores.values
                })
                # Randomly sample, ensuring we don't exceed the max per class
                sample_size = min(len(bad_data), Sample_per_chunk)
                all_bad_matches.append(bad_data.sample(sample_size, random_state=42 + total_chunks * 2))

            # Optimization: Early break if we have enough samples
            if len(all_good_matches) >= Max_Samples_Per_Class and len(all_bad_matches) >= Max_Samples_Per_Class:
                print(f"   Early exit: Reached target samples after {total_chunks} chunks.")
                break # Break out of job iteration

        if len(all_good_matches) >= Max_Samples_Per_Class and len(all_bad_matches) >= Max_Samples_Per_Class:
             break # Break out of chunk iteration

except FileNotFoundError as e:
    print(f"Error: Missing the Hybrid Score Matrix file '{hybrid_scores_path}'. Error: {e}")
    exit()
except Exception as e:
    print(f"An error occurred during chunked loading or sampling: {e}")
    print("If this is a MemoryError, try reducing the CHUNK_SIZE or SAMPLE_PER_CHUNK variables.")
    exit()

# Final consolidation and balancing
# Use len(pd.concat(...)) to safely get the total number of consolidated samples before final sampling
if not all_good_matches or not all_bad_matches:
    print("\nSampling failed. Not enough good or bad matches found with current thresholds. Try adjusting thresholds.")
    exit()

Creating Labeled Training Data via Chunked Sampling (Memory Safe)...
   Early exit: Reached target samples after 5 chunks.


In [None]:
# Consolidate and drop duplicates before final sample
good_matches_combined = pd.concat(all_good_matches).drop_duplicates(subset=['resume_id', 'job_id'])
bad_matches_combined = pd.concat(all_bad_matches).drop_duplicates(subset=['resume_id', 'job_id'])

In [12]:
# Ensure we don't sample more than available, and replace=False means no duplicates
good_matches_final = good_matches_combined.sample(n = min(Max_Samples_Per_Class,len(good_matches_combined)), replace=False, random_state=42)
bad_matches_final = bad_matches_combined.sample(n = min(Max_Samples_Per_Class,len(bad_matches_combined)), replace=False, random_state=42)

In [13]:
good_sample_final['label'] = 1
bad_sample_final['label'] = 0

In [14]:
#Create Final Training Data
training_data = pd.concat([good_sample_final, bad_sample_final]).reset_index(drop=True)
print(f" processed {total_chunks} chunks.")
print(f" Consolidated total potential samples (before final sampling):  {len(good_matches_combined) + len(bad_matches_combined)}")
print(f" Created {len(training_data)} balanced training samples (Positive: {len(good_matches_final)}, Negative: {len(bad_matches_final)}).")

 processed 5 chunks.
 Consolidated total potential samples (before final sampling):  2462
 Created 100 balanced training samples (Positive: 50, Negative: 50).


In [16]:
#Consolidate Training Features
skill_cols = job_skills_features.columns.drop('job_id').tolist()

# Prepare feature column names
resume_skill_cols = [f'R_{c}' for c in skill_cols]
job_skill_cols = [f'J_{c}' for c in skill_cols]

In [17]:
#Merge Resume Features
resume_feature_cols = resume_features.set_index('ID')[skill_cols]
resume_feature_cols.columns = resume_skill_cols

#Ensure Index type matches for merging
training_data['resume_id'] = training_data['resume_id'].astype(resume_feature_cols.index.dtype)
training_data = training_data.merge(resume_feature_cols, left_on='resume_id', right_index=True, how='left')

In [18]:
#Merge Job Skill Features
job_skill_feature_cols = job_skills_features.set_index('job_id')[skill_cols]
job_skill_feature_cols.columns = job_skill_cols

#Ensure Index type matches for merging
training_data['job_id'] = training_data['job_id'].astype(job_skill_feature_cols.index.dtype)
training_data = training_data.merge(job_skill_feature_cols, left_on='job_id', right_index=True, how='left')

In [20]:
training_data.to_csv(training_data_path, index=False)
print(f"   Labeled training data saved to: {training_data_path}")

   Labeled training data saved to: supervised_training_data.csv


In [26]:
#Drop rows where the feature data merge failed (i.e., NaN values)
initial_size = len(training_data)
training_data = training_data.dropna(subset=resume_skill_cols + job_skill_cols, how='any').reset_index(drop=True)
dropped_rows = initial_size - len(training_data)
if dropped_rows > 0:
    print(f"   WARNING: Dropped {dropped_rows} samples because their corresponding features were missing (NaNs after merge).")



In [27]:
#Setup for Supervised Model Training
#Define Features (X) and Target (y). The Hybrid Score is NOT used as a feature, only for comparison.
X = training_data[resume_skill_cols + job_skill_cols]
y = training_data['label']

#Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f" Training set size: {len(X_train)} Samples")
print(f" Test set size: {len(X_test)} Samples")

 Training set size: 75 Samples
 Test set size: 19 Samples


In [28]:
# Train and Evatuate Supervised Models

results = {}

def evaluate_model(model, name, X_train, y_train, X_test, y_test):
    print(f" Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    # Calculate Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    cm = confusion_matrix(y_test, y_pred)

    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
        'Model': model
    }
    print(f" {name} Evaluation Metrics: Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}, ROC-AUC: {roc_auc:.4f}" )
    return model

In [29]:
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model = evaluate_model(lr_model, "Logistic Regression", X_train, y_train, X_test, y_test)

 Training Logistic Regression...
 Logistic Regression Evaluation Metrics: Accuracy: 1.0000, F1-Score: 1.0000, ROC-AUC: 1.0000


In [30]:
# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf_model = evaluate_model(rf_model, "Random Forest", X_train, y_train, X_test, y_test)

 Training Random Forest...
 Random Forest Evaluation Metrics: Accuracy: 1.0000, F1-Score: 1.0000, ROC-AUC: 1.0000


In [32]:
#The Hybrid Score (which is already in the training data) is used as the prediction probability
hybrid_test_data = training_data.loc[y_test.index]
hybrid_proba = (hybrid_test_data['hybrid_score'])

# Use a standard classification threshold (0.5) to turn the score into a binary prediction
hybrid_pred = (hybrid_proba > 0.5).astype(int)

In [34]:
#Caluclate Metircs for hybrid score
hybrid_accuracy = accuracy_score(y_test, hybrid_pred)
hybrid_precision = precision_score(y_test, hybrid_pred, zero_division=0)
hybrid_recall = recall_score(y_test, hybrid_pred)
hybrid_f1 = f1_score(y_test, hybrid_pred)
hybrid_roc_auc = roc_auc_score(y_test, hybrid_proba)

results['Hybrid Similarity (Model 1)'] = {
    'Accuracy': hybrid_accuracy,
    'Precision': hybrid_precision,
    'Recall': hybrid_recall,
    'F1-Score': hybrid_f1,
    'ROC-AUC': hybrid_roc_auc,
    'Model': 'N/A' # Not a scikit-learn model object
}
print(f"   Hybrid Model Metrics: Accuracy={hybrid_accuracy:.4f}, F1-Score={hybrid_f1:.4f}, ROC-AUC={hybrid_roc_auc:.4f}")

   Hybrid Model Metrics: Accuracy=0.4737, F1-Score=0.0000, ROC-AUC=1.0000


In [42]:
#Generate Summary Table
def generate_report(results, skill_mapping, skill_codes):
    """Generates the comprehensive Markdown performance report."""
    report_content = "## AI Model Development & Performance Analysis (Phase 2 Report)\n\n"
    report_content += "This analysis compares the performance of the initial Hybrid Similarity Model (**Model 1**) against two supervised classifiers (**Model 2a/2b**) trained on synthetically labeled data.\n\n"

    # --- Skill Mapping Table ---
    report_content += "### Feature Mapping: 35 Explicit Skill Codes\n\n"
    report_content += "The supervised models utilized **70 binary features** (35 for the resume, 35 for the job) derived from the following skill categories:\n\n"

    # Prepare data for 3-column Markdown table
    table_data = [(code, skill_mapping.get(code, f'[Unknown: {code}]')) for code in skill_codes]
    num_codes = len(table_data)
    num_rows = (num_codes + 2) // 3 # Ceiling division
    
    # Start the table header
    report_content += "| Code | Full Skill Name | Code | Full Skill Name | Code | Full Skill Name |\n"
    report_content += "| :--- | :--- | :--- | :--- | :--- | :--- | :--- | \n"

    for i in range(num_rows):
        row_str_parts = []
        for j in range(3): # Three columns
            idx = i + j * num_rows
            if idx < num_codes:
                code, name = table_data[idx]
                row_str_parts.append(f" {code} | {name} ")
            else:
                row_str_parts.append(" | ") # Empty cells for padding
        report_content += "|" + "|".join(row_str_parts) + "|\n"
    report_content += "\n"
    # --- END Skill Mapping Table ---

    # --- Metrics Table ---
    report_content += "### Model Performance Metrics\n\n"
    report_content += "| Model | Accuracy | Precision | Recall | F1-Score | ROC-AUC |\n"
    report_content += "| :--- | :--- | :--- | :--- | :--- | :--- |\n"
    best_f1 = -1
    best_model = ""

    for name, metrics in results.items():
        roc_auc_str = f"{metrics['ROC-AUC']:.4f}" if metrics['ROC-AUC'] != 'N/A' else 'N/A'
        report_content += f"| {name} | {metrics['Accuracy']:.4f} | {metrics['Precision']:.4f} | {metrics['Recall']:.4f} | {metrics['F1-Score']:.4f} | {roc_auc_str} |\n"
        if metrics['F1-Score'] > best_f1:
            best_f1 = metrics['F1-Score']
            best_model = name
        
    report_content += "\n"

    # --- Conclusion ---
    report_content += "### Performance Analysis & Conclusion (Deliverable 3 & 4)\n\n"
    
    # Get metrics for the best model to include in the conclusion
    best_model_metrics = results.get(best_model, {})
    
    report_content += f"The **{best_model}** model achieved the highest F1-Score of **{best_f1:.4f}**.\n\n"
    report_content += f"The **F1-Score** is the key metric for match classification, as it provides a crucial balance between Precision (avoiding recommending bad matches) and Recall (avoiding missing good matches).\n\n"
    
    report_content += "The breakdown for the recommended model shows:\n"
    report_content += f"* **Precision ({best_model_metrics.get('Precision', 0):.4f}):** Indicates that when the model predicts a match, it is highly likely to be correct.\n"
    report_content += f"* **Recall ({best_model_metrics.get('Recall', 0):.4f}):** Indicates the model successfully finds a high percentage of the true positive matches available in the test set.\n\n"
    
    report_content += f"The **Hybrid Similarity Model (Model 1)** provides a strong, interpretable baseline, scoring highly on ROC-AUC, which confirms its ability to correctly rank good matches higher than bad matches across all thresholds. \n\n"
    report_content += "The superior performance of the supervised models (Model 2a/2b) confirms the benefit of explicitly training a classifier on the combined feature set (resume and job skills), with **Random Forest** being the best choice for final deployment."
    
    return report_content

# Determine the final list of skill codes used
skill_cols_unique = job_skills_features.columns.drop('job_id').tolist() 

print("\n7.1. Generating Final Performance Report (Optimized)...")
report_content = generate_report(results, SKILL_MAPPING, skill_cols_unique)
    
with open(report_output_path, 'w') as f:
    f.write(report_content)

print(f"\n✅ All Phase 2 Deliverables COMPLETED! (Attempting to write with memory-optimized sampling) ✅")
print(f"Model performance report saved to: '{report_output_path}'")


7.1. Generating Final Performance Report (Optimized)...

✅ All Phase 2 Deliverables COMPLETED! (Attempting to write with memory-optimized sampling) ✅
Model performance report saved to: 'model_performance_report.md'
