In [5]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

RESUME_PATH = "../data/resume/UpdatedResumeDataSet.csv"

def clean_resume(text):
    # Remove URLs, hashtags, mentions, and special characters
    text = re.sub('http\S+\s*', ' ', text)
    text = re.sub('RT|cc', ' ', text)
    text = re.sub('#\S+', '', text)
    text = re.sub('@\S+', '  ', text)
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    text = re.sub(r'[^\x00-\x7f]', r' ', text) 
    text = re.sub('\s+', ' ', text)
    return text.strip()

# Load and Clean
df = pd.read_csv(RESUME_PATH)
df['Resume_text'] = df['Resume'].apply(clean_resume) # Kaggle column is usually named 'Resume'

# 1. Split off Pre-Evaluation (10% - Our baseline)
df_main, df_pre_eval = train_test_split(df, test_size=0.10, random_state=42, stratify=df['Category'])

# 2. Split remainder into Training (70%) and Test (20%)
# 0.22 of 0.9 is roughly 20% of the total
df_train, df_test = train_test_split(df_main, test_size=0.22, random_state=42, stratify=df_main['Category'])

print(f"üìä Dataset Split Complete:")
print(f"   - Training Set: {len(df_train)} (To teach Gemma)")
print(f"   - Test Set: {len(df_test)} (The Final Exam)")
print(f"   - Pre-Eval Set: {len(df_pre_eval)} (The Baseline)")

üìä Dataset Split Complete:
   - Training Set: 674 (To teach Gemma)
   - Test Set: 191 (The Final Exam)
   - Pre-Eval Set: 97 (The Baseline)


  text = re.sub('http\S+\s*', ' ', text)
  text = re.sub('#\S+', '', text)
  text = re.sub('@\S+', '  ', text)
  text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
  text = re.sub('\s+', ' ', text)


In [11]:
import pandas as pd
import ollama
from tqdm import tqdm
import json

def run_baseline_on_kaggle_subset(subset_df, model_name="gemma3:4b"):
    baseline_logs = []
    correct_hits = 0
    
    print(f"üöÄ Benchmarking {len(subset_df)} resumes against {model_name}...")
    
    for idx, row in tqdm(subset_df.iterrows(), total=len(subset_df)):
        category = row['Category']
        resume_text = row['Resume_text'][:2000] # Limit for speed
        
        prompt = f"""
        TASK: Identify the professional category and evaluate the candidate.
        RESUME: {resume_text}
        
        INSTRUCTIONS:
        1. Identify the Job Category.
        2. Give a fit score (0-10).
        3. Provide a 1-sentence rationale.
        """
        
        response = ollama.generate(model=model_name, prompt=prompt)['response']
        
        # Simple accuracy check: Does the Category appear in the AI response?
        is_correct = 1 if category.lower() in response.lower() else 0
        correct_hits += is_correct
        
        baseline_logs.append({
            "true_category": category,
            "ai_response": response,
            "was_correct": is_correct
        })
        
    accuracy = (correct_hits / len(subset_df)) * 100
    return baseline_logs, accuracy

# Execute Baseline
baseline_data, pre_accuracy = run_baseline_on_kaggle_subset(df_pre_eval)

print(f"\nüìä BASELINE RESULTS:")
print(f"   - Categorization Accuracy: {pre_accuracy:.2f}%")
print(f"   - Metrics saved to 'baseline_results.json'")

# Save for Post-Training comparison
with open('baseline_results.json', 'w') as f:
    json.dump(baseline_data, f)

üöÄ Benchmarking 97 resumes against gemma3:4b...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 97/97 [04:10<00:00,  2.58s/it]


üìä BASELINE RESULTS:
   - Categorization Accuracy: 70.10%
   - Metrics saved to 'baseline_results.json'





In [20]:
%pip install -U mlx-lm

Note: you may need to restart the kernel to use updated packages.


In [14]:
import json
import os

# Create a 'data' folder for MLX
os.makedirs("../data/mlx_data", exist_ok=True)

def format_for_mlx(df, output_path):
    with open(output_path, "w") as f:
        for _, row in df.iterrows():
            # Gemma 3 Chat Template formatting
            prompt = f"Identify the category and evaluate this resume: {row['Resume_text'][:1000]}"
            completion = f"Category: {row['Category']}. Rationale: Strong matches found for professional standards in {row['Category']}."
            
            # MLX expects a single "text" key per line
            full_text = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n{completion}<end_of_turn>"
            f.write(json.dumps({"text": full_text}) + "\n")

# Using the splits you made earlier
format_for_mlx(df_train, "../data/mlx_data/train.jsonl")
format_for_mlx(df_test, "../data/mlx_data/valid.jsonl")

print("‚úÖ Data files generated in /mlx_data folder!")

‚úÖ Data files generated in /mlx_data folder!


In [16]:
import os
print("Checking data directory:", os.path.abspath("../data/mlx_data"))
print("Files found:", os.listdir("../data/mlx_data"))

Checking data directory: /Users/I060587/Documents/GitHub/intelligent-recruiter/data/mlx_data
Files found: ['train.jsonl', 'valid.jsonl']


In [2]:
# Force install into your specific virtual environment
!../venv/bin/python -m pip install -U mlx-lm mlx

zsh:1: no such file or directory: ../venv/bin/python


In [13]:
from mlx_lm import load, generate
import pandas as pd

# 1. Load the model + your new adapters
model, tokenizer = load(
    "google/gemma-3-4b-it", 
    adapter_path="../gemma_recruiter_adapters_v2"
)

# 2. Pick a few samples from your test set (unseen data)
test_samples = df_test.sample(5)

print(f"{'Actual Category':<20} | {'Gemma 3 Prediction'}")
print("-" * 50)

for _, row in test_samples.iterrows():
    # Format the prompt exactly like we did in training
    prompt = f"<start_of_turn>user\nIdentify the category and evaluate this resume: {row['Resume_text'][:1000]}<end_of_turn>\n<start_of_turn>model\n"
    
    # Generate the response
    response = generate(model, tokenizer, prompt=prompt, max_tokens=50)
    
    print(f"{row['Category']:<20} | {response.strip()}")

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Actual Category      | Gemma 3 Prediction
--------------------------------------------------
SAP Developer        | Category: SAP Developer
Hadoop               | Category: Hadoop
Blockchain           | Category: Blockchain
Health and fitness   | Category: Health and fitness
Testing              | Category: Testing


In [8]:
import json

# Define the path where the test file should go
test_path = "../data/mlx_data/test.jsonl"

with open(test_path, "w") as f:
    for _, row in df_test.iterrows():
        # Matching the exact prompt format used in training
        entry = {
            "input": f"Identify the category and evaluate this resume: {row['Resume_text'][:1000]}",
            "output": f"Category: {row['Category']}. Rationale: Strong matches found for professional standards in {row['Category']}."
        }
        f.write(json.dumps(entry) + "\n")

print(f"‚úÖ Successfully created {test_path} with {len(df_test)} samples.")

‚úÖ Successfully created ../data/mlx_data/test.jsonl with 191 samples.


In [9]:
import json
from sklearn.metrics import accuracy_score, classification_report

y_true = []
y_pred = []

# Load the test data
with open("../data/mlx_data/test.jsonl", "r") as f:
    for line in f:
        data = json.loads(line)
        # Extract the ground truth category from the 'output' field
        # This assumes your JSONL output was "Category: X"
        actual = data['output'].split("Category: ")[1].split(".")[0].strip()
        y_true.append(actual)
        
        # Get the prediction from the model
        prompt = f"<start_of_turn>user\nIdentify the category and evaluate this resume: {data['input'][:1000]}<end_of_turn>\n<start_of_turn>model\n"
        response = generate(model, tokenizer, prompt=prompt, max_tokens=20)
        
        try:
            prediction = response.strip().split("Category: ")[1].split(".")[0].strip()
        except:
            prediction = "Error"
        y_pred.append(prediction)

# Calculate final stats
print(f"Final Accuracy: {accuracy_score(y_true, y_pred) * 100:.2f}%")
print("\nDetailed Report:\n", classification_report(y_true, y_pred))

Final Accuracy: 66.49%

Detailed Report:
                            precision    recall  f1-score   support

                 Advocate       0.00      0.00      0.00         4
        Android Developer       0.00      0.00      0.00         0
                     Arts       1.00      0.29      0.44         7
               Automation       0.00      0.00      0.00         0
       Automation Testing       1.00      0.40      0.57         5
               Blockchain       1.00      1.00      1.00         8
         Business Analyst       1.00      1.00      1.00         5
           Civil Engineer       1.00      1.00      1.00         5
             Data Science       1.00      1.00      1.00         8
                 Database       1.00      1.00      1.00         7
          Design Engineer       0.00      0.00      0.00         0
          DevOps Engineer       1.00      0.91      0.95        11
         DotNet Developer       1.00      1.00      1.00         5
                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [14]:
import json
from sklearn.metrics import accuracy_score, classification_report

y_true = []
y_pred = []

# Load the test data
with open("../data/mlx_data/test.jsonl", "r") as f:
    for line in f:
        data = json.loads(line)
        # Extract the ground truth category from the 'output' field
        # This assumes your JSONL output was "Category: X"
        actual = data['output'].split("Category: ")[1].split(".")[0].strip()
        y_true.append(actual)
        
        # Get the prediction from the model
        prompt = f"<start_of_turn>user\nIdentify the category and evaluate this resume: {data['input'][:1000]}<end_of_turn>\n<start_of_turn>model\n"
        response = generate(model, tokenizer, prompt=prompt, max_tokens=20)
        
        try:
            prediction = response.strip().split("Category: ")[1].split(".")[0].strip()
        except:
            prediction = "Error"
        y_pred.append(prediction)

# Calculate final stats
print(f"Final Accuracy: {accuracy_score(y_true, y_pred) * 100:.2f}%")
print("\nDetailed Report:\n", classification_report(y_true, y_pred))

Final Accuracy: 92.15%

Detailed Report:
                            precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         4
                     Arts       1.00      0.29      0.44         7
    Arts Commerce Science       0.00      0.00      0.00         0
             Arts Manager       0.00      0.00      0.00         0
       Automation Testing       1.00      1.00      1.00         5
               Blockchain       1.00      1.00      1.00         8
         Business Analyst       1.00      1.00      1.00         5
           Civil Engineer       1.00      1.00      1.00         5
             Data Science       1.00      1.00      1.00         8
                 Database       1.00      0.71      0.83         7
          DevOps Engineer       0.85      1.00      0.92        11
         DotNet Developer       0.83      1.00      0.91         5
            ETL Developer       0.80      1.00      0.89         8
   Electrical Engin

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [11]:
import json
import os

# 1. Define paths
data_dir = "../data/mlx_data"
train_file = os.path.join(data_dir, "train.jsonl")
valid_file = os.path.join(data_dir, "valid.jsonl")

def strict_mlx_format(df, output_path):
    with open(output_path, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            # IMPORTANT: MLX needs a leading space for some completions to tokenize correctly
            # We also ensure NO extra keys are in the dictionary
            entry = {
                "prompt": f"Identify the category and evaluate this resume: {str(row['Resume_text'])[:1000]}",
                "completion": f" Category: {row['Category']}. Rationale: Strong matches found for professional standards in {row['Category']}."
            }
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    print(f"‚úÖ Verified & Created: {output_path}")

# 2. Re-create the files
strict_mlx_format(df_train, train_file)
strict_mlx_format(df_test, valid_file)

# 3. Final Sanity Check: Read the first line of the new file
with open(train_file, 'r') as f:
    first_line = json.loads(f.readline())
    print(f"\nSanity Check - Keys found: {list(first_line.keys())}")
    if set(first_line.keys()) == {"prompt", "completion"}:
        print("üöÄ KEYS ARE PERFECT.")
    else:
        print("‚ùå KEYS ARE WRONG. Should only be ['prompt', 'completion']")

‚úÖ Verified & Created: ../data/mlx_data/train.jsonl
‚úÖ Verified & Created: ../data/mlx_data/valid.jsonl

Sanity Check - Keys found: ['prompt', 'completion']
üöÄ KEYS ARE PERFECT.


In [12]:
import json
import os
from pathlib import Path

# Use ABSOLUTE paths to avoid path resolution bugs
data_dir = Path("./mlx_data_clean").resolve()
os.makedirs(data_dir, exist_ok=True)

def write_clean_jsonl(df, name):
    path = data_dir / f"{name}.jsonl"
    with open(path, "w", encoding="utf-8") as f:
        for i, (_, row) in enumerate(df.iterrows()):
            entry = {
                "prompt": f"Identify category: {str(row['Resume_text'])[:500]}",
                "completion": f" Category: {row['Category']}"
            }
            # Remove any possible newlines within the text itself
            json_record = json.dumps(entry, ensure_ascii=False)
            f.write(json_record + ("\n" if i < len(df) - 1 else "")) # No trailing newline at end of file
    print(f"‚úÖ Created clean {name}.jsonl at {path}")

write_clean_jsonl(df_train, "train")
write_clean_jsonl(df_test, "valid")

print(f"\nüöÄ COPY THIS PATH FOR YOUR COMMAND: {data_dir}")

‚úÖ Created clean train.jsonl at /Users/I060587/Documents/GitHub/intelligent-recruiter/pipelines/mlx_data_clean/train.jsonl
‚úÖ Created clean valid.jsonl at /Users/I060587/Documents/GitHub/intelligent-recruiter/pipelines/mlx_data_clean/valid.jsonl

üöÄ COPY THIS PATH FOR YOUR COMMAND: /Users/I060587/Documents/GitHub/intelligent-recruiter/pipelines/mlx_data_clean
