<a href="https://colab.research.google.com/github/adinplb/largedataset-JRec/blob/main/Job_Recommendation_Model_Training_%26_Inference_(Google_Colab).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import logging
import os
from datetime import datetime
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, models, util, losses, InputExample
from sentence_transformers.datasets import DenoisingAutoEncoderDataset
import traceback

In [2]:
# Root logging configuration: INFO level, "<timestamp> - <message>" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
# Logger shared by the helper functions and script cells of this notebook.
logger = logging.getLogger(__name__)

In [3]:
# --- 1. Configuration ---

# Model name that TSDAE pre-training starts from (also used for the tied decoder).
BASE_MODEL_NAME_FOR_TRAINING = 'sentence-transformers/all-MiniLM-L6-v2'

# Raw GitHub URLs of the original CSV data: job postings and the ONET occupation table.
DEFAULT_JOBS_CSV_SOURCE = "https://raw.githubusercontent.com/adinplb/tsdae-embeddings/refs/heads/master/dataset/Filtered_Jobs_4000.csv"
DEFAULT_ONET_CSV_SOURCE = "https://raw.githubusercontent.com/adinplb/tsdae-embeddings/refs/heads/master/dataset/Occupation%20Data.csv"

# Colab directory the final model is saved to and loaded from.
# When no model exists there yet, the notebook trains one and stores it here.
DEFAULT_TRAINED_MODEL_OUTPUT_DIR = "/content/trained_job_recommender_model"

# Stage 1 (TSDAE pre-training) hyperparameters.
TSDAE_EPOCHS = 1
TSDAE_BATCH_SIZE = 32  # Increase if you have a powerful GPU
TSDAE_LEARNING_RATE = 3e-5
TSDAE_MAX_SEQ_LENGTH = 256

# Stage 2 (SBERT fine-tuning) hyperparameters.
SBERT_EPOCHS = 1
SBERT_BATCH_SIZE = 16
SBERT_LEARNING_RATE = 2e-5

In [4]:
def process_jobs_csv_for_tsdae(filepath_or_df):
    """
    Load job postings and flatten each row into a single line of text.

    Args:
        filepath_or_df: Path or URL of the jobs CSV (``str`` or ``os.PathLike``),
            or an already loaded pandas DataFrame.

    Returns:
        A ``(jobs_df, texts)`` tuple. ``jobs_df`` is the loaded DataFrame (an
        independent copy when a DataFrame was passed in) and ``texts`` holds one
        string per row, in row order, so ``texts[i]`` describes ``jobs_df.iloc[i]``.
        Returns ``(None, [])`` when the input type is unsupported or the CSV cannot
        be read, and ``(jobs_df, [])`` when none of the expected columns exist.
    """
    logger.info(f"Processing jobs data. Input type: {type(filepath_or_df)}")
    if isinstance(filepath_or_df, pd.DataFrame):
        logger.info("Using provided DataFrame for jobs data.")
        # One copy is enough to decouple the returned frame from the caller's object.
        jobs_df = filepath_or_df.copy()
    elif isinstance(filepath_or_df, (str, os.PathLike)):
        logger.info(f"Reading jobs CSV from: {filepath_or_df}")
        try:
            jobs_df = pd.read_csv(filepath_or_df)
        except Exception as e:
            # Boundary handler: any read/parse/network failure is reported to the
            # caller through the (None, []) return value instead of propagating.
            logger.error(f"Error processing jobs data source {filepath_or_df}: {e}")
            return None, []
    else:
        logger.error("Invalid input for jobs data: Expected filepath string or pandas DataFrame.")
        return None, []

    columns_to_combine = [
        'Job.ID', 'Status', 'Title', 'Position', 'Company', 'City', 'State.Name',
        'Industry', 'Job.Description', 'Requirements', 'Salary', 'Employment.Type',
        'Education.Required'
    ]
    # Only columns actually present in the data are used; absent ones are ignored.
    existing_columns = [col for col in columns_to_combine if col in jobs_df.columns]
    if not existing_columns:
        logger.error("No specified columns for TSDAE found in the jobs CSV/DataFrame.")
        return jobs_df, []

    logger.info(f"Combining columns for TSDAE: {existing_columns}")
    # Missing cells become empty strings so they do not show up as "nan" in the text.
    jobs_df_filled = jobs_df[existing_columns].fillna('').astype(str)
    processed_texts = jobs_df_filled.agg(' '.join, axis=1).tolist()

    # Replace every newline / carriage return with a space in a single pass.
    line_breaks_to_spaces = str.maketrans('\n\r', '  ')
    cleaned_texts = [text.translate(line_breaks_to_spaces) for text in processed_texts]

    logger.info(f"Processed {len(cleaned_texts)} job entries for TSDAE.")
    return jobs_df, cleaned_texts

def process_onet_csv_for_sbert_training(filepath_or_df):
    """
    Build (title, description) training pairs from the ONET occupation data.

    Args:
        filepath_or_df: Path or URL of the ONET CSV (``str`` or ``os.PathLike``),
            or an already loaded pandas DataFrame. The input is only read.

    Returns:
        A list of ``InputExample`` objects with ``texts=[title, description]`` and
        ``label=1.0``, one per row that has both values. Returns an empty list when
        the input type is unsupported, the CSV cannot be read, or the ``Title`` /
        ``Description`` columns are missing.
    """
    logger.info(f"Processing ONET data. Input type: {type(filepath_or_df)}")
    if isinstance(filepath_or_df, pd.DataFrame):
        onet_df = filepath_or_df
    elif isinstance(filepath_or_df, (str, os.PathLike)):
        try:
            onet_df = pd.read_csv(filepath_or_df)
        except Exception as e:
            # Boundary handler: report the failure through the empty return value.
            logger.error(f"Error processing ONET data source {filepath_or_df}: {e}")
            return []
    else:
        logger.error("Invalid input for ONET data.")
        return []

    if 'Title' not in onet_df.columns or 'Description' not in onet_df.columns:
        logger.error("'Title' or 'Description' column not found in ONET CSV/DataFrame.")
        return []

    # A missing cell would otherwise be stringified to the literal text "nan" and
    # end up as a training pair, so incomplete rows are skipped.
    pairs = onet_df[['Title', 'Description']].dropna()
    skipped = len(onet_df) - len(pairs)
    if skipped:
        logger.warning(f"Skipping {skipped} ONET row(s) with a missing Title or Description.")

    def _single_line(value):
        # Keep each text on one line: newlines / carriage returns become spaces.
        return str(value).replace('\n', ' ').replace('\r', ' ')

    examples = [
        InputExample(texts=[_single_line(title), _single_line(description)], label=1.0)
        for title, description in zip(pairs['Title'], pairs['Description'])
    ]
    logger.info(f"Processed {len(examples)} ONET entries for SBERT fine-tuning.")
    return examples

In [5]:
def train_model_pipeline(jobs_data_src, onet_data_src, base_model, final_save_path):
    """
    Train the job-recommendation encoder in two stages and save it to disk.

    Stage 1 runs TSDAE (denoising auto-encoder) training on the flattened job
    texts. Stage 2 reloads that intermediate model and fine-tunes it on ONET
    title/description pairs with ``MultipleNegativesRankingLoss``.

    Args:
        jobs_data_src: CSV path/URL or DataFrame handed to ``process_jobs_csv_for_tsdae``.
        onet_data_src: CSV path/URL or DataFrame handed to
            ``process_onet_csv_for_sbert_training``.
        base_model: Model name or path used for the encoder and the tied TSDAE decoder.
        final_save_path: Directory the fine-tuned model is saved to (created if needed).

    Returns:
        True when both stages completed and the model was saved. False when either
        data source yields no training data; in that case no training is started.
    """
    logger.info("--- MODEL TRAINING PIPELINE INITIATED ---")

    # Load and validate BOTH datasets up front: a broken ONET source must not be
    # discovered only after the time-consuming TSDAE stage has already run.
    _, train_sentences_tsdae = process_jobs_csv_for_tsdae(jobs_data_src)
    if not train_sentences_tsdae:
        logger.error("TSDAE training failed: No job data processed.")
        return False

    sbert_train_samples = process_onet_csv_for_sbert_training(onet_data_src)
    if not sbert_train_samples:
        logger.error("SBERT fine-tuning failed: No ONET data processed.")
        return False

    # Mixed precision is only requested when a CUDA device is present, so the
    # pipeline can also run on CPU-only runtimes.
    use_amp = torch.cuda.is_available()

    # --- Stage 1: TSDAE Pre-training ---
    logger.info("--- Starting Stage 1: TSDAE Pre-training ---")
    logger.info(f"Defining SentenceTransformer model for TSDAE with base: {base_model}")
    word_embedding_model = models.Transformer(base_model, max_seq_length=TSDAE_MAX_SEQ_LENGTH)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='mean')
    tsdae_train_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    tsdae_dataset = DenoisingAutoEncoderDataset(train_sentences_tsdae)
    tsdae_dataloader = DataLoader(tsdae_dataset, batch_size=TSDAE_BATCH_SIZE, shuffle=True)
    tsdae_loss = losses.DenoisingAutoEncoderLoss(model=tsdae_train_model, decoder_name_or_path=base_model, tie_encoder_decoder=True)

    logger.info(f"Training TSDAE model for {TSDAE_EPOCHS} epoch(s)... (This is time-consuming)")
    tsdae_train_model.fit(
        train_objectives=[(tsdae_dataloader, tsdae_loss)], epochs=TSDAE_EPOCHS,
        weight_decay=0, scheduler='WarmupLinear', optimizer_params={'lr': TSDAE_LEARNING_RATE},
        warmup_steps=100, show_progress_bar=True, use_amp=use_amp
    )

    # The stage-1 encoder is written to a temporary directory and reloaded from
    # there as the starting point of stage 2.
    temp_tsdae_output_path = '/content/temp_tsdae_model'
    os.makedirs(temp_tsdae_output_path, exist_ok=True)
    tsdae_train_model.save(temp_tsdae_output_path)
    logger.info(f"TSDAE pre-training complete. Intermediate model saved to: {temp_tsdae_output_path}")

    # --- Stage 2: SBERT Fine-tuning ---
    logger.info("--- Starting Stage 2: SBERT Fine-tuning ---")
    sbert_model_to_finetune = SentenceTransformer(temp_tsdae_output_path)

    # Warm up over the first 10% of the (full-batch) training steps.
    num_train_steps_sbert = len(sbert_train_samples) // SBERT_BATCH_SIZE * SBERT_EPOCHS
    sbert_warmup_steps = int(0.1 * num_train_steps_sbert) if num_train_steps_sbert > 0 else 0

    sbert_train_dataloader_mnrl = DataLoader(sbert_train_samples, shuffle=True, batch_size=SBERT_BATCH_SIZE)
    sbert_loss_mnrl = losses.MultipleNegativesRankingLoss(model=sbert_model_to_finetune)

    logger.info(f"Fine-tuning SBERT model for {SBERT_EPOCHS} epoch(s)... (This is time-consuming)")
    sbert_model_to_finetune.fit(
        train_objectives=[(sbert_train_dataloader_mnrl, sbert_loss_mnrl)], epochs=SBERT_EPOCHS,
        warmup_steps=sbert_warmup_steps, optimizer_params={'lr': SBERT_LEARNING_RATE},
        weight_decay=0.01, show_progress_bar=True, use_amp=use_amp, save_best_model=False
    )

    os.makedirs(final_save_path, exist_ok=True)
    sbert_model_to_finetune.save(final_save_path)
    logger.info(f"Model training complete! Fine-tuned model saved to: {final_save_path}")
    return True


In [6]:
  # Download the NLTK 'punkt_tab' tokenizer data before training.
  # NOTE(review): presumably required by the TSDAE denoising dataset - confirm.
  import nltk
  nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
# Decide whether a trained model is already on disk. Recent sentence-transformers
# releases store the weights as "model.safetensors" instead of "pytorch_model.bin",
# so either file counts; checking only the legacy name would retrain on every run.
_weight_file_paths = [
    os.path.join(DEFAULT_TRAINED_MODEL_OUTPUT_DIR, weights_name)
    for weights_name in ("model.safetensors", "pytorch_model.bin")
]
# First weight file that exists; falls back to the legacy path when none does.
model_file_check_path = next(
    (path for path in _weight_file_paths if os.path.exists(path)),
    _weight_file_paths[-1],
)
# Stays None when loading or training fails; later cells check for that.
model = None

if os.path.exists(model_file_check_path):
    logger.info(f"Found existing fine-tuned model at: {DEFAULT_TRAINED_MODEL_OUTPUT_DIR}. Loading model...")
    try:
        model = SentenceTransformer(DEFAULT_TRAINED_MODEL_OUTPUT_DIR)
        logger.info("Model loaded successfully!")
    except Exception as e:
        logger.error(f"Error loading existing model: {e}")
        model = None
else:
    logger.warning(f"Trained model not found at '{DEFAULT_TRAINED_MODEL_OUTPUT_DIR}'.")
    logger.info("Starting training process...")
    try:
        training_successful = train_model_pipeline(
            DEFAULT_JOBS_CSV_SOURCE,
            DEFAULT_ONET_CSV_SOURCE,
            BASE_MODEL_NAME_FOR_TRAINING,
            DEFAULT_TRAINED_MODEL_OUTPUT_DIR
        )
        if training_successful:
            logger.info("Loading newly trained model...")
            model = SentenceTransformer(DEFAULT_TRAINED_MODEL_OUTPUT_DIR)
            logger.info("Newly trained model loaded successfully!")
        else:
            logger.error("Model training failed.")
    except Exception as e_pipe:
        # Top-level boundary of the notebook: log the full traceback and leave
        # `model` as None rather than aborting the cell.
        logger.error(f"An uncaught error occurred during the training pipeline: {e_pipe}")
        logger.error(traceback.format_exc())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Some weights of BertLMHeadModel were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.se

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33madinplb[0m ([33madinplb-universitas-gadjah-mada-library[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


In [8]:
# Demo: rank the job corpus against one free-text query using cosine similarity.
# `model` is None when loading/training failed, so test identity explicitly rather
# than relying on the truthiness of the model object.
if model is not None:
    logger.info("\n--- Starting Job Recommendation Example ---")

    # 1. Load and process the job corpus data
    jobs_df_original, job_corpus_texts = process_jobs_csv_for_tsdae(DEFAULT_JOBS_CSV_SOURCE)

    if jobs_df_original is not None and job_corpus_texts:
        logger.info(f"Using a corpus of {len(job_corpus_texts)} job descriptions for recommendations.")

        # 2. Encode the corpus
        logger.info("Encoding the job corpus... This might take a while.")
        corpus_embeddings = model.encode(job_corpus_texts, convert_to_tensor=True, show_progress_bar=True)
        logger.info("Corpus encoding complete.")

        # 3. Define user query and encode it
        user_query = "Seeking a senior software engineer role specializing in backend development with Python, Django, and cloud services like AWS."
        logger.info(f"User Query: '{user_query}'")
        query_embedding = model.encode(user_query, convert_to_tensor=True)

        # 4. Find the top N most similar jobs (fewer when the corpus is smaller)
        top_n = 10
        num_results = min(top_n, len(job_corpus_texts))
        cosine_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cosine_scores, k=num_results)

        # 5. Display the results; the header states how many rows are actually shown
        print("\n" + "="*50)
        print(f"Top {num_results} Job Recommendations")
        print("="*50)

        results_list = []
        for rank, (score, idx) in enumerate(zip(top_results.values, top_results.indices), start=1):
            # Corpus texts are in DataFrame row order, so the tensor index is the row position.
            original_job_series = jobs_df_original.iloc[idx.item()]

            # Leave out missing city/state values instead of printing "nan".
            location_parts = [
                str(part)
                for part in (original_job_series.get('City'), original_job_series.get('State.Name'))
                if pd.notna(part) and str(part).strip()
            ]

            results_list.append({
                "Rank": rank,
                "Score": f"{score.item():.4f}",
                "Title": original_job_series.get('Title', 'N/A'),
                "Company": original_job_series.get('Company', 'N/A'),
                "Location": ", ".join(location_parts)
            })

        # Display as a DataFrame for clean output
        recommendations_df = pd.DataFrame(results_list)
        print(recommendations_df.to_string())
        print("="*50 + "\n")

    else:
        logger.error("Could not load job corpus for recommendation.")
else:
    logger.error("Model is not available. Cannot perform recommendations.")

Batches:   0%|          | 0/125 [00:00<?, ?it/s]


Top 10 Job Recommendations
   Rank   Score                                                                            Title                          Company               Location
0     1  0.5668                                        Python Developer @ Paladin Consulting Inc           Paladin Consulting Inc  San Ramon, California
1     2  0.5513                         Professional Skilled Temporary - Software Engineer @ DST                              DST   Southfield, Michigan
2     3  0.5323                              Cloud (AWS) Architect/Lead @ Paladin Consulting Inc           Paladin Consulting Inc  San Ramon, California
3     4  0.5255                            Associate Software Engineer (Multiple) @ DealerSocket                     DealerSocket   Salt Lake City, Utah
4     5  0.5249                                  Java Developer @ Staffing Solutions Enterprises   Staffing Solutions Enterprises            Akron, Ohio
5     6  0.5243                              Software 

In [9]:
'''
# ==============================================================================
# CELL 1: INSTALLATIONS
# ==============================================================================
# Run this cell first to install the required libraries.
# !pip install -q sentence-transformers pandas torch

import logging
import os
from datetime import datetime
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, models, util, losses, InputExample
from sentence_transformers.datasets import DenoisingAutoEncoderDataset
import traceback

# ==============================================================================
# CELL 2: SETUP & CONFIGURATION
# ==============================================================================
# --- 0. Setup Logging ---
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# --- 1. Configuration ---
# Path in the Colab environment where the final model will be saved/loaded from.
# The script will train and save the model here if it doesn't exist.
DEFAULT_TRAINED_MODEL_OUTPUT_DIR = "/content/trained_job_recommender_model"

# Default paths to original CSV data files from GitHub
DEFAULT_JOBS_CSV_SOURCE = "https://raw.githubusercontent.com/adinplb/tsdae-embeddings/refs/heads/master/dataset/Filtered_Jobs_4000.csv"
DEFAULT_ONET_CSV_SOURCE = "https://raw.githubusercontent.com/adinplb/tsdae-embeddings/refs/heads/master/dataset/Occupation%20Data.csv"

# Base model for training
BASE_MODEL_NAME_FOR_TRAINING = 'sentence-transformers/all-MiniLM-L6-v2'

# Training Hyperparameters
TSDAE_EPOCHS = 1
TSDAE_BATCH_SIZE = 32  # Increase if you have a powerful GPU
TSDAE_LEARNING_RATE = 3e-5
TSDAE_MAX_SEQ_LENGTH = 256

SBERT_EPOCHS = 1
SBERT_BATCH_SIZE = 16
SBERT_LEARNING_RATE = 2e-5

# ==============================================================================
# CELL 3: HELPER FUNCTIONS FOR DATA PROCESSING
# ==============================================================================
def process_jobs_csv_for_tsdae(filepath_or_df):
    """
    Reads the jobs CSV, combines relevant text columns into single strings.
    Returns both the original DataFrame and the list of processed texts.
    """
    logger.info(f"Processing jobs data. Input type: {type(filepath_or_df)}")
    try:
        if isinstance(filepath_or_df, str):
            logger.info(f"Reading jobs CSV from: {filepath_or_df}")
            jobs_df = pd.read_csv(filepath_or_df)
        elif isinstance(filepath_or_df, pd.DataFrame):
            logger.info("Using provided DataFrame for jobs data.")
            jobs_df = filepath_or_df.copy()
        else:
            logger.error("Invalid input for jobs data: Expected filepath string or pandas DataFrame.")
            return None, []
    except Exception as e:
        logger.error(f"Error processing jobs data source {filepath_or_df}: {e}")
        return None, []

    columns_to_combine = [
        'Job.ID', 'Status', 'Title', 'Position', 'Company', 'City', 'State.Name',
        'Industry', 'Job.Description', 'Requirements', 'Salary', 'Employment.Type',
        'Education.Required'
    ]
    existing_columns = [col for col in columns_to_combine if col in jobs_df.columns]
    if not existing_columns:
        logger.error("No specified columns for TSDAE found in the jobs CSV/DataFrame.")
        return jobs_df.copy(), []

    logger.info(f"Combining columns for TSDAE: {existing_columns}")
    jobs_df_filled = jobs_df[existing_columns].fillna('').astype(str)
    processed_texts = jobs_df_filled.agg(' '.join, axis=1).tolist()
    cleaned_texts = [text.replace('\n', ' ').replace('\r', ' ') for text in processed_texts]

    logger.info(f"Processed {len(cleaned_texts)} job entries for TSDAE.")
    return jobs_df.copy(), cleaned_texts

def process_onet_csv_for_sbert_training(filepath_or_df):
    """
    Reads the ONET CSV and creates a list of InputExample objects.
    """
    logger.info(f"Processing ONET data. Input type: {type(filepath_or_df)}")
    examples = []
    try:
        if isinstance(filepath_or_df, str):
            onet_df = pd.read_csv(filepath_or_df)
        elif isinstance(filepath_or_df, pd.DataFrame):
            onet_df = filepath_or_df.copy()
        else:
            logger.error("Invalid input for ONET data.")
            return []
    except Exception as e:
        logger.error(f"Error processing ONET data source {filepath_or_df}: {e}")
        return []

    if 'Title' not in onet_df.columns or 'Description' not in onet_df.columns:
        logger.error("'Title' or 'Description' column not found in ONET CSV/DataFrame.")
        return []

    for _, row in onet_df.iterrows():
        title = str(row['Title']).replace('\n', ' ').replace('\r', ' ')
        description = str(row['Description']).replace('\n', ' ').replace('\r', ' ')
        examples.append(InputExample(texts=[title, description], label=1.0))
    logger.info(f"Processed {len(examples)} ONET entries for SBERT fine-tuning.")
    return examples

# ==============================================================================
# CELL 4: MODEL TRAINING PIPELINE FUNCTION
# ==============================================================================
def train_model_pipeline(jobs_data_src, onet_data_src, base_model, final_save_path):
    logger.info("--- MODEL TRAINING PIPELINE INITIATED ---")

    # --- Stage 1: TSDAE Pre-training ---
    logger.info("--- Starting Stage 1: TSDAE Pre-training ---")
    _, train_sentences_tsdae = process_jobs_csv_for_tsdae(jobs_data_src)
    if not train_sentences_tsdae:
        logger.error("TSDAE training failed: No job data processed.")
        return False

    logger.info(f"Defining SentenceTransformer model for TSDAE with base: {base_model}")
    word_embedding_model = models.Transformer(base_model, max_seq_length=TSDAE_MAX_SEQ_LENGTH)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='mean')
    tsdae_train_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    tsdae_dataset = DenoisingAutoEncoderDataset(train_sentences_tsdae)
    tsdae_dataloader = DataLoader(tsdae_dataset, batch_size=TSDAE_BATCH_SIZE, shuffle=True)
    tsdae_loss = losses.DenoisingAutoEncoderLoss(model=tsdae_train_model, decoder_name_or_path=base_model, tie_encoder_decoder=True)

    logger.info(f"Training TSDAE model for {TSDAE_EPOCHS} epoch(s)... (This is time-consuming)")
    tsdae_train_model.fit(
        train_objectives=[(tsdae_dataloader, tsdae_loss)], epochs=TSDAE_EPOCHS,
        weight_decay=0, scheduler='WarmupLinear', optimizer_params={'lr': TSDAE_LEARNING_RATE},
        warmup_steps=100, show_progress_bar=True, use_amp=True
    )

    # Define a temporary path for the intermediate model
    temp_tsdae_output_path = '/content/temp_tsdae_model'
    os.makedirs(temp_tsdae_output_path, exist_ok=True)
    tsdae_train_model.save(temp_tsdae_output_path)
    logger.info(f"TSDAE pre-training complete. Intermediate model saved to: {temp_tsdae_output_path}")

    # --- Stage 2: SBERT Fine-tuning ---
    logger.info("--- Starting Stage 2: SBERT Fine-tuning ---")
    sbert_model_to_finetune = SentenceTransformer(temp_tsdae_output_path)
    sbert_train_samples = process_onet_csv_for_sbert_training(onet_data_src)
    if not sbert_train_samples:
        logger.error("SBERT fine-tuning failed: No ONET data processed.")
        return False

    num_train_steps_sbert = len(sbert_train_samples) // SBERT_BATCH_SIZE * SBERT_EPOCHS
    sbert_warmup_steps = int(0.1 * num_train_steps_sbert) if num_train_steps_sbert > 0 else 0

    sbert_train_dataloader_mnrl = DataLoader(sbert_train_samples, shuffle=True, batch_size=SBERT_BATCH_SIZE)
    sbert_loss_mnrl = losses.MultipleNegativesRankingLoss(model=sbert_model_to_finetune)

    logger.info(f"Fine-tuning SBERT model for {SBERT_EPOCHS} epoch(s)... (This is time-consuming)")
    sbert_model_to_finetune.fit(
        train_objectives=[(sbert_train_dataloader_mnrl, sbert_loss_mnrl)], epochs=SBERT_EPOCHS,
        warmup_steps=sbert_warmup_steps, optimizer_params={'lr': SBERT_LEARNING_RATE},
        weight_decay=0.01, show_progress_bar=True, use_amp=True, save_best_model=False
    )

    os.makedirs(final_save_path, exist_ok=True)
    sbert_model_to_finetune.save(final_save_path)
    logger.info(f"Model training complete! Fine-tuned model saved to: {final_save_path}")
    return True

# ==============================================================================
# CELL 5: LOAD OR TRAIN THE MODEL
# ==============================================================================
model_file_check_path = os.path.join(DEFAULT_TRAINED_MODEL_OUTPUT_DIR, "pytorch_model.bin")
model = None

if os.path.exists(model_file_check_path):
    logger.info(f"Found existing fine-tuned model at: {DEFAULT_TRAINED_MODEL_OUTPUT_DIR}. Loading model...")
    try:
        model = SentenceTransformer(DEFAULT_TRAINED_MODEL_OUTPUT_DIR)
        logger.info("Model loaded successfully!")
    except Exception as e:
        logger.error(f"Error loading existing model: {e}")
        model = None
else:
    logger.warning(f"Trained model not found at '{DEFAULT_TRAINED_MODEL_OUTPUT_DIR}'.")
    logger.info("Starting training process...")
    try:
        training_successful = train_model_pipeline(
            DEFAULT_JOBS_CSV_SOURCE,
            DEFAULT_ONET_CSV_SOURCE,
            BASE_MODEL_NAME_FOR_TRAINING,
            DEFAULT_TRAINED_MODEL_OUTPUT_DIR
        )
        if training_successful:
            logger.info("Loading newly trained model...")
            model = SentenceTransformer(DEFAULT_TRAINED_MODEL_OUTPUT_DIR)
            logger.info("Newly trained model loaded successfully!")
        else:
            logger.error("Model training failed.")
    except Exception as e_pipe:
        logger.error(f"An uncaught error occurred during the training pipeline: {e_pipe}")
        logger.error(traceback.format_exc())

# ==============================================================================
# CELL 6: JOB RECOMMENDATION EXAMPLE
# ==============================================================================
if model:
    logger.info("\n--- Starting Job Recommendation Example ---")

    # 1. Load and process the job corpus data
    jobs_df_original, job_corpus_texts = process_jobs_csv_for_tsdae(DEFAULT_JOBS_CSV_SOURCE)

    if jobs_df_original is not None and job_corpus_texts:
        logger.info(f"Using a corpus of {len(job_corpus_texts)} job descriptions for recommendations.")

        # 2. Encode the corpus
        logger.info("Encoding the job corpus... This might take a while.")
        corpus_embeddings = model.encode(job_corpus_texts, convert_to_tensor=True, show_progress_bar=True)
        logger.info("Corpus encoding complete.")

        # 3. Define user query and encode it
        user_query = "Seeking a senior software engineer role specializing in backend development with Python, Django, and cloud services like AWS."
        logger.info(f"User Query: '{user_query}'")
        query_embedding = model.encode(user_query, convert_to_tensor=True)

        # 4. Find the top N most similar jobs
        top_n = 10
        cosine_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cosine_scores, k=min(top_n, len(job_corpus_texts)))

        # 5. Display the results
        print("\n" + "="*50)
        print(f"Top {top_n} Job Recommendations")
        print("="*50)

        results_list = []
        for i, (score, idx) in enumerate(zip(top_results.values, top_results.indices)):
            job_index = idx.item()
            original_job_series = jobs_df_original.iloc[job_index]

            results_list.append({
                "Rank": i + 1,
                "Score": f"{score.item():.4f}",
                "Title": original_job_series.get('Title', 'N/A'),
                "Company": original_job_series.get('Company', 'N/A'),
                "Location": f"{original_job_series.get('City', '')}, {original_job_series.get('State.Name', '')}"
            })

        # Display as a DataFrame for clean output
        recommendations_df = pd.DataFrame(results_list)
        print(recommendations_df.to_string())
        print("="*50 + "\n")

    else:
        logger.error("Could not load job corpus for recommendation.")
else:
    logger.error("Model is not available. Cannot perform recommendations.")

'''

