In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import pickle
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import time
import gc  # For garbage collection

# Fetch the NLTK resources used below: the tokenizer data packages
# ('punkt', 'punkt_tab') and the stopword corpus.
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

def clean_text(text):
    """Normalise raw text for downstream processing.

    The text is lowercased, HTML tags and punctuation are replaced by
    spaces, and whitespace runs are collapsed to a single space.

    Returns an empty string when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Applied strictly in this order: tags first (they contain punctuation),
    # then anything that is neither a word character nor whitespace, and
    # finally whitespace runs created by the previous two passes.
    substitutions = (
        (r'<.*?>', ' '),
        (r'[^\w\s]', ' '),
        (r'\s+', ' '),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Cache for stopword sets, keyed by language, so the corpus is read only once
# per process instead of once per preprocess_text() call.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopword set, loading it on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Clean ``text`` and drop stopwords and very short tokens.

    The text is first normalised with ``clean_text``. The result is split on
    whitespace, and every token that is an English stopword or has two or
    fewer characters is removed.

    Returns the surviving tokens joined by single spaces, or an empty string
    when nothing is left (or the input was not a non-empty string).
    """
    text = clean_text(text)
    if not text:
        return ""

    stops = _english_stopwords()
    # Check the cheap length test before the set lookup.
    words = [word for word in text.split() if len(word) > 2 and word not in stops]
    return ' '.join(words)

def tokenize_into_sentences(text):
    """Split ``text`` into sentences and clean each one.

    Sentence splitting is done on the (HTML-stripped) raw text, *before*
    punctuation is removed. Cleaning first would erase the very characters
    the sentence tokenizer uses as boundaries, so the whole document would
    come back as a single "sentence".

    Args:
        text: Raw document text. Non-string or empty input yields ``[]``.

    Returns:
        A list of preprocessed sentences (lowercased, punctuation and
        stopwords removed). Raw sentences of 10 characters or fewer and
        sentences that become empty after preprocessing are dropped.
    """
    if not isinstance(text, str) or not text:
        return []

    # Remove HTML tags up front (same pattern clean_text uses) so markup that
    # happens to contain '.' or '?' is not considered during splitting.
    raw_text = re.sub(r'<.*?>', ' ', text)
    if not raw_text.strip():
        return []

    sentences = []
    for sentence in sent_tokenize(raw_text):
        # Skip fragments too short to carry meaning (headings, stray tokens).
        if len(sentence.strip()) <= 10:
            continue
        cleaned = preprocess_text(sentence)
        # A sentence made only of stopwords/punctuation cleans down to "";
        # embedding an empty string would only add noise to the average.
        if cleaned:
            sentences.append(cleaned)
    return sentences

def extract_job_sections(text):
    """Pull named sections out of a job description with regex heuristics.

    For each known section header, the text following its first
    (case-insensitive) occurrence is captured up to the next recognised
    header keyword or the end of the text.

    Returns a dict mapping section name to the stripped captured text for
    every header that was found. When no section has any content (including
    when none was found), the original text is added under ``"full_text"``.
    Non-string or empty input yields ``{"full_text": ""}``.
    """
    if not (isinstance(text, str) and text):
        return {"full_text": ""}

    # (header, keywords that terminate that header's section)
    section_terminators = (
        ("responsibilities", ("requirements", "qualifications", "about", "benefits", "company")),
        ("requirements", ("responsibilities", "qualifications", "about", "benefits", "company")),
        ("qualifications", ("responsibilities", "requirements", "about", "benefits", "company")),
        ("skills", ("responsibilities", "requirements", "qualifications", "about", "benefits", "company")),
    )

    found = {}
    for section, next_sections in section_terminators:
        next_sections_pattern = "|".join(next_sections)
        # DOTALL lets a section body span multiple lines.
        section_re = re.compile(
            fr"(?i)(?:{section})\s*[:\-]*\s*(.*?)(?:(?:{next_sections_pattern})|$)",
            re.DOTALL,
        )
        hit = section_re.search(text)
        if hit is not None:
            found[section] = hit.group(1).strip()

    # Fall back to the whole text when nothing useful was extracted.
    if not any(found.values()):
        found["full_text"] = text

    return found

def get_document_embedding(sentences, model):
    """Return a document embedding as the mean of its sentence embeddings.

    Args:
        sentences: List of sentence strings to embed.
        model: Object exposing ``encode(sentences, show_progress_bar=...)``
            (e.g. a ``SentenceTransformer``). If it also exposes
            ``get_sentence_embedding_dimension()``, that value sizes the
            fallback vector.

    Returns:
        A 1-D array: the element-wise mean of the sentence embeddings, or an
        all-zero vector when ``sentences`` is empty.
    """
    if not sentences:
        # Size the zero vector from the model so it stays compatible when a
        # different model is plugged in; 384 (MiniLM-L6) is only the
        # last-resort default when the model cannot report its dimension.
        dim_getter = getattr(model, 'get_sentence_embedding_dimension', None)
        dim = dim_getter() if callable(dim_getter) else None
        return np.zeros(dim if dim else 384)

    # Get embeddings for each sentence
    embeddings = model.encode(sentences, show_progress_bar=False)

    # Average across sentences (axis 0) to get one vector per document
    return np.mean(embeddings, axis=0)

def batch_process_jobs(df, model, batch_size=100, start_idx=0, end_idx=None, checkpoint_every=10):
    """Embed job postings in batches, checkpointing results to disk.

    Rows are read positionally from ``df`` in slices of ``batch_size``. For
    each row the title, description and (if present) skills description are
    combined, split into sentences and embedded. Rows without a description,
    or whose text yields no usable sentences, are skipped.

    Args:
        df: DataFrame with at least ``job_id``, ``title`` and ``description``
            columns. ``skills_desc``, ``formatted_experience_level``,
            ``location``, ``remote_allowed`` and ``formatted_work_type`` are
            read when available.
        model: Embedding model passed through to ``get_document_embedding``.
        batch_size: Number of rows per batch.
        start_idx: Positional index of the first row to process.
        end_idx: Positional index one past the last row; defaults to
            ``len(df)``.
        checkpoint_every: Write a checkpoint pickle after this many batches.
            A falsy value disables periodic checkpoints; the checkpoint after
            the final batch is always written.

    Returns:
        A list of dicts, one per embedded job, holding the job id, title,
        embedding and selected metadata.
    """
    if end_idx is None:
        end_idx = len(df)

    job_data = []

    batch_starts = range(start_idx, end_idx, batch_size)
    for batch_no, i in enumerate(tqdm(batch_starts, desc="Processing job batches"), start=1):
        batch_end = min(i + batch_size, end_idx)
        batch_df = df.iloc[i:batch_end]

        for _, row in batch_df.iterrows():
            job_id = row['job_id']
            job_title = row['title']
            job_desc = row['description']

            # Include skills description if available
            skills_desc = row.get('skills_desc', '')

            if pd.isna(job_desc) or not job_desc:
                continue

            # A missing title would otherwise be interpolated as the literal
            # string "nan" and pollute the embedded text.
            title_text = "" if pd.isna(job_title) else job_title

            # Combine title, description, and skills (if available)
            combined_text = f"{title_text}\n\n{job_desc}"
            if skills_desc and not pd.isna(skills_desc):
                combined_text += f"\n\nSkills: {skills_desc}"

            # Process full text
            all_sentences = tokenize_into_sentences(combined_text)
            if not all_sentences:
                continue

            # Get the embedding for the full text
            full_embedding = get_document_embedding(all_sentences, model)

            # Store basic job info and embedding
            job_data.append({
                "job_id": job_id,
                "job_title": job_title,
                "embedding": full_embedding,
                "formatted_experience_level": row.get('formatted_experience_level', ''),
                "location": row.get('location', ''),
                "remote_allowed": row.get('remote_allowed', False),
                "work_type": row.get('formatted_work_type', '')
            })

        # Save intermediate results. The schedule is driven by the batch
        # counter rather than len(job_data): once any row is skipped, the
        # number of stored jobs stops being a multiple of the batch size and
        # a length-based check would never fire again.
        is_last_batch = batch_end == end_idx
        is_periodic = bool(checkpoint_every) and batch_no % checkpoint_every == 0
        if is_periodic or is_last_batch:
            # Each checkpoint holds everything processed so far (cumulative).
            temp_filename = f"job_embeddings_temp_{start_idx}_{batch_end}.pkl"
            with open(temp_filename, 'wb') as f:
                pickle.dump(job_data, f)
            print(f"Saved intermediate embeddings to {temp_filename} ({len(job_data)} jobs processed)")

        # Manual garbage collection to free memory
        gc.collect()

    return job_data

def main():
    """Run the end-to-end pipeline: load postings, embed them, save results.

    Reads ``postings.csv``, embeds every posting with the
    ``paraphrase-MiniLM-L6-v2`` sentence transformer via
    ``batch_process_jobs``, then writes the full records to
    ``job_embeddings.pkl`` and an id/embedding-only version to
    ``job_embeddings_compact.pkl``. Progress and timing are printed.
    """
    started_at = time.time()

    # --- Load data ---------------------------------------------------------
    print("Loading job postings dataset...")
    job_df = pd.read_csv('postings.csv')  # Replace with your actual filename

    print(f"Loaded {len(job_df)} job postings")
    print(f"Columns: {job_df.columns.tolist()}")

    # --- Load model --------------------------------------------------------
    print("Loading sentence transformer model...")
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Swap in another model name if desired

    # --- Embed -------------------------------------------------------------
    print("Processing jobs and creating embeddings...")

    max_jobs = len(job_df)  # Lower this (e.g. 1000) for a quick test run
    batch_size = 100  # Tune to the available RAM

    subset = job_df.head(max_jobs)
    job_data = batch_process_jobs(subset, model, batch_size)

    # --- Persist -----------------------------------------------------------
    print("Saving embeddings to file...")
    with open('job_embeddings.pkl', 'wb') as full_file:
        pickle.dump(job_data, full_file)

    # Compact variant: only what is needed for similarity lookups
    compact_job_data = []
    for job in job_data:
        compact_job_data.append({
            'job_id': job['job_id'],
            'embedding': job['embedding'],
        })

    with open('job_embeddings_compact.pkl', 'wb') as compact_file:
        pickle.dump(compact_job_data, compact_file)

    # --- Report ------------------------------------------------------------
    execution_time = time.time() - started_at
    print(f"Processing completed in {execution_time:.2f} seconds ({execution_time/60:.2f} minutes)")
    print(f"Generated embeddings for {len(job_data)} jobs")
    print(f"Saved embeddings to job_embeddings.pkl and job_embeddings_compact.pkl")

# Run the pipeline only when this module is executed as the main program,
# not when it is imported by other code.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Loading job postings dataset...
Loaded 19824 job postings
Columns: ['job_id', 'company_name', 'title', 'description', 'max_salary', 'pay_period', 'location', 'company_id', 'views', 'med_salary', 'min_salary', 'formatted_work_type', 'applies', 'original_listed_time', 'remote_allowed', 'job_posting_url', 'application_url', 'application_type', 'expiry', 'closed_time', 'formatted_experience_level', 'skills_desc', 'listed_time', 'posting_domain', 'sponsored', 'work_type', 'currency', 'compensation_type', 'normalized_salary', 'zip_code', 'fips']
Loading sentence transformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Processing jobs and creating embeddings...


Processing job batches:   5%|▌         | 10/199 [01:41<31:22,  9.96s/it]

Saved intermediate embeddings to job_embeddings_temp_0_1000.pkl (1000 jobs processed)


Processing job batches:  10%|█         | 20/199 [03:19<28:19,  9.49s/it]

Saved intermediate embeddings to job_embeddings_temp_0_2000.pkl (2000 jobs processed)


Processing job batches:  15%|█▌        | 30/199 [04:51<26:18,  9.34s/it]

Saved intermediate embeddings to job_embeddings_temp_0_3000.pkl (3000 jobs processed)


Processing job batches:  20%|██        | 40/199 [06:24<24:51,  9.38s/it]

Saved intermediate embeddings to job_embeddings_temp_0_4000.pkl (4000 jobs processed)


Processing job batches:  25%|██▌       | 50/199 [07:54<22:26,  9.04s/it]

Saved intermediate embeddings to job_embeddings_temp_0_5000.pkl (5000 jobs processed)


Processing job batches:  30%|███       | 60/199 [09:24<20:38,  8.91s/it]

Saved intermediate embeddings to job_embeddings_temp_0_6000.pkl (6000 jobs processed)


Processing job batches:  35%|███▌      | 70/199 [10:51<19:15,  8.96s/it]

Saved intermediate embeddings to job_embeddings_temp_0_7000.pkl (7000 jobs processed)


Processing job batches:  40%|████      | 80/199 [12:21<17:48,  8.98s/it]

Saved intermediate embeddings to job_embeddings_temp_0_8000.pkl (8000 jobs processed)


Processing job batches:  45%|████▌     | 90/199 [13:50<16:41,  9.19s/it]

Saved intermediate embeddings to job_embeddings_temp_0_9000.pkl (9000 jobs processed)


Processing job batches:  50%|█████     | 100/199 [15:23<15:42,  9.52s/it]

Saved intermediate embeddings to job_embeddings_temp_0_10000.pkl (10000 jobs processed)


Processing job batches:  55%|█████▌    | 110/199 [16:54<13:54,  9.38s/it]

Saved intermediate embeddings to job_embeddings_temp_0_11000.pkl (11000 jobs processed)


Processing job batches:  60%|██████    | 120/199 [18:27<12:22,  9.40s/it]

Saved intermediate embeddings to job_embeddings_temp_0_12000.pkl (12000 jobs processed)


Processing job batches:  65%|██████▌   | 130/199 [20:01<10:42,  9.31s/it]

Saved intermediate embeddings to job_embeddings_temp_0_13000.pkl (13000 jobs processed)


Processing job batches:  70%|███████   | 140/199 [21:35<09:23,  9.55s/it]

Saved intermediate embeddings to job_embeddings_temp_0_14000.pkl (14000 jobs processed)


Processing job batches:  75%|███████▌  | 150/199 [23:08<07:47,  9.53s/it]

Saved intermediate embeddings to job_embeddings_temp_0_15000.pkl (15000 jobs processed)


Processing job batches:  80%|████████  | 160/199 [24:45<06:17,  9.67s/it]

Saved intermediate embeddings to job_embeddings_temp_0_16000.pkl (16000 jobs processed)


Processing job batches:  85%|████████▌ | 170/199 [26:18<04:31,  9.36s/it]

Saved intermediate embeddings to job_embeddings_temp_0_17000.pkl (17000 jobs processed)


Processing job batches:  90%|█████████ | 180/199 [27:50<02:54,  9.18s/it]

Saved intermediate embeddings to job_embeddings_temp_0_18000.pkl (18000 jobs processed)


Processing job batches:  95%|█████████▌| 190/199 [29:23<01:22,  9.20s/it]

Saved intermediate embeddings to job_embeddings_temp_0_19000.pkl (19000 jobs processed)


Processing job batches: 100%|██████████| 199/199 [30:41<00:00,  9.25s/it]

Saved intermediate embeddings to job_embeddings_temp_0_19824.pkl (19824 jobs processed)
Saving embeddings to file...





Processing completed in 1852.33 seconds (30.87 minutes)
Generated embeddings for 19824 jobs
Saved embeddings to job_embeddings.pkl and job_embeddings_compact.pkl
