In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
import pandas as pd
import json
import os

# 1. Define file paths
file_paths = [
    '/kaggle/input/job-recom-dataset/datascientist.json',
    '/kaggle/input/job-recom-dataset/dataengineer.json',
    '/kaggle/input/job-recom-dataset/phpdeveloper.json',
    '/kaggle/input/job-recom-dataset/javadeveloper.json',
    '/kaggle/input/job-recom-dataset/backenddeveloper.json'
]

all_dataframes = []

print("üöÄ Starting file processing...")

for path in file_paths:
    # Extract category name from filename (e.g., 'datascientist')
    filename = os.path.basename(path).replace('.json', '')
    print(f"\nReading file: {filename} ...")
    
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            
        # --- Intelligent Data Type Detection ---
        temp_df = None
        
        # Case 1: Data is a list of dictionaries [{}, {}, ...]
        if isinstance(data, list):
            print(f"   Data type: List - Items: {len(data)}")
            temp_df = pd.DataFrame(data)
            
        # Case 2: Data is a dictionary {"0": {}, "1": {}}
        elif isinstance(data, dict):
            print(f"   Data type: Dict - Keys: {len(data)}")
            
            # Check for nested 'root' structure if applicable
            if "root" in data: 
                 temp_df = pd.DataFrame(data['root'])
            else:
                 # Standard case: keys are indices, use orient='index'
                 temp_df = pd.DataFrame.from_dict(data, orient='index')
        
        else:
            print(f"   Unknown format: {type(data)}")
            continue

        # --- Post-processing the single DataFrame ---
        if temp_df is not None and not temp_df.empty:
            # Add a 'category' column to track the source
            temp_df['category'] = filename 
            
            # Reset index (prevents the "0", "1" keys from becoming a messy column)
            temp_df = temp_df.reset_index(drop=True)
            
            all_dataframes.append(temp_df)
            print(f"    Success! DataFrame shape: {temp_df.shape}")
        else:
            print("   DataFrame is empty or could not be created.")

    except Exception as e:
        print(f"   Error processing {filename}: {str(e)}")

# --- Final Consolidation ---
if all_dataframes:
    # Merge all dataframes into one
    full_df = pd.concat(all_dataframes, ignore_index=True)
    
    # Remove duplicates based on job description
    before_dedup = len(full_df)
    full_df = full_df.drop_duplicates(subset=['desc'])
    after_dedup = len(full_df)
    
    print("\n" + "="*40)
    print(f" Operation Complete! {before_dedup - after_dedup} duplicates removed.")
    print(f"üìäFinal dataset size: {after_dedup} rows")
    print("="*40)
    
    # Display the first few rows
    print(full_df.head())
else:
    print("\n‚ùå No data available to merge.")


üöÄ Starting file processing...

Reading file: datascientist ...
   Data type: List - Items: 32
    Success! DataFrame shape: (32, 7)

Reading file: dataengineer ...
   Data type: List - Items: 80
    Success! DataFrame shape: (80, 7)

Reading file: phpdeveloper ...
   Data type: List - Items: 8
    Success! DataFrame shape: (8, 7)

Reading file: javadeveloper ...
   Data type: List - Items: 20
    Success! DataFrame shape: (20, 7)

Reading file: backenddeveloper ...
   Data type: List - Items: 5
    Success! DataFrame shape: (5, 7)

 Operation Complete! 31 duplicates removed.
üìäFinal dataset size: 114 rows
                                                link location  \
0  https://sa.indeed.com/rc/clk?jk=02e091c1362581...   Dammam   
1  https://sa.indeed.com/rc/clk?jk=3e401a8fae9527...   Dammam   
2  https://sa.indeed.com/rc/clk?jk=1d4c0afb83c80a...   Dammam   
3  https://sa.indeed.com/rc/clk?jk=e8013b5fc20445...   Dammam   
4  https://sa.indeed.com/rc/clk?jk=939bb390f05510...   Je

In [10]:
full_df.tail(10)

Unnamed: 0,link,location,title,company,salary,desc,category,cleaned_desc
126,https://sa.indeed.com/rc/clk?jk=825b2c2507fd95...,Riyadh,Java Developer with Spring experience,Skyline Dynamics,,We are looking for a Java developer with the f...,javadeveloper,looking java developer following skills 4 6 ye...
127,https://sa.indeed.com/company/DITRC/jobs/Odoo-...,Riyadh,Odoo /Python developer,DITRC,$Ÿ¶Ÿ¨Ÿ†Ÿ†Ÿ† ŸÑŸÉŸÑ ÿ¥Ÿáÿ±,We need to hire Odoo /python developer with 5+...,javadeveloper,need hire odoo python developer 5 years experi...
132,https://sa.indeed.com/rc/clk?jk=97d0b0cc657202...,Riyadh,Senior Java Developer,2Soft Solutions,,We are hiring for one of our Information Techn...,javadeveloper,hiring one information technology services cli...
133,https://sa.indeed.com/rc/clk?jk=b4d41df0445355...,Riyadh,Android Development Teaching Assistant (Onsite...,CODING DOJO INC,,This is an onsite position to teach students i...,javadeveloper,onsite position teach students riyadh saudi ar...
134,https://sa.indeed.com/rc/clk?jk=7357256142f2a2...,Riyadh,"Solutions Architect AppDev- Riyadh, KSA",redhat,,About the job\nThe Red Hat Commercial Sales te...,javadeveloper,job red hat commercial sales team looking comm...
137,https://sa.indeed.com/rc/clk?jk=2850b8818d6bb2...,Riyadh,Developer,Encore Theme,,"Riyadh, Saudi Arabia\nTech Hiring\n2858154\nJo...",javadeveloper,riyadh saudi arabia tech hiring 2858154 job de...
138,https://sa.indeed.com/rc/clk?jk=f81b5f1c6e84ac...,Riyadh,Android Development Associate Instructor (Onsi...,CODING DOJO INC,,This is an onsite position to teach students i...,javadeveloper,onsite position teach students riyadh saudi ar...
139,https://sa.indeed.com/rc/clk?jk=7959b5d00feff1...,Riyadh,Pega Senior System Architect,Luxoft,,Project Description\nLuxoft is building a team...,javadeveloper,project description luxoft building team pega ...
140,https://sa.indeed.com/rc/clk?jk=97e4b19d63954c...,Riyadh,Senior BackEnd Developer - Python,Zid,,Company Description\n\nWho we are?\n\nBecome a...,backenddeveloper,company description become zider tech member j...
141,https://sa.indeed.com/rc/clk?jk=553f2002ecb39f...,Riyadh,Senior Backend Developer,Professional Recruitment,,Responsibilities:\nExperienced backend develop...,backenddeveloper,responsibilities experienced backend developer...


In [12]:
full_df.head(10)

Unnamed: 0,link,location,title,company,salary,desc,category,cleaned_desc
0,https://sa.indeed.com/rc/clk?jk=02e091c1362581...,Dammam,Chemist,Element Materials Technology,,Overview:\nElement has an opportunity for a Ch...,datascientist,overview element opportunity chemistry technic...
1,https://sa.indeed.com/rc/clk?jk=3e401a8fae9527...,Dammam,Data Scientist,Halian,,Our Client\nWe are partnered with one of the l...,datascientist,client partnered one largest technology organi...
2,https://sa.indeed.com/rc/clk?jk=1d4c0afb83c80a...,Dammam,Research Scientist,King Fahd University of Petroleum & Minerals,,The Applied Research Center for Environment & ...,datascientist,applied research center environment marine stu...
3,https://sa.indeed.com/rc/clk?jk=e8013b5fc20445...,Dammam,Administrator,Element Materials Technology,,Overview:\nElement has an opportunity for a Ad...,datascientist,overview element opportunity administrator joi...
4,https://sa.indeed.com/rc/clk?jk=939bb390f05510...,Jeddah,Data Scientist,Salla,,We are looking for a Data Scientist to design ...,datascientist,looking data scientist design develop machine ...
5,https://sa.indeed.com/rc/clk?jk=341a7df5a4122d...,Jeddah,Data Scientist,Halian,,Our Client\nWe are partnered with one of the l...,datascientist,client partnered one largest technology organi...
6,https://sa.indeed.com/rc/clk?jk=fb99a740b741d4...,Jeddah,Senior Construction Manager,Scientific Research Corporation,,The new construction of site improvements and ...,datascientist,new construction site improvements several hou...
7,https://sa.indeed.com/rc/clk?jk=8c9f296de49be1...,Jeddah,Senior Environmental scientist,AECOM,,As a KSA Intermediate Terrestrial Ecologist II...,datascientist,ksa intermediate terrestrial ecologist iii cor...
8,https://sa.indeed.com/rc/clk?jk=2abff331d63906...,Riyadh,Secretary,Worley,,Company : Worley\nPrimary Location\n: SAU-ARD-...,datascientist,company worley primary location sau ard riyadh...
9,https://sa.indeed.com/rc/clk?jk=6324c7847644c4...,Riyadh,Project Engineer,Worley,,Company : Worley\nPrimary Location\n: SAU-ARD-...,datascientist,company worley primary location sau ard riyadh...


- using richer dataset!
- Collect Persian (Multilingual language)


In [9]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords

# 1. Download necessary NLTK resources
# (If you are running this offline, you might need to download these manually once)
nltk.download('stopwords')

# 2. Define the cleaning function
def clean_text(text):
    """
    Applies the following pre-processing steps:
    1. Lowercasing
    2. Removing HTML tags
    3. Removing non-ASCII characters
    4. Substitution (removing newlines/tabs)
    5. Removing punctuation
    6. Removing stop words
    """
    if not isinstance(text, str):
        return ""
    
    # Lowercasing
    text = text.lower()
    
    # Removing HTML tags (Regex to find <...>)
    text = re.sub(r'<.*?>', '', text)
    
    # Removing non-ASCII characters (Keep only standard characters)
    # This removes emojis or weird formatting characters
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    
    # Substitution (Replace newlines \n and tabs \t with a single space)
    text = re.sub(r'[\r\n\t]+', ' ', text)
    
    # Removing Punctuation
    # We replace punctuation with spaces to avoid merging words (e.g. "hello/world" -> "hello world")
    text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
    
    # F. Removing Stop Words
    stop_words = set(stopwords.words('english'))
    # Tokenize by splitting on whitespace
    words = text.split()
    # Filter out stop words
    filtered_words = [w for w in words if w not in stop_words]
    
    # Join back into a single string
    return " ".join(filtered_words)

# 3. Apply the function to the 'desc' column
print("üßπ Starting text cleaning process...")

# We create a NEW column 'cleaned_desc' to preserve the original data for comparison
full_df['cleaned_desc'] = full_df['desc'].apply(clean_text)

print("‚úÖ Text cleaning completed.")

# 4. Verify the results (Compare Before vs After)
print("\n" + "="*50)
print("üîç Comparison: Raw vs Cleaned")
print("="*50)

# displaying the first valid entry
sample_row = full_df.iloc[0]
print(f"--- ORIGINAL DESC ---\n{sample_row['desc'][:300]}...") # Show first 300 chars
print(f"\n--- CLEANED DESC ---\n{sample_row['cleaned_desc'][:300]}...")


üßπ Starting text cleaning process...
‚úÖ Text cleaning completed.

üîç Comparison: Raw vs Cleaned
--- ORIGINAL DESC ---
Overview:
Element has an opportunity for a Chemistry Technician for testing, sample preparation and delivery/collection of samples and preparation of test reports

This position will be based in Dammam , KSA

Responsibilities:
Analysis of water, soil, aggregates, concrete and scale samples ,operatin...

--- CLEANED DESC ---
overview element opportunity chemistry technician testing sample preparation delivery collection samples preparation test reports position based dammam ksa responsibilities analysis water soil aggregates concrete scale samples operating equipments like ph ec tds meters uv visible spectrophotometers ...


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import spacy
import random
from spacy.training.example import Example
from spacy.util import minibatch, compounding

# DEFINE THE TRAINING DATA
# In a real scenario, this list would contain thousands of annotated examples from a tool like Dataturks.
# Format: (Sentence, {'entities': [(Start_Index, End_Index, LABEL)]})
TRAIN_DATA = [
    ("We need a Senior Java Developer with AWS experience.", {"entities": [(17, 21, "SKILL"), (37, 40, "SKILL"), (10, 21, "ROLE")]}),
    ("Looking for a Data Scientist proficient in Python and SQL.", {"entities": [(14, 28, "ROLE"), (43, 49, "SKILL"), (54, 57, "SKILL")]}),
    ("Must have a Bachelor degree in Computer Science.", {"entities": [(12, 27, "DEGREE"), (31, 47, "MAJOR")]}),
    ("Experience with Machine Learning and Deep Learning is a plus.", {"entities": [(16, 32, "SKILL"), (37, 50, "SKILL")]}),
    ("We are hiring a PHP Developer for our backend team.", {"entities": [(16, 29, "ROLE"), (38, 45, "SKILL")]}),
    ("Knowledge of React, Node.js and MongoDB required.", {"entities": [(13, 18, "SKILL"), (20, 27, "SKILL"), (32, 39, "SKILL")]}),
    ("Masters in Artificial Intelligence preferred.", {"entities": [(0, 7, "DEGREE"), (11, 34, "MAJOR")]}),
    ("Strong background in C++ and Java.", {"entities": [(21, 24, "SKILL"), (29, 33, "SKILL")]}),
    ("Junior Backend Engineer needed immediately.", {"entities": [(0, 23, "ROLE")]}),
    ("Proficiency in Tableau and PowerBI.", {"entities": [(15, 22, "SKILL"), (27, 34, "SKILL")]})
]

print("initializing Custom NER Training...")

# 2. INITIALIZE THE BLANK MODEL
# We create a blank English model to avoid bias from pre-trained entities
nlp = spacy.blank("en")

# Create the NER (Named Entity Recognition) pipeline stage
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Add the custom labels to the pipeline
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

print(f"üîπ Pipes in model: {nlp.pipe_names}")
print(f"üîπ Labels added: {ner.labels}")

# 3. THE TRAINING LOOP
# We disable other pipeline components (if any) to only train NER
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    
    print("\nRunning training epochs (Teaching the model)...")
    
    # Loop for 20 iterations (Epochs)
    for itn in range(20):
        random.shuffle(TRAIN_DATA)
        losses = {}
        
        # Create batches of data
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        
        for batch in batches:
            texts, annotations = zip(*batch)
            
            # Create Example objects (required for spaCy v3)
            example_batch = []
            for text, ann in zip(texts, annotations):
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, ann)
                example_batch.append(example)
            
            # Update the model
            nlp.update(
                example_batch,
                drop=0.5,  # Dropout to prevent overfitting
                losses=losses,
            )
        
        if (itn + 1) % 5 == 0:
            print(f"   Epoch {itn + 1}: Loss = {losses['ner']:.2f}")

print("‚úÖ Training Completed.")

# 4. APPLY TO OUR DATASET
# Now we use this 'smart' model on our actual 'cleaned_desc' column

def extract_entities(text):
    if not isinstance(text, str):
        return []
    
    doc = nlp(text)
    # Return a list of (Entity Text, Label)
    return [(ent.text, ent.label_) for ent in doc.ents]

print("\nüîç Applying NER to the Job Dataset...")

# Create a new column 'extracted_entities'
full_df['extracted_entities'] = full_df['desc'].apply(extract_entities)

# 5. INSPECT RESULTS
print("\n" + "="*50)
print("SAMPLE EXTRACTIONS")
print("="*50)
# Show 5 random rows with their extracted entities
sample_results = full_df[['category', 'extracted_entities']].head(10)
print(sample_results)

# (Optional) Check if we found any entities at all
count_found = full_df['extracted_entities'].apply(lambda x: len(x)).sum()
print(f"\nTotal entities found in first 5 rows (approx): {count_found}")
