In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/job-recom-dataset/datascientist.json
/kaggle/input/job-recom-dataset/dataengineer.json
/kaggle/input/job-recom-dataset/phpdeveloper.json
/kaggle/input/job-recom-dataset/javadeveloper.json
/kaggle/input/job-recom-dataset/backenddeveloper.json


In [2]:
import pandas as pd
import json
import os

# 1. Define file paths
file_paths = [
    '/kaggle/input/job-recom-dataset/datascientist.json',
    '/kaggle/input/job-recom-dataset/dataengineer.json',
    '/kaggle/input/job-recom-dataset/phpdeveloper.json',
    '/kaggle/input/job-recom-dataset/javadeveloper.json',
    '/kaggle/input/job-recom-dataset/backenddeveloper.json'
]

all_dataframes = []

print("üöÄ Starting file processing...")

for path in file_paths:
    # Extract category name from filename (e.g., 'datascientist')
    filename = os.path.basename(path).replace('.json', '')
    print(f"\nReading file: {filename} ...")
    
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            
        # --- Intelligent Data Type Detection ---
        temp_df = None
        
        # Case 1: Data is a list of dictionaries [{}, {}, ...]
        if isinstance(data, list):
            print(f"   Data type: List - Items: {len(data)}")
            temp_df = pd.DataFrame(data)
            
        # Case 2: Data is a dictionary {"0": {}, "1": {}}
        elif isinstance(data, dict):
            print(f"   Data type: Dict - Keys: {len(data)}")
            
            # Check for nested 'root' structure if applicable
            if "root" in data: 
                 temp_df = pd.DataFrame(data['root'])
            else:
                 # Standard case: keys are indices, use orient='index'
                 temp_df = pd.DataFrame.from_dict(data, orient='index')
        
        else:
            print(f"   Unknown format: {type(data)}")
            continue

        # --- Post-processing the single DataFrame ---
        if temp_df is not None and not temp_df.empty:
            # Add a 'category' column to track the source
            temp_df['category'] = filename 
            
            # Reset index (prevents the "0", "1" keys from becoming a messy column)
            temp_df = temp_df.reset_index(drop=True)
            
            all_dataframes.append(temp_df)
            print(f"    Success! DataFrame shape: {temp_df.shape}")
        else:
            print("   DataFrame is empty or could not be created.")

    except Exception as e:
        print(f"   Error processing {filename}: {str(e)}")

# --- Final Consolidation ---
if all_dataframes:
    # Merge all dataframes into one
    full_df = pd.concat(all_dataframes, ignore_index=True)
    
    # Remove duplicates based on job description
    before_dedup = len(full_df)
    full_df = full_df.drop_duplicates(subset=['desc'])
    after_dedup = len(full_df)
    
    print("\n" + "="*40)
    print(f" Operation Complete! {before_dedup - after_dedup} duplicates removed.")
    print(f"üìäFinal dataset size: {after_dedup} rows")
    print("="*40)
    
    # Display the first few rows
    print(full_df.head())
else:
    print("\n‚ùå No data available to merge.")


üöÄ Starting file processing...

Reading file: datascientist ...
   Data type: List - Items: 32
    Success! DataFrame shape: (32, 7)

Reading file: dataengineer ...
   Data type: List - Items: 80
    Success! DataFrame shape: (80, 7)

Reading file: phpdeveloper ...
   Data type: List - Items: 8
    Success! DataFrame shape: (8, 7)

Reading file: javadeveloper ...
   Data type: List - Items: 20
    Success! DataFrame shape: (20, 7)

Reading file: backenddeveloper ...
   Data type: List - Items: 5
    Success! DataFrame shape: (5, 7)

 Operation Complete! 31 duplicates removed.
üìäFinal dataset size: 114 rows
                                                link location  \
0  https://sa.indeed.com/rc/clk?jk=02e091c1362581...   Dammam   
1  https://sa.indeed.com/rc/clk?jk=3e401a8fae9527...   Dammam   
2  https://sa.indeed.com/rc/clk?jk=1d4c0afb83c80a...   Dammam   
3  https://sa.indeed.com/rc/clk?jk=e8013b5fc20445...   Dammam   
4  https://sa.indeed.com/rc/clk?jk=939bb390f05510...   Je

In [3]:
full_df.tail(10)

Unnamed: 0,link,location,title,company,salary,desc,category
126,https://sa.indeed.com/rc/clk?jk=825b2c2507fd95...,Riyadh,Java Developer with Spring experience,Skyline Dynamics,,We are looking for a Java developer with the f...,javadeveloper
127,https://sa.indeed.com/company/DITRC/jobs/Odoo-...,Riyadh,Odoo /Python developer,DITRC,$Ÿ¶Ÿ¨Ÿ†Ÿ†Ÿ† ŸÑŸÉŸÑ ÿ¥Ÿáÿ±,We need to hire Odoo /python developer with 5+...,javadeveloper
132,https://sa.indeed.com/rc/clk?jk=97d0b0cc657202...,Riyadh,Senior Java Developer,2Soft Solutions,,We are hiring for one of our Information Techn...,javadeveloper
133,https://sa.indeed.com/rc/clk?jk=b4d41df0445355...,Riyadh,Android Development Teaching Assistant (Onsite...,CODING DOJO INC,,This is an onsite position to teach students i...,javadeveloper
134,https://sa.indeed.com/rc/clk?jk=7357256142f2a2...,Riyadh,"Solutions Architect AppDev- Riyadh, KSA",redhat,,About the job\nThe Red Hat Commercial Sales te...,javadeveloper
137,https://sa.indeed.com/rc/clk?jk=2850b8818d6bb2...,Riyadh,Developer,Encore Theme,,"Riyadh, Saudi Arabia\nTech Hiring\n2858154\nJo...",javadeveloper
138,https://sa.indeed.com/rc/clk?jk=f81b5f1c6e84ac...,Riyadh,Android Development Associate Instructor (Onsi...,CODING DOJO INC,,This is an onsite position to teach students i...,javadeveloper
139,https://sa.indeed.com/rc/clk?jk=7959b5d00feff1...,Riyadh,Pega Senior System Architect,Luxoft,,Project Description\nLuxoft is building a team...,javadeveloper
140,https://sa.indeed.com/rc/clk?jk=97e4b19d63954c...,Riyadh,Senior BackEnd Developer - Python,Zid,,Company Description\n\nWho we are?\n\nBecome a...,backenddeveloper
141,https://sa.indeed.com/rc/clk?jk=553f2002ecb39f...,Riyadh,Senior Backend Developer,Professional Recruitment,,Responsibilities:\nExperienced backend develop...,backenddeveloper


In [4]:
full_df.head(10)

Unnamed: 0,link,location,title,company,salary,desc,category
0,https://sa.indeed.com/rc/clk?jk=02e091c1362581...,Dammam,Chemist,Element Materials Technology,,Overview:\nElement has an opportunity for a Ch...,datascientist
1,https://sa.indeed.com/rc/clk?jk=3e401a8fae9527...,Dammam,Data Scientist,Halian,,Our Client\nWe are partnered with one of the l...,datascientist
2,https://sa.indeed.com/rc/clk?jk=1d4c0afb83c80a...,Dammam,Research Scientist,King Fahd University of Petroleum & Minerals,,The Applied Research Center for Environment & ...,datascientist
3,https://sa.indeed.com/rc/clk?jk=e8013b5fc20445...,Dammam,Administrator,Element Materials Technology,,Overview:\nElement has an opportunity for a Ad...,datascientist
4,https://sa.indeed.com/rc/clk?jk=939bb390f05510...,Jeddah,Data Scientist,Salla,,We are looking for a Data Scientist to design ...,datascientist
5,https://sa.indeed.com/rc/clk?jk=341a7df5a4122d...,Jeddah,Data Scientist,Halian,,Our Client\nWe are partnered with one of the l...,datascientist
6,https://sa.indeed.com/rc/clk?jk=fb99a740b741d4...,Jeddah,Senior Construction Manager,Scientific Research Corporation,,The new construction of site improvements and ...,datascientist
7,https://sa.indeed.com/rc/clk?jk=8c9f296de49be1...,Jeddah,Senior Environmental scientist,AECOM,,As a KSA Intermediate Terrestrial Ecologist II...,datascientist
8,https://sa.indeed.com/rc/clk?jk=2abff331d63906...,Riyadh,Secretary,Worley,,Company : Worley\nPrimary Location\n: SAU-ARD-...,datascientist
9,https://sa.indeed.com/rc/clk?jk=6324c7847644c4...,Riyadh,Project Engineer,Worley,,Company : Worley\nPrimary Location\n: SAU-ARD-...,datascientist


- using richer dataset!
- Collect Persian (Multilingual language)


In [5]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords

# 1. Download necessary NLTK resources
# (If you are running this offline, you might need to download these manually once)
nltk.download('stopwords')

# 2. Define the cleaning function
def clean_text(text):
    """
    Applies the following pre-processing steps:
    1. Lowercasing
    2. Removing HTML tags
    3. Removing non-ASCII characters
    4. Substitution (removing newlines/tabs)
    5. Removing punctuation
    6. Removing stop words
    """
    if not isinstance(text, str):
        return ""
    
    # Lowercasing
    text = text.lower()
    
    # Removing HTML tags (Regex to find <...>)
    text = re.sub(r'<.*?>', '', text)
    
    # Removing non-ASCII characters (Keep only standard characters)
    # This removes emojis or weird formatting characters
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    
    # Substitution (Replace newlines \n and tabs \t with a single space)
    text = re.sub(r'[\r\n\t]+', ' ', text)
    
    # Removing Punctuation
    # We replace punctuation with spaces to avoid merging words (e.g. "hello/world" -> "hello world")
    text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
    
    # F. Removing Stop Words
    stop_words = set(stopwords.words('english'))
    # Tokenize by splitting on whitespace
    words = text.split()
    # Filter out stop words
    filtered_words = [w for w in words if w not in stop_words]
    
    # Join back into a single string
    return " ".join(filtered_words)

# 3. Apply the function to the 'desc' column
print("üßπ Starting text cleaning process...")

# We create a NEW column 'cleaned_desc' to preserve the original data for comparison
full_df['cleaned_desc'] = full_df['desc'].apply(clean_text)

print("‚úÖ Text cleaning completed.")

# 4. Verify the results (Compare Before vs After)
print("\n" + "="*50)
print("üîç Comparison: Raw vs Cleaned")
print("="*50)

# displaying the first valid entry
sample_row = full_df.iloc[0]
print(f"--- ORIGINAL DESC ---\n{sample_row['desc'][:300]}...") # Show first 300 chars
print(f"\n--- CLEANED DESC ---\n{sample_row['cleaned_desc'][:300]}...")


üßπ Starting text cleaning process...
‚úÖ Text cleaning completed.

üîç Comparison: Raw vs Cleaned
--- ORIGINAL DESC ---
Overview:
Element has an opportunity for a Chemistry Technician for testing, sample preparation and delivery/collection of samples and preparation of test reports

This position will be based in Dammam , KSA

Responsibilities:
Analysis of water, soil, aggregates, concrete and scale samples ,operatin...

--- CLEANED DESC ---
overview element opportunity chemistry technician testing sample preparation delivery collection samples preparation test reports position based dammam ksa responsibilities analysis water soil aggregates concrete scale samples operating equipments like ph ec tds meters uv visible spectrophotometers ...


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
import spacy
from spacy.pipeline import EntityRuler
import pandas as pd

# 1. INITIALIZE A PRE-TRAINED MODEL
# instead of spacy.blank("en"), we load the small English model
# This helps because it already handles tokenization perfectly.
try:
    nlp = spacy.load("en_core_web_sm")
except:
    # Fallback if model isn't downloaded
    print("Downloading model...")
    import os
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

print("‚úÖ Base Model Loaded.")

# 2. DEFINE YOUR KNOWLEDGE BASE (The "Rules")
# We explicitly tell the AI what skills and roles look like.
# In a real production system, this list comes from a database.
skill_patterns = [
    # Programming Languages
    {"label": "SKILL", "pattern": [{"LOWER": "python"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "java"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "php"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "c++"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "sql"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "javascript"}]},
    
    # Frameworks & Tools
    {"label": "SKILL", "pattern": [{"LOWER": "aws"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "react"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "django"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "laravel"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "docker"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "kubernetes"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "tensorflow"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "pytorch"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "pandas"}]},

    # Concepts
    {"label": "SKILL", "pattern": [{"LOWER": "machine"}, {"LOWER": "learning"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "deep"}, {"LOWER": "learning"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "data"}, {"LOWER": "science"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "big"}, {"LOWER": "data"}]},
]

role_patterns = [
    {"label": "ROLE", "pattern": [{"LOWER": "data"}, {"LOWER": "scientist"}]},
    {"label": "ROLE", "pattern": [{"LOWER": "software"}, {"LOWER": "engineer"}]},
    {"label": "ROLE", "pattern": [{"LOWER": "backend"}, {"LOWER": "developer"}]},
    {"label": "ROLE", "pattern": [{"LOWER": "frontend"}, {"LOWER": "developer"}]},
    {"label": "ROLE", "pattern": [{"LOWER": "php"}, {"LOWER": "developer"}]},
    {"label": "ROLE", "pattern": [{"LOWER": "java"}, {"LOWER": "developer"}]},
]

degree_patterns = [
    {"label": "DEGREE", "pattern": [{"LOWER": "bachelor"}]},
    {"label": "DEGREE", "pattern": [{"LOWER": "master"}]},
    {"label": "DEGREE", "pattern": [{"LOWER": "phd"}]},
    {"label": "DEGREE", "pattern": [{"LOWER": "computer"}, {"LOWER": "science"}]},
]

# Combine all patterns
all_patterns = skill_patterns + role_patterns + degree_patterns

# 3. ADD THE RULER TO THE PIPELINE
# This inserts our rules before the standard NER, giving them priority.
if "entity_ruler" not in nlp.pipe_names:
    ruler = nlp.add_pipe("entity_ruler", before="ner")
    ruler.add_patterns(all_patterns)

print("‚úÖ Custom Rules Applied to Pipeline.")

# 4. APPLY TO DATASET
def extract_clean_entities(text):
    if not isinstance(text, str):
        return []
    
    # Process the text
    doc = nlp(text)
    
    # Filter: We only want entities that match OUR labels (SKILL, ROLE, DEGREE)
    # We ignore standard spaCy labels like "DATE" or "ORG" to reduce noise
    target_labels = ["SKILL", "ROLE", "DEGREE"]
    
    results = []
    for ent in doc.ents:
        if ent.label_ in target_labels:
            results.append((ent.text, ent.label_))
            
    return list(set(results)) # set() removes duplicates like ('Java', 'SKILL') appearing twice

print("\nüîç Extracting Entities from Job Descriptions...")
full_df['extracted_entities'] = full_df['cleaned_desc'].apply(extract_clean_entities)

# 5. VIEW VALIDATED RESULTS
print("\n" + "="*50)
print("VALIDATED ENTITY EXTRACTION RESULTS")
print("="*50)

# Show rows that actually have entities found
sample_rows = full_df[full_df['extracted_entities'].map(len) > 0].head(10)

for index, row in sample_rows.iterrows():
    print(f"CATEGORY: {row['category']}")
    print(f"FOUND:    {row['extracted_entities']}")
    print("-" * 30)

# Calculate Stats for the Paper
all_skills = [ent[0] for sublist in full_df['extracted_entities'] for ent in sublist if ent[1] == 'SKILL']
print(f"\nüìä STATS:")
print(f"Total Skills Identified: {len(all_skills)}")
print(f"Most Common Skills: {pd.Series(all_skills).value_counts().head(5).to_dict()}")


‚úÖ Base Model Loaded.
‚úÖ Custom Rules Applied to Pipeline.

üîç Extracting Entities from Job Descriptions...

VALIDATED ENTITY EXTRACTION RESULTS
CATEGORY: datascientist
FOUND:    [('data science', 'SKILL'), ('data scientist', 'ROLE'), ('python', 'SKILL'), ('phd', 'DEGREE'), ('sql', 'SKILL')]
------------------------------
CATEGORY: datascientist
FOUND:    [('python', 'SKILL')]
------------------------------
CATEGORY: datascientist
FOUND:    [('bachelor', 'DEGREE')]
------------------------------
CATEGORY: datascientist
FOUND:    [('data science', 'SKILL'), ('big data', 'SKILL'), ('tensorflow', 'SKILL'), ('data scientist', 'ROLE'), ('machine learning', 'SKILL'), ('deep learning', 'SKILL'), ('pandas', 'SKILL'), ('python', 'SKILL'), ('bachelor', 'DEGREE'), ('java', 'SKILL'), ('computer science', 'DEGREE')]
------------------------------
CATEGORY: datascientist
FOUND:    [('data science', 'SKILL'), ('data scientist', 'ROLE'), ('python', 'SKILL'), ('phd', 'DEGREE'), ('sql', 'SKILL')]
--