In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/job-recom-dataset/datascientist.json
/kaggle/input/job-recom-dataset/dataengineer.json
/kaggle/input/job-recom-dataset/phpdeveloper.json
/kaggle/input/job-recom-dataset/javadeveloper.json
/kaggle/input/job-recom-dataset/backenddeveloper.json


In [2]:
import pandas as pd
import json
import os

# 1. Define file paths
DATA_DIR = '/kaggle/input/job-recom-dataset'
file_paths = [
    os.path.join(DATA_DIR, name)
    for name in (
        'datascientist.json',
        'dataengineer.json',
        'phpdeveloper.json',
        'javadeveloper.json',
        'backenddeveloper.json',
    )
]


def frame_from_json(data):
    """Turn a decoded JSON payload into a DataFrame.

    Parameters
    ----------
    data : object
        Result of ``json.load``. Supported shapes are a list (passed straight
        to ``pd.DataFrame``) and a dict (either wrapping the rows under a
        ``"root"`` key, or keyed by row label).

    Returns
    -------
    pandas.DataFrame or None
        The constructed frame, or ``None`` when ``data`` is neither a list
        nor a dict (a message naming the type is printed in that case).
    """
    # Case 1: Data is a list of dictionaries [{}, {}, ...]
    if isinstance(data, list):
        print(f"   Data type: List - Items: {len(data)}")
        return pd.DataFrame(data)

    # Case 2: Data is a dictionary {"0": {}, "1": {}}
    if isinstance(data, dict):
        print(f"   Data type: Dict - Keys: {len(data)}")
        if "root" in data:
            # Rows are nested under a 'root' key.
            return pd.DataFrame(data['root'])
        # Standard case: keys are row labels, so build the frame by index.
        return pd.DataFrame.from_dict(data, orient='index')

    print(f"   Unknown format: {type(data)}")
    return None


def load_job_file(path):
    """Read one JSON file and return its rows tagged with a ``category`` column.

    The category is the file name without its extension
    (e.g. ``'datascientist'``).

    Parameters
    ----------
    path : str
        Path of the JSON file to read (opened as UTF-8).

    Returns
    -------
    pandas.DataFrame or None
        A frame with a fresh 0..n-1 index and an added ``category`` column, or
        ``None`` when the file cannot be read/parsed, has an unsupported
        top-level type, or yields an empty frame. Every failure is reported
        with ``print`` rather than raised, so one bad file does not stop the
        remaining files from loading.
    """
    # splitext removes only the final extension; str.replace('.json', '')
    # would also strip '.json' occurring in the middle of a file name.
    category = os.path.splitext(os.path.basename(path))[0]
    print(f"\nReading file: {category} ...")

    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        frame = frame_from_json(data)
    except Exception as e:
        # Deliberately broad: loading is best-effort per file, and the error
        # is reported rather than silently swallowed.
        print(f"   Error processing {category}: {str(e)}")
        return None

    if frame is None:
        # Unsupported top-level type; frame_from_json already reported it.
        return None
    if frame.empty:
        print("   DataFrame is empty or could not be created.")
        return None

    # Tag the source and discard the original row labels (e.g. the "0", "1"
    # keys of a dict payload) so they do not leak into the merged table.
    frame = frame.assign(category=category).reset_index(drop=True)
    print(f"    Success! DataFrame shape: {frame.shape}")
    return frame


def drop_duplicate_descriptions(frame, column='desc'):
    """Drop rows whose ``column`` value repeats an earlier row's value.

    Rows where ``column`` is missing are always kept: pandas treats all NaN
    values as duplicates of one another, which would otherwise discard every
    posting without a description except the first.

    Parameters
    ----------
    frame : pandas.DataFrame
        The merged table.
    column : str, default 'desc'
        Column holding the text used to detect duplicates.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index labels. ``frame`` itself
        is returned unchanged when ``column`` is absent.
    """
    if column not in frame.columns:
        print(f"   Column '{column}' not found; skipping duplicate removal.")
        return frame
    values = frame[column]
    keep_mask = values.isna() | ~values.duplicated(keep='first')
    return frame[keep_mask]


print("🚀 Starting file processing...")

# Load every file; files that failed or were empty come back as None.
all_dataframes = [
    frame
    for frame in (load_job_file(path) for path in file_paths)
    if frame is not None
]

# --- Final Consolidation ---
if all_dataframes:
    # Merge all dataframes into one
    merged_df = pd.concat(all_dataframes, ignore_index=True)

    # Remove duplicates based on job description
    before_dedup = len(merged_df)
    full_df = drop_duplicate_descriptions(merged_df)
    after_dedup = len(full_df)

    print("\n" + "="*40)
    print(f" Operation Complete! {before_dedup - after_dedup} duplicates removed.")
    print(f"📊Final dataset size: {after_dedup} rows")
    print("="*40)

    # Display the first few rows
    print(full_df.head())
else:
    print("\n❌ No data available to merge.")


üöÄ Starting file processing...

Reading file: datascientist ...
   Data type: List - Items: 32
    Success! DataFrame shape: (32, 7)

Reading file: dataengineer ...
   Data type: List - Items: 80
    Success! DataFrame shape: (80, 7)

Reading file: phpdeveloper ...
   Data type: List - Items: 8
    Success! DataFrame shape: (8, 7)

Reading file: javadeveloper ...
   Data type: List - Items: 20
    Success! DataFrame shape: (20, 7)

Reading file: backenddeveloper ...
   Data type: List - Items: 5
    Success! DataFrame shape: (5, 7)

 Operation Complete! 31 duplicates removed.
üìäFinal dataset size: 114 rows
                                                link location  \
0  https://sa.indeed.com/rc/clk?jk=02e091c1362581...   Dammam   
1  https://sa.indeed.com/rc/clk?jk=3e401a8fae9527...   Dammam   
2  https://sa.indeed.com/rc/clk?jk=1d4c0afb83c80a...   Dammam   
3  https://sa.indeed.com/rc/clk?jk=e8013b5fc20445...   Dammam   
4  https://sa.indeed.com/rc/clk?jk=939bb390f05510...   Je

In [3]:
# Show the last rows of the merged, de-duplicated table.
TAIL_ROWS = 40
full_df.iloc[-TAIL_ROWS:]

Unnamed: 0,link,location,title,company,salary,desc,category
88,https://sa.indeed.com/rc/clk?jk=712c9a24040a41...,Riyadh,Customer Engineer,NCR,,About NCR\nNCR Corporation (NYSE: NCR) is a le...,dataengineer
89,https://sa.indeed.com/company/Arail-Constructi...,Riyadh,Electrical Technical Engineer,Arail Construction & Industrial Co. Ltd,,Job descriptions:\nCooperate with the construc...,dataengineer
91,https://sa.indeed.com/rc/clk?jk=2e7312f3bd9546...,Riyadh,IP Presales Engineer,NOKIA,,Come create the technology that helps the worl...,dataengineer
93,https://sa.indeed.com/rc/clk?jk=3e97a209470be7...,Riyadh,Senior Testing & Commissioning Engineer,Hitachi Energy,,Hitachi Energy is a pioneering technology lead...,dataengineer
94,https://sa.indeed.com/rc/clk?jk=62bcfc8064a57e...,Riyadh,System Business Analyst,Zid,,Company Description\n\nWho we are?\n\nBecome a...,dataengineer
95,https://sa.indeed.com/rc/clk?jk=d273f20337350a...,Riyadh,Network Engineer Lead,Lucid Motors,,Leading the future in luxury electric and mobi...,dataengineer
96,https://sa.indeed.com/rc/clk?jk=3112dc4f5fef43...,Riyadh,Chartered Mechanical Engineer,Rider Levett Bucknall,,Title: Chartered Mechanical Engineer\nDivision...,dataengineer
98,https://sa.indeed.com/rc/clk?jk=22f7eff5dfbeef...,Riyadh,Cost Engineer(Saudi only),Parsons,,Job Description:\nParsons is looking for an am...,dataengineer
99,https://sa.indeed.com/rc/clk?jk=9db6f831bae2a1...,Riyadh,Lead Infrastructure / Utilities,Parsons,,Job Description:\nLead Infrastructure / Utilit...,dataengineer
100,https://sa.indeed.com/rc/clk?jk=a835db12d924b9...,Riyadh,Condition Monitoring Engineer,Nomac,,Job Summary\nTo provide a specialized engineer...,dataengineer
