In [None]:
import pandas as pd
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px
from google.colab import drive

# --- 1. Mount Google Drive ---
# This will prompt you for authorization.
try:
    drive.mount('/content/drive')
except Exception as e:
    print(f"❌ Error mounting Google Drive: {e}")
    # `exit()` is the interactive-shell helper; the Python docs advise against
    # using it in programs, so stop the cell by raising SystemExit directly.
    raise SystemExit(1) from e
else:
    print("✅ Google Drive mounted successfully.")

# --- 2. Load and Prepare the Data ---
# The CSV is read from the current working directory, not from the mounted Drive.
try:
    df = pd.read_csv('qp_listings_new_30sep.csv')
except FileNotFoundError as e:
    print("❌ Error: 'qp_listings_new_30sep.csv' not found. Please upload the file.")
    raise SystemExit(1) from e
else:
    print("✅ Successfully loaded the dataset.")

# Pull the first sector ID out of the raw 'sectors' cell without parsing the JSON
def extract_sector_id(sector_str):
    """
    Return the digits of the first `"sectorID": "<digits>"` pair in `sector_str`.

    Only quoted, all-digit IDs are recognised. Returns None when the input is
    not a string (e.g. a NaN cell) or when no such pair is present.
    """
    if isinstance(sector_str, str):
        found = re.search(r'"sectorID":\s*"(\d+)"', sector_str)
        return found.group(1) if found else None
    return None

# Keep only rows that have a sector ID, a job role and a description.
df['sector_id'] = df['sectors'].apply(extract_sector_id)
df.dropna(subset=['sector_id', 'jobRole', 'jobRoleDesc'], inplace=True)
df['sector_id'] = df['sector_id'].astype(str)
print("\n📊 Sector counts:")
print(df['sector_id'].value_counts())

# Job title and description are clustered together as one document per row.
df['text_for_clustering'] = df['jobRole'].str.strip() + ' ' + df['jobRoleDesc'].str.strip()

# --- 3. Set up Output Directory in Google Drive ---
output_folder = '/content/drive/My Drive/Job_Cluster_Graphs'
os.makedirs(output_folder, exist_ok=True)
print(f"\n📁 Interactive graphs will be saved to: '{output_folder}'")

# --- 4. Perform Clustering and Generate Interactive Plots ---
# Loop-invariant settings are defined once instead of on every iteration.
N_CLUSTERS = 3
# Cluster labels are plotted as strings so Plotly treats them as categories;
# this list pins the legend order to 0, 1, 2, ...
CLUSTER_LABELS = [str(label) for label in range(N_CLUSTERS)]

unique_sectors = df['sector_id'].unique()
print(f"\n⚙️ Found {len(unique_sectors)} unique sectors to process...")

for sector in unique_sectors:
    print(f"\n--- Processing Sector ID: {sector} ---")
    sector_df = df[df['sector_id'] == sector].copy()

    # K-Means needs at least as many samples as clusters.
    if len(sector_df) < N_CLUSTERS:
        print(f"Skipping Sector ID {sector}: Not enough job roles ({len(sector_df)}).")
        continue

    # Vectorize text
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    try:
        X = vectorizer.fit_transform(sector_df['text_for_clustering'])
    except ValueError as e:
        # Raised when the vocabulary is empty (e.g. only stop words remain).
        print(f"Skipping Sector ID {sector}: Could not vectorize text ({e}).")
        continue

    # A 2-component PCA needs at least two features.
    if X.shape[1] < 2:
        print(f"Skipping Sector ID {sector}: Not enough distinct terms ({X.shape[1]}) for a 2-D projection.")
        continue

    # K-Means Clustering
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
    # Stored as strings: numeric labels would be drawn on a continuous colour
    # scale, which misrepresents unordered cluster IDs.
    sector_df['cluster'] = kmeans.fit_predict(X).astype(str)

    # PCA for dimensionality reduction (used for plotting only; clustering
    # is done on the full TF-IDF matrix above)
    pca = PCA(n_components=2, random_state=42)
    reduced_features = pca.fit_transform(X.toarray())
    sector_df['pca1'] = reduced_features[:, 0]
    sector_df['pca2'] = reduced_features[:, 1]

    # --- Create the Interactive Plot with Plotly ---
    fig = px.scatter(
        sector_df,
        x='pca1',
        y='pca2',
        color='cluster',  # Color points by cluster label (discrete)
        category_orders={'cluster': CLUSTER_LABELS},
        hover_name='jobRole',  # Show jobRole on hover
        title=f'Interactive Job Role Clusters for Sector ID: {sector}',
        labels={'pca1': 'Principal Component 1', 'pca2': 'Principal Component 2'}
    )

    fig.update_traces(marker=dict(size=10, opacity=0.8))

    # Display the plot in the Colab output
    print("Displaying interactive plot...")
    fig.show()

    # Save the plot as an HTML file in Google Drive
    plot_filename = f'sector_{sector}_interactive_clusters.html'
    file_path = os.path.join(output_folder, plot_filename)
    fig.write_html(file_path)
    print(f"✅ Interactive graph saved to Google Drive: '{plot_filename}'")

print("\n--- ✨ All sectors processed. ---")

Mounted at /content/drive
✅ Google Drive mounted successfully.
✅ Successfully loaded the dataset.

📊 Sector counts:
sector_id
13     241
1      224
24     178
11     140
39     123
17     115
36      99
29      80
38      80
26      79
22      79
40      78
31      73
23      70
20      66
16      65
9       62
12      62
6       62
25      55
35      55
33      53
4       48
30      46
10      39
112     39
19      38
37      36
2       32
3       30
21      29
84      23
27      21
32      20
46      17
15      16
18      16
14      16
5       15
111     12
102      5
119      2
101      1
117      1
118      1
Name: count, dtype: int64

📁 Interactive graphs will be saved to: '/content/drive/My Drive/Job_Cluster_Graphs'

⚙️ Found 45 unique sectors to process...

--- Processing Sector ID: 39 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_39_interactive_clusters.html'

--- Processing Sector ID: 32 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_32_interactive_clusters.html'

--- Processing Sector ID: 24 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_24_interactive_clusters.html'

--- Processing Sector ID: 38 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_38_interactive_clusters.html'

--- Processing Sector ID: 23 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_23_interactive_clusters.html'

--- Processing Sector ID: 1 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_1_interactive_clusters.html'

--- Processing Sector ID: 2 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_2_interactive_clusters.html'

--- Processing Sector ID: 33 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_33_interactive_clusters.html'

--- Processing Sector ID: 12 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_12_interactive_clusters.html'

--- Processing Sector ID: 16 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_16_interactive_clusters.html'

--- Processing Sector ID: 26 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_26_interactive_clusters.html'

--- Processing Sector ID: 36 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_36_interactive_clusters.html'

--- Processing Sector ID: 84 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_84_interactive_clusters.html'

--- Processing Sector ID: 22 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_22_interactive_clusters.html'

--- Processing Sector ID: 29 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_29_interactive_clusters.html'

--- Processing Sector ID: 13 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_13_interactive_clusters.html'

--- Processing Sector ID: 31 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_31_interactive_clusters.html'

--- Processing Sector ID: 18 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_18_interactive_clusters.html'

--- Processing Sector ID: 6 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_6_interactive_clusters.html'

--- Processing Sector ID: 11 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_11_interactive_clusters.html'

--- Processing Sector ID: 37 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_37_interactive_clusters.html'

--- Processing Sector ID: 9 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_9_interactive_clusters.html'

--- Processing Sector ID: 3 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_3_interactive_clusters.html'

--- Processing Sector ID: 46 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_46_interactive_clusters.html'

--- Processing Sector ID: 15 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_15_interactive_clusters.html'

--- Processing Sector ID: 17 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_17_interactive_clusters.html'

--- Processing Sector ID: 25 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_25_interactive_clusters.html'

--- Processing Sector ID: 30 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_30_interactive_clusters.html'

--- Processing Sector ID: 35 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_35_interactive_clusters.html'

--- Processing Sector ID: 40 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_40_interactive_clusters.html'

--- Processing Sector ID: 14 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_14_interactive_clusters.html'

--- Processing Sector ID: 19 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_19_interactive_clusters.html'

--- Processing Sector ID: 27 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_27_interactive_clusters.html'

--- Processing Sector ID: 20 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_20_interactive_clusters.html'

--- Processing Sector ID: 5 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_5_interactive_clusters.html'

--- Processing Sector ID: 10 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_10_interactive_clusters.html'

--- Processing Sector ID: 4 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_4_interactive_clusters.html'

--- Processing Sector ID: 21 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_21_interactive_clusters.html'

--- Processing Sector ID: 101 ---
Skipping Sector ID 101: Not enough job roles (1).

--- Processing Sector ID: 102 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_102_interactive_clusters.html'

--- Processing Sector ID: 111 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_111_interactive_clusters.html'

--- Processing Sector ID: 112 ---
Displaying interactive plot...


✅ Interactive graph saved to Google Drive: 'sector_112_interactive_clusters.html'

--- Processing Sector ID: 117 ---
Skipping Sector ID 117: Not enough job roles (1).

--- Processing Sector ID: 118 ---
Skipping Sector ID 118: Not enough job roles (1).

--- Processing Sector ID: 119 ---
Skipping Sector ID 119: Not enough job roles (2).

--- ✨ All sectors processed. ---


Getting Subsectors for all Sectors

In [None]:
# 1. Install necessary libraries
!pip install sentence-transformers pandas scikit-learn numpy

# 2. Import libraries and define the processing function
import pandas as pd
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def process_sectors_data(df):
    """
    Processes sectors data using a local SentenceTransformer model to match
    job roles to the most relevant subsector.

    Args:
        df: DataFrame with a JSON-encoded 'sectors' column plus 'jobRole' and
            'jobRoleDesc' columns. It is modified in place: a
            'sectors_subsectors' column is added.

    Side effects:
        Writes 'sector_subsector_mapping.csv' and 'updated_data.csv' to the
        current working directory.

    Rows whose 'sectors' cell cannot be parsed as a JSON object are skipped
    and keep an empty 'sectors_subsectors' value.
    """
    # --- Step 1: Parse the 'sectors' column and create the sector-subsector mapping ---
    print("Parsing sectors and creating mapping...")
    sector_subsector_mapping = []
    for raw_sectors in df['sectors']:
        try:
            sector_data = json.loads(raw_sectors)
            sector_id = sector_data.get('sectorID')
            sector_name = sector_data.get('sectorName')
            sub_sectors = sector_data.get('subSectors')
            if sub_sectors:
                for subsector in sub_sectors:
                    sector_subsector_mapping.append({
                        'sector': sector_name, 'sector_id': sector_id,
                        'subsector': subsector.get('subSectorName'), 'subsector_id': subsector.get('subSectorID')
                    })
            else:
                sector_subsector_mapping.append({
                    'sector': sector_name, 'sector_id': sector_id,
                    'subsector': None, 'subsector_id': None
                })
        # TypeError covers non-string cells such as NaN; AttributeError covers
        # JSON values that are not objects (lists, strings, numbers).
        except (json.JSONDecodeError, TypeError, AttributeError):
            continue

    # Explicit columns keep the frame well-formed even when no row could be parsed.
    mapping_df = pd.DataFrame(
        sector_subsector_mapping,
        columns=['sector', 'sector_id', 'subsector', 'subsector_id'],
    ).drop_duplicates().reset_index(drop=True)
    mapping_df.to_csv('sector_subsector_mapping.csv', index=False)
    print("✅ Created 'sector_subsector_mapping.csv'")

    # Create a dictionary for easy lookup of subsectors by sector name
    sector_to_subsectors = {
        sector: sub_df[['subsector', 'subsector_id']]
        .rename(columns={'subsector': 'subsector_name'})
        .to_dict('records')
        for sector, sub_df in mapping_df.dropna(subset=['subsector']).groupby('sector')
    }

    # --- Step 2: Load the embedding model ---
    print("Loading the embedding model (this might take a moment)...")
    # This model runs locally in your Colab session
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("✅ Model loaded successfully.")

    # Sub-sector names per sector never change, so each sector's candidate
    # embeddings are encoded once and reused for every row of that sector.
    subsector_embeddings_by_sector = {}

    def get_best_subsector(job_role, job_desc, sector_name):
        """Return the known sub-sector record of `sector_name` closest to the job text, or None."""
        if sector_name not in sector_to_subsectors:
            return None

        subsectors = sector_to_subsectors[sector_name]
        # With a single candidate the argmax is always that candidate.
        if len(subsectors) == 1:
            return subsectors[0]

        if sector_name not in subsector_embeddings_by_sector:
            subsector_embeddings_by_sector[sector_name] = model.encode(
                [s['subsector_name'] for s in subsectors]
            )

        # Create the job embedding locally
        job_embedding = model.encode([f"Job Role: {job_role}, Description: {job_desc}"])

        # Find the best match
        similarities = cosine_similarity(job_embedding, subsector_embeddings_by_sector[sector_name])
        return subsectors[int(np.argmax(similarities))]

    # --- Step 3: Create the final 'sectors_subsectors' column ---
    print("Processing rows to find best sub-sector matches...")
    df['sectors_subsectors'] = ''
    for index, row in df.iterrows():
        # Only the parsing is guarded; errors in matching or serialisation
        # should surface instead of silently leaving the row blank.
        try:
            sector_data = json.loads(row['sectors'])
            needs_match = not sector_data.get('subSectors')
            sector_name = sector_data.get('sectorName')
        except (json.JSONDecodeError, TypeError, AttributeError):
            continue

        if needs_match:
            best_subsector = get_best_subsector(row['jobRole'], row['jobRoleDesc'], sector_name)
            if best_subsector:
                sector_data['subSectors'] = [{'subSectorID': best_subsector['subsector_id'], 'subSectorName': best_subsector['subsector_name']}]
        df.at[index, 'sectors_subsectors'] = json.dumps(sector_data)

    df.to_csv('updated_data.csv', index=False)
    print("✅ Created 'updated_data.csv'")

# 4. Load your data and run the process
# Use your specified filename
input_filename = "qp_listings_new_30sep.csv"
try:
    df = pd.read_csv(input_filename)
except FileNotFoundError:
    print(f"❌ Error: File not found. Please make sure your input file is named '{input_filename}' and is uploaded to your Colab session.")
else:
    # Runs outside the `try` so a FileNotFoundError raised during processing
    # is not misreported as a missing input file.
    print(f"Successfully loaded '{input_filename}'. Starting processing...")
    process_sectors_data(df)
    print("\n🎉 Processing complete!")

Successfully loaded 'qp_listings_new_30sep.csv'. Starting processing...
Parsing sectors and creating mapping...
✅ Created 'sector_subsector_mapping.csv'
Loading the embedding model (this might take a moment)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model loaded successfully.
Processing rows to find best sub-sector matches...
✅ Created 'updated_data.csv'

🎉 Processing complete!


In [None]:
# 1. Install necessary libraries
# !pip install sentence-transformers pandas scikit-learn numpy

# 2. Import libraries and define the processing function
import pandas as pd
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def process_sectors_data_updated(df):
    """
    Processes sectors data with updated logic:
    1. Groups sectors by unique sectorID.
    2. Creates a default sub-sector for sectors that have none.

    Args:
        df: DataFrame with a JSON-encoded 'sectors' column plus 'jobRole' and
            'jobRoleDesc' columns. It is modified in place: a
            'sectors_subsectors' column is added.

    Side effects:
        Writes 'sector_subsector_mapping.csv' and 'sector_subsector_all.csv'
        to the current working directory.

    Rows whose 'sectors' cell cannot be parsed as a JSON object are skipped
    and keep an empty 'sectors_subsectors' value.
    """
    # --- Step 1: Build a comprehensive map based on unique sectorID ---
    print("Building a comprehensive map based on unique sectorID...")
    sectors_map = {}
    for raw_sectors in df['sectors']:
        try:
            data = json.loads(raw_sectors)
            sector_id = data.get('sectorID')
            if not sector_id:
                continue

            # If sectorID is new, add it to the map with its name; the first
            # name seen for an ID becomes the name used for that ID throughout.
            if sector_id not in sectors_map:
                sectors_map[sector_id] = {
                    'sectorName': data.get('sectorName'),
                    'subSectors': set()
                }

            # Add any sub-sectors from this row to the set (avoids duplicates)
            for sub in data.get('subSectors') or []:
                sub_id, sub_name = sub.get('subSectorID'), sub.get('subSectorName')
                if sub_id and sub_name:
                    # Store as a tuple to make it hashable for the set
                    sectors_map[sector_id]['subSectors'].add((sub_id, sub_name))
        # TypeError covers non-string cells such as NaN; AttributeError covers
        # JSON values that are not objects.
        except (json.JSONDecodeError, TypeError, AttributeError):
            continue

    # --- Step 2: Create default sub-sectors where needed ---
    print("Creating default sub-sectors for sectors without any...")
    # NOTE(review): default IDs start at 100 and are not checked against the
    # sub-sector IDs already present in the data — confirm they cannot collide.
    default_sub_id_counter = 100
    for sector_id, details in sectors_map.items():
        if not details['subSectors']:  # Check if the sub-sector set is empty
            sector_name = details['sectorName']
            default_sub_id = str(default_sub_id_counter)
            details['subSectors'].add((default_sub_id, sector_name))
            default_sub_id_counter += 1

    # --- Step 3: Create the lookup and the final mapping CSV ---
    # Sets have no stable order, so sub-sectors are sorted (by string form, which
    # tolerates mixed ID types) to make the CSV and tie-breaking reproducible.
    sector_to_subsectors = {}
    sector_subsector_list = []
    for sector_id, details in sectors_map.items():
        ordered = sorted(details['subSectors'], key=lambda pair: (str(pair[0]), str(pair[1])))
        sector_to_subsectors[sector_id] = [
            {'subsector_name': sub_name, 'subsector_id': sub_id} for sub_id, sub_name in ordered
        ]
        for sub_id, sub_name in ordered:
            sector_subsector_list.append({
                'sector': details['sectorName'], 'sector_id': sector_id,
                'subsector': sub_name, 'subsector_id': sub_id
            })

    mapping_df = pd.DataFrame(
        sector_subsector_list,
        columns=['sector', 'sector_id', 'subsector', 'subsector_id'],
    )
    mapping_df.to_csv('sector_subsector_mapping.csv', index=False)
    print("✅ Created 'sector_subsector_mapping.csv' with new logic.")

    # --- Step 4: Load model and perform matching ---
    print("Loading the embedding model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("✅ Model loaded successfully.")

    # Candidate embeddings depend only on the sector, so each sector's
    # sub-sector names are encoded once instead of once per row.
    subsector_embeddings_by_sector = {}

    def get_best_subsector(job_role, job_desc, sector_id):
        """Return the sub-sector record of `sector_id` closest to the job text, or None."""
        if sector_id not in sector_to_subsectors:
            return None

        subsectors = sector_to_subsectors[sector_id]
        # With a single candidate (e.g. a default sub-sector) the argmax is always it.
        if len(subsectors) == 1:
            return subsectors[0]

        if sector_id not in subsector_embeddings_by_sector:
            subsector_embeddings_by_sector[sector_id] = model.encode(
                [s['subsector_name'] for s in subsectors]
            )

        job_embedding = model.encode([f"Job Role: {job_role}, Description: {job_desc}"])
        similarities = cosine_similarity(job_embedding, subsector_embeddings_by_sector[sector_id])
        return subsectors[int(np.argmax(similarities))]

    # --- Step 5: Create the final 'sectors_subsectors' column ---
    print("Processing rows to find best sub-sector matches...")
    df['sectors_subsectors'] = ''
    for index, row in df.iterrows():
        # Only the parsing is guarded; errors in matching or serialisation
        # should surface instead of silently leaving the row blank.
        try:
            sector_data = json.loads(row['sectors'])
            sector_id = sector_data.get('sectorID')
            row_sector_name = sector_data.get('sectorName')
            existing_subsectors = sector_data.get('subSectors')
        except (json.JSONDecodeError, TypeError, AttributeError):
            continue

        # Use the consistent per-ID name, falling back to the row's own name
        sector_name = sectors_map.get(sector_id, {}).get('sectorName', row_sector_name)

        # Reconstruct the JSON with the consistent sector name
        final_sector_json = {'sectorID': sector_id, 'sectorName': sector_name}

        if existing_subsectors:
            final_sector_json['subSectors'] = existing_subsectors
        else:
            best_subsector = get_best_subsector(row['jobRole'], row['jobRoleDesc'], sector_id)
            if best_subsector:
                final_sector_json['subSectors'] = [{'subSectorID': best_subsector['subsector_id'], 'subSectorName': best_subsector['subsector_name']}]

        df.at[index, 'sectors_subsectors'] = json.dumps(final_sector_json)

    # The success message is built from the same name that is written, so it
    # can no longer report a different file.
    output_filename = 'sector_subsector_all.csv'
    df.to_csv(output_filename, index=False)
    print(f"✅ Created '{output_filename}'")


# --- Main execution block ---
input_filename = "qp_listings_new_30sep.csv"
try:
    df = pd.read_csv(input_filename)
except FileNotFoundError:
    print(f"❌ Error: File not found. Please make sure your input file is named '{input_filename}' and is uploaded to your Colab session.")
else:
    # Runs outside the `try` so a FileNotFoundError raised during processing
    # is not misreported as a missing input file.
    print(f"Successfully loaded '{input_filename}'. Starting processing...")
    process_sectors_data_updated(df)
    print("\n🎉 Processing complete!")

Successfully loaded 'qp_listings_new_30sep.csv'. Starting processing...
Building a comprehensive map based on unique sectorID...
Creating default sub-sectors for sectors without any...
✅ Created 'sector_subsector_mapping.csv' with new logic.
Loading the embedding model...
✅ Model loaded successfully.
Processing rows to find best sub-sector matches...
✅ Created 'updated_data.csv'

🎉 Processing complete!


Matching with sectors and tags

In [None]:
import pandas as pd

try:
    # Load the sub-sector category table and the pairwise job similarity matrix.
    subsector_categories_df = pd.read_csv("Subsector_categories.csv")
    job_similarity_matrix_df = pd.read_parquet("job_similarity_matrix.parquet")

    # Attach sector/tag details for the first job of each pair
    # ('subsector_id1' -> 'subsector_id'), then suffix the new columns with _1
    # so they do not clash with the second merge.
    merged_df = job_similarity_matrix_df.merge(
        subsector_categories_df,
        how='left',
        left_on='subsector_id1',
        right_on='subsector_id',
    ).rename(columns={
        'Correct Sector Matched': 'Correct Sector Matched_1',
        'Tags': 'Tags_1'
    })

    # Same lookup for the second job of each pair ('subsector_id2'), suffixed _2.
    final_merged_df = merged_df.merge(
        subsector_categories_df,
        how='left',
        left_on='subsector_id2',
        right_on='subsector_id',
    ).rename(columns={
        'Correct Sector Matched': 'Correct Sector Matched_2',
        'Tags': 'Tags_2'
    })

    # Both merges brought in a 'subsector_id' key column; the second merge
    # suffixed the pair as _x/_y. Neither copy is needed in the output.
    final_merged_df = final_merged_df.drop(columns=['subsector_id_x', 'subsector_id_y'])

    # Persist the enriched matrix.
    output_filename = "job_similarity_matrix_with_tags.parquet"
    final_merged_df.to_parquet(output_filename)

    print(f"Successfully merged the data and saved it to '{output_filename}'")
    print("\nPreview of the final data:")
    print(final_merged_df.head())

except FileNotFoundError:
    print("Execution failed. Please ensure you have uploaded both 'Subsector_categories.csv' and 'job_similarity_matrix.parquet'.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Successfully merged the data and saved it to 'job_similarity_matrix_with_tags.parquet'

Preview of the final data:
                    Uniquecode1                       jobRole1  \
0  QG-04-AA-04192-2025-V2-AASSC  Airport Warehouse Coordinator   
1                     AAS/Q1001        Aerospace CNC Machinist   
2                     AAS/Q1001        Aerospace CNC Machinist   
3                     AAS/Q1001        Aerospace CNC Machinist   
4                     AAS/Q1001        Aerospace CNC Machinist   

                                        jobRoleDesc1  sector_id1  \
0  Airport Warehouse Coordinator is responsible f...           3   
1  This role primarily involves CNC machining of ...           3   
2  This role primarily involves CNC machining of ...           3   
3  This role primarily involves CNC machining of ...           3   
4  This role primarily involves CNC machining of ...           3   

             sector_name1  subsector_id1  \
0  Aerospace and Aviation          

Getting sector_pair_similarity

In [None]:
import pandas as pd

# Name the input once so the load call and the "not found" message cannot
# drift apart (the message previously named a different file).
input_filename = 'job_similarity_matrix_updated.parquet'

try:
    # Step 1: Load the Parquet file into a pandas DataFrame.
    df = pd.read_parquet(input_filename)

    # Step 2: Group the data by the pairs of sectors and their corresponding tags.
    # This will create a group for every unique combination of Sector_1, Tag_1, Sector_2, and Tag_2.
    # With pandas' default `dropna=True`, rows with a missing value in any of
    # these key columns are left out of the groups.
    grouped_data = df.groupby([
        'Correct Sector Matched_1',
        'Tags_1',
        'Correct Sector Matched_2',
        'Tags_2'
    ])

    # Step 3: Calculate the mean and median of 'Composite Similarity' for each group.
    # The .agg() function allows us to compute multiple statistics at once.
    # We use reset_index() to turn the grouped columns back into regular columns.
    sector_similarity_stats = grouped_data['Composite Similarity'].agg(['mean', 'median']).reset_index()

    # Step 4: Rename the columns to match your desired output format.
    # Renaming by name (not by position) keeps each label tied to its column.
    sector_similarity_stats = sector_similarity_stats.rename(columns={
        'Correct Sector Matched_1': 'Correct Sector Matchedx',
        'Tags_1': 'Tagsx',
        'Correct Sector Matched_2': 'Correct Sector Matchedy',
        'Tags_2': 'Tagsy',
        'mean': 'Mean Composite Similarity',
        'median': 'Median Composite Similarity'
    })

    # Step 5: Save the final DataFrame to a CSV file.
    output_filename = 'sector_pair_similarity_1.csv'
    sector_similarity_stats.to_csv(output_filename, index=False)

    print(f"Successfully created the file: '{output_filename}'")
    print("\nHere's a preview of the resulting data:")
    print(sector_similarity_stats.head())

except FileNotFoundError:
    print(f"Error: '{input_filename}' not found.")
    print("Please make sure the file is uploaded to your Colab environment.")
except KeyError as e:
    print(f"Error: A required column was not found in the file: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Successfully created the file: 'sector_pair_similarity_1.csv'

Here's a preview of the resulting data:
  Correct Sector Matchedx              Tagsx  \
0           Accommodation  Declining Sectors   
1           Accommodation  Declining Sectors   
2           Accommodation  Declining Sectors   
3           Accommodation  Declining Sectors   
4           Accommodation  Declining Sectors   

                             Correct Sector Matchedy              Tagsy  \
0                                      Accommodation  Declining Sectors   
1  Activities auxiliary to financial service and ...  Expanding Sectors   
2  Activities of head offices; management consult...  Expanding Sectors   
3  Activities of households as employers of domes...  Declining Sectors   
4                    Advertising and market research  Expanding Sectors   

   Mean Composite Similarity  Median Composite Similarity  
0                  61.683298                    63.626001  
1                  54.589367         

In [None]:
import pandas as pd

def find_top_matches(group):
    """
    Pick the best-matching target sectors for one source-sector group.

    Rows pairing a sector with itself are discarded first. From the rest, the
    3 rows with the highest 'Mean Composite Similarity' are taken three times:
    1. Overall: from all remaining rows.
    2. Green Sectors: from rows whose 'Tagsy' is 'Green Sectors'.
    3. Expanding Sectors: from rows whose 'Tagsy' is 'Expanding Sectors'.

    Returns the three selections stacked in that order, each labelled in a
    new 'Match Category' column. The input frame is not modified.
    """
    metric = 'Mean Composite Similarity'
    candidates = group.loc[group['Correct Sector Matchedx'] != group['Correct Sector Matchedy']]

    # (label written to 'Match Category', required 'Tagsy' value or None for no filter)
    selections = (
        ('Overall', None),
        ('Green Sectors', 'Green Sectors'),
        ('Expanding Sectors', 'Expanding Sectors'),
    )

    pieces = []
    for label, required_tag in selections:
        pool = candidates if required_tag is None else candidates[candidates['Tagsy'] == required_tag]
        best = pool.nlargest(3, metric).copy()
        best['Match Category'] = label
        pieces.append(best)

    return pd.concat(pieces)

# NOTE(review): the aggregation cell in this notebook writes
# 'sector_pair_similarity_1.csv'; this cell reads 'sector_pair_similarity.csv'
# — confirm this is the intended input.
input_filename = 'sector_pair_similarity.csv'

try:
    # Step 1: Load the sector similarity data.
    df = pd.read_csv(input_filename)

    # Step 2: Define the source categories you want to analyze.
    source_categories = ['Declining Sectors', 'Declining Sectors with net zero transition']
    source_df = df[df['Tagsx'].isin(source_categories)].copy()

    # Step 3: Run `find_top_matches` for each unique source sector.
    # The groups are iterated explicitly instead of using `groupby(...).apply(...)`:
    # `find_top_matches` needs the grouping column, and passing grouping columns
    # to `apply` is deprecated in pandas. Iteration visits groups in the same
    # sorted-key order, so the resulting row order is unchanged.
    per_sector_matches = [
        find_top_matches(sector_rows)
        for _, sector_rows in source_df.groupby('Correct Sector Matchedx')
    ]
    if not per_sector_matches:
        raise ValueError(f"No rows in '{input_filename}' are tagged with any of {source_categories}.")
    analysis_results = pd.concat(per_sector_matches).reset_index(drop=True)

    # Step 4: Add a 'Rank' column for the top matches within each category.
    # Rows arrive sorted best-first within each category, so the running count is the rank.
    analysis_results['Rank'] = analysis_results.groupby(['Correct Sector Matchedx', 'Match Category']).cumcount() + 1

    # Step 5: Select and rename the columns for the final report.
    final_output = analysis_results[[
        'Tagsx',
        'Correct Sector Matchedx',
        'Match Category',
        'Rank',
        'Correct Sector Matchedy',
        'Tagsy',
        'Mean Composite Similarity'
    ]].rename(columns={
        'Tagsx': 'Source Sector Category',
        'Correct Sector Matchedx': 'Source Sector Name',
        'Correct Sector Matchedy': 'Matched Sector Name',
        'Tagsy': 'Matched Sector Tag'
    })

    # Step 6: Save the final analysis to a new CSV file.
    output_filename = 'Analysis2.csv'
    final_output.to_csv(output_filename, index=False)

    print(f"Successfully created the analysis file: '{output_filename}'")
    print("\nHere's a preview of the resulting data:")
    print(final_output.head())

except FileNotFoundError:
    print(f"Error: '{input_filename}' not found.")
    print("Please make sure the file is uploaded to your Colab environment.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Successfully created the analysis file: 'Analysis2.csv'

Here's a preview of the resulting data:
  Source Sector Category Source Sector Name Match Category  Rank  \
0      Declining Sectors      Accommodation        Overall     1   
1      Declining Sectors      Accommodation        Overall     2   
2      Declining Sectors      Accommodation        Overall     3   
3      Declining Sectors      Accommodation  Green Sectors     1   
4      Declining Sectors      Accommodation  Green Sectors     2   

                                 Matched Sector Name  \
0               Food and beverage service activities   
1  Travel agency, tour operator, reservation serv...   
2  Insurance, reinsurance and pension funding, ex...   
3                                                 PV   
4                                           Hydrogen   

                           Matched Sector Tag  Mean Composite Similarity  
0                           Declining Sectors                  62.520766  
1  Decl

  analysis_results = source_df.groupby('Correct Sector Matchedx').apply(find_top_matches).reset_index(drop=True)


In [None]:
import pandas as pd

def find_top_matches(group):
    """
    Pick the best-scoring partner sectors for one source-sector group.

    Rows that pair a sector with itself are removed first. The remaining rows
    are ranked by 'Mean Composite Similarity' inside three pools:

    * 'Overall'           - every remaining row,
    * 'Green Sectors'     - rows whose 'Tagsy' equals 'Green Sectors',
    * 'Expanding Sectors' - rows whose 'Tagsy' equals 'Expanding Sectors'.

    Up to three rows are kept per pool, and each kept row is labelled with its
    pool name in a new 'Match Category' column. The pools are concatenated in
    the order listed above and the original row labels are preserved.
    """
    score_col = 'Mean Composite Similarity'

    # A sector compared with itself is not a useful match.
    is_other_sector = group['Correct Sector Matchedx'] != group['Correct Sector Matchedy']
    candidates = group[is_other_sector]

    target_tags = candidates['Tagsy']
    pools = (
        ('Overall', candidates),
        ('Green Sectors', candidates[target_tags == 'Green Sectors']),
        ('Expanding Sectors', candidates[target_tags == 'Expanding Sectors']),
    )

    # Rank each pool independently and tag the winners with the pool name.
    labelled = [
        pool.nlargest(3, score_col).assign(**{'Match Category': label})
        for label, pool in pools
    ]
    return pd.concat(labelled)

# Name the input once so the load call and every error message refer to the
# same file.
input_filename = 'sector_pair_similarity_1.csv'

try:
    # Step 1: Load the sector similarity data.
    # The final report selects both 'Mean Composite Similarity' and
    # 'Median Composite Similarity', so the file must contain both columns.
    df = pd.read_csv(input_filename)

    # Step 2: Keep only rows whose source sector belongs to a declining category.
    source_categories = ['Declining Sectors', 'Declining Sectors with net zero transition']
    source_df = df[df['Tagsx'].isin(source_categories)].copy()

    # Step 3: Run `find_top_matches` once per source sector.
    # The groups are iterated explicitly instead of using `groupby(...).apply(...)`:
    # `find_top_matches` reads the grouping column itself, and passing grouping
    # columns to the applied function is deprecated in pandas.
    per_sector_matches = [
        find_top_matches(sector_rows)
        for _, sector_rows in source_df.groupby('Correct Sector Matchedx')
    ]
    if not per_sector_matches:
        # Nothing to rank; fail with a clear message rather than an opaque concat error.
        raise ValueError(
            f"No rows in '{input_filename}' have a 'Tagsx' value in {source_categories}."
        )
    analysis_results = pd.concat(per_sector_matches, ignore_index=True)

    # Step 4: Number the matches 1..k within each (source sector, category) pair.
    # Rows arrive already ordered by descending similarity within each category.
    analysis_results['Rank'] = analysis_results.groupby(['Correct Sector Matchedx', 'Match Category']).cumcount() + 1

    # Step 5: Select and rename the columns for the final report.
    final_output = analysis_results[[
        'Tagsx',
        'Correct Sector Matchedx',
        'Match Category',
        'Rank',
        'Correct Sector Matchedy',
        'Tagsy',
        'Mean Composite Similarity',
        'Median Composite Similarity'
    ]].rename(columns={
        'Tagsx': 'Source Sector Category',
        'Correct Sector Matchedx': 'Source Sector Name',
        'Correct Sector Matchedy': 'Matched Sector Name',
        'Tagsy': 'Matched Sector Tag'
    })

    # Step 6: Save the final analysis to a new CSV file.
    output_filename = 'Analysis2_with_median_mean.csv'
    final_output.to_csv(output_filename, index=False)

    print(f"Successfully created the analysis file: '{output_filename}'")
    print("\nHere's a preview of the resulting data:")
    print(final_output.head())

except FileNotFoundError:
    print(f"Error: '{input_filename}' not found.")
    print("Please make sure the file is uploaded to your Colab environment.")
except KeyError as e:
    if 'Median Composite Similarity' in str(e):
        print(f"Error: The column 'Median Composite Similarity' was not found in '{input_filename}'.")
        print("Please ensure your input CSV file contains this column.")
    else:
        print(f"An unexpected KeyError occurred: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Successfully created the analysis file: 'Analysis2_with_median_mean.csv'

Here's a preview of the resulting data:
  Source Sector Category Source Sector Name Match Category  Rank  \
0      Declining Sectors      Accommodation        Overall     1   
1      Declining Sectors      Accommodation        Overall     2   
2      Declining Sectors      Accommodation        Overall     3   
3      Declining Sectors      Accommodation  Green Sectors     1   
4      Declining Sectors      Accommodation  Green Sectors     2   

                                 Matched Sector Name  \
0               Food and beverage service activities   
1  Travel agency, tour operator, reservation serv...   
2  Insurance, reinsurance and pension funding, ex...   
3                                           Hydrogen   
4                                                 PV   

                           Matched Sector Tag  Mean Composite Similarity  \
0                           Declining Sectors                  6

  analysis_results = source_df.groupby('Correct Sector Matchedx').apply(find_top_matches).reset_index(drop=True)


3rd part of the Analysis

In [None]:
import pandas as pd

try:
    # --- Step 1: Load all three data files ---
    print("Step 1: Loading all three source files...")
    sim_df = pd.read_parquet('job_similarity_matrix_updated.parquet')
    nco_csv_df = pd.read_csv('nco-clean.csv')
    jobs_db_df = pd.read_parquet('Jobs and skills database 15Sept25.parquet')

    # --- Step 2: Prepare the two separate lookup tables ---
    print("Step 2: Preparing NCO code lookup tables...")

    # Lookup Table 1 (primary source, from the CSV): 'jobRole' -> 'nco_clean'.
    # Rows with a missing role or code are dropped before the cast to text, so
    # no missing value is turned into the string 'nan'.
    lookup_csv = nco_csv_df[['jobRole', 'nco_clean']].copy()
    lookup_csv.dropna(inplace=True)
    lookup_csv['nco_clean'] = lookup_csv['nco_clean'].astype(str)
    lookup_csv.drop_duplicates(subset=['jobRole'], keep='first', inplace=True)

    # Lookup Table 2 (secondary source, from the Parquet database):
    # 'Title' -> 'nco_code_full' with the surrounding brackets removed.
    lookup_parquet = jobs_db_df[['Title', 'nco_code_full']].copy()
    lookup_parquet.rename(columns={'Title': 'jobRole', 'nco_code_full': 'nco_clean'}, inplace=True)
    lookup_parquet.dropna(inplace=True)
    bracketless_codes = lookup_parquet['nco_clean'].str.strip('[]')
    # `.str.strip` yields a missing value for non-text entries, and a bracketed
    # placeholder such as '[nan]' strips down to the text 'nan'. Casting either
    # to str would store the literal 'nan' as if it were a real code, which
    # `isnull()` cannot detect afterwards. Blank those entries out instead.
    is_placeholder = bracketless_codes.astype(str).str.strip().str.lower().isin(['', 'nan', 'none'])
    lookup_parquet['nco_clean'] = bracketless_codes.mask(is_placeholder)
    # Drop unusable codes BEFORE deduplicating so a title whose first row has
    # no usable code can still be represented by a later row that has one.
    lookup_parquet.dropna(subset=['nco_clean'], inplace=True)
    lookup_parquet['nco_clean'] = lookup_parquet['nco_clean'].astype(str)
    lookup_parquet.drop_duplicates(subset=['jobRole'], keep='first', inplace=True)

    # --- Step 3: First Pass - Merge with the CSV data ---
    print("Step 3: Matching NCO codes using 'nco-clean.csv' as the primary source...")

    # Merge for jobRole1. The lookup has one row per jobRole, so the left merge
    # keeps the row count of sim_df unchanged.
    merged_df = pd.merge(sim_df, lookup_csv, left_on='jobRole1', right_on='jobRole', how='left')
    merged_df.rename(columns={'nco_clean': 'nco_clean1'}, inplace=True)
    merged_df.drop(columns=['jobRole'], inplace=True)

    # Merge for jobRole2
    merged_df = pd.merge(merged_df, lookup_csv, left_on='jobRole2', right_on='jobRole', how='left')
    merged_df.rename(columns={'nco_clean': 'nco_clean2'}, inplace=True)
    merged_df.drop(columns=['jobRole'], inplace=True)

    # --- Step 4: Second Pass - Fill missing values using the Parquet database ---
    print("Step 4: Filling unmatched rows using the 'Jobs and skills database' as a secondary source...")

    # Title -> code mapping from the secondary source.
    parquet_map = lookup_parquet.set_index('jobRole')['nco_clean']

    # Only rows still missing a code after the first pass are filled.
    merged_df['nco_clean1'] = merged_df['nco_clean1'].fillna(merged_df['jobRole1'].map(parquet_map))
    merged_df['nco_clean2'] = merged_df['nco_clean2'].fillna(merged_df['jobRole2'].map(parquet_map))

    # --- Step 5: Report on unmatched jobs and save the file ---
    unmatched_1_count = merged_df['nco_clean1'].isnull().sum()
    unmatched_2_count = merged_df['nco_clean2'].isnull().sum()

    print("\n--- Match Report ---")
    print(f"Unmatched NCO codes for jobRole1: {unmatched_1_count} out of {len(merged_df)}")
    print(f"Unmatched NCO codes for jobRole2: {unmatched_2_count} out of {len(merged_df)}")
    print("--------------------")

    output_filename = 'job_similarity_matrix_with_nco.parquet'
    merged_df.to_parquet(output_filename, index=False)

    print(f"\n✅ Successfully updated and saved data to '{output_filename}'")
    print("\nHere's a preview of the final data with the new NCO columns:")
    print(merged_df[['jobRole1', 'nco_clean1', 'jobRole2', 'nco_clean2']].head())

except FileNotFoundError as e:
    print(f"❌ Error: A file was not found. Please ensure all required files are in the correct directory.")
    print(f"Missing file: {e.filename}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Step 1: Loading all three source files...
Step 2: Preparing NCO code lookup tables...
Step 3: Matching NCO codes using 'nco-clean.csv' as the primary source...
Step 4: Filling unmatched rows using the 'Jobs and skills database' as a secondary source...

--- Match Report ---
Unmatched NCO codes for jobRole1: 484806 out of 3352428
Unmatched NCO codes for jobRole2: 421266 out of 3352428
--------------------

✅ Successfully updated and saved data to 'job_similarity_matrix_with_nco.parquet'

Here's a preview of the final data with the new NCO columns:
                        jobRole1 nco_clean1  \
0  Airport Warehouse Coordinator        nan   
1        Aerospace CNC Machinist     7223.5   
2        Aerospace CNC Machinist     7223.5   
3        Aerospace CNC Machinist     7223.5   
4        Aerospace CNC Machinist     7223.5   

                                         jobRole2 nco_clean2  
0    Advertising Operations Coordinator (Digital)    1222.01  
1                                    M

In [None]:
import pandas as pd

def _blank_placeholder_codes(codes):
    """Return *codes* with empty, 'nan' and 'None' text entries replaced by missing values.

    Casting a missing value to str produces the literal text 'nan' (or 'None'),
    which `isnull()` / `fillna()` then treat as a real code. This helper turns
    such entries back into proper missing values; all other entries are kept
    unchanged.
    """
    as_text = codes.astype(str).str.strip().str.lower()
    return codes.mask(as_text.isin(['', 'nan', 'none']))


try:
    # --- Step 1: Load all necessary data files ---
    print("Step 1: Loading data files...")
    sim_df = pd.read_parquet('job_similarity_matrix_with_nco.parquet')
    nco_csv_df = pd.read_csv('nco-clean.csv')
    jobs_db_df = pd.read_parquet('Jobs and skills database 15Sept25.parquet')

    # Placeholder text left in the code columns would be counted as "matched"
    # and would never be filled below, so convert it to real missing values first.
    for code_col in ('nco_clean1', 'nco_clean2'):
        sim_df[code_col] = _blank_placeholder_codes(sim_df[code_col])

    # --- Step 2: Initial Report on Unmatched Declining Sector Jobs ---
    print("\n--- Initial Report (Before Second Pass) ---")
    declining_tags = ['Declining Sectors', 'Declining Sectors with net zero transition']

    # Count initially unmatched jobs where Tags_1 is a declining category
    initial_unmatched_1 = sim_df[
        (sim_df['Tags_1'].isin(declining_tags)) &
        (sim_df['nco_clean1'].isnull())
    ].shape[0]

    # Count initially unmatched jobs where Tags_2 is a declining category
    initial_unmatched_2 = sim_df[
        (sim_df['Tags_2'].isin(declining_tags)) &
        (sim_df['nco_clean2'].isnull())
    ].shape[0]

    # This is a count of row/side instances, not of distinct jobs.
    total_initial_unmatched = initial_unmatched_1 + initial_unmatched_2
    print(f"Initially, there are {total_initial_unmatched} job instances in declining sectors with missing NCO codes.")
    print("---------------------------------------------")

    # --- Step 3: Create a new, consolidated lookup table based on Uniquecode ---
    print("\nStep 3: Creating a new lookup table from 'qpCode' and 'NQR code'...")

    # Source 1: qpCode from nco-clean.csv.
    # Missing rows are removed before the cast so they do not become 'nan' text;
    # the cast keeps the codes as text, matching the existing nco_clean columns.
    lookup1 = nco_csv_df[['qpCode', 'nco_clean']].copy()
    lookup1.rename(columns={'qpCode': 'code_key'}, inplace=True)
    lookup1.dropna(subset=['code_key', 'nco_clean'], inplace=True)
    lookup1['nco_clean'] = lookup1['nco_clean'].astype(str)

    # Source 2: NQR code from Jobs and skills database, brackets removed.
    lookup2 = jobs_db_df[['NQR code', 'nco_code_full']].copy()
    lookup2.rename(columns={'NQR code': 'code_key', 'nco_code_full': 'nco_clean'}, inplace=True)
    # Blank out unusable entries BEFORE any cast to str; otherwise the dropna
    # below cannot remove them and they may win the deduplication.
    lookup2['nco_clean'] = _blank_placeholder_codes(lookup2['nco_clean'].str.strip('[]'))
    lookup2.dropna(subset=['code_key', 'nco_clean'], inplace=True)
    lookup2['nco_clean'] = lookup2['nco_clean'].astype(str)

    # Combine, clean, and deduplicate to create a single lookup source.
    # lookup1 comes first, so with keep='first' the CSV wins on a shared key.
    uniquecode_lookup = pd.concat([lookup1, lookup2], ignore_index=True)
    uniquecode_lookup.dropna(subset=['code_key', 'nco_clean'], inplace=True)
    uniquecode_lookup.drop_duplicates(subset=['code_key'], keep='first', inplace=True)

    print(f"Created a consolidated lookup table with {len(uniquecode_lookup)} unique code keys.")

    # --- Step 4: Perform the second matching pass to fill missing NCO codes ---
    print("\nStep 4: Filling missing NCO codes using the new lookup table...")

    # code_key -> nco_clean mapping
    uniquecode_map = uniquecode_lookup.set_index('code_key')['nco_clean']

    # Only rows that are still missing a code are filled; existing codes are kept.
    sim_df['nco_clean1'] = sim_df['nco_clean1'].fillna(sim_df['Uniquecode1'].map(uniquecode_map))
    sim_df['nco_clean2'] = sim_df['nco_clean2'].fillna(sim_df['Uniquecode2'].map(uniquecode_map))

    # --- Step 5: Final Report and Save ---
    print("\n--- Final Report (After Second Pass) ---")

    # Total number of jobs still left unmatched
    final_unmatched_1_count = sim_df['nco_clean1'].isnull().sum()
    final_unmatched_2_count = sim_df['nco_clean2'].isnull().sum()
    print(f"Total jobs still missing nco_clean1: {final_unmatched_1_count} out of {len(sim_df)}")
    print(f"Total jobs still missing nco_clean2: {final_unmatched_2_count} out of {len(sim_df)}")

    # Of those still left, how many are in declining sectors
    final_declining_unmatched_1 = sim_df[
        (sim_df['Tags_1'].isin(declining_tags)) &
        (sim_df['nco_clean1'].isnull())
    ].shape[0]

    final_declining_unmatched_2 = sim_df[
        (sim_df['Tags_2'].isin(declining_tags)) &
        (sim_df['nco_clean2'].isnull())
    ].shape[0]

    total_final_declining_unmatched = final_declining_unmatched_1 + final_declining_unmatched_2
    print(f"\nOf the remaining unmatched jobs, {total_final_declining_unmatched} instances belong to declining sectors.")
    print("----------------------------------------")

    # Save the final updated file
    output_filename = 'job_similarity_matrix_with_nco_v2.parquet'
    sim_df.to_parquet(output_filename, index=False)
    print(f"\n✅ Successfully completed the second pass and saved the final data to '{output_filename}'")


except FileNotFoundError as e:
    print(f"❌ Error: A file was not found. Please ensure all required files are in the correct directory.")
    print(f"Missing file: {e.filename}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Step 1: Loading data files...

--- Initial Report (Before Second Pass) ---
Initially, there are 336546 job instances in declining sectors with missing NCO codes.
---------------------------------------------

Step 3: Creating a new lookup table from 'qpCode' and 'NQR code'...
Created a consolidated lookup table with 3609 unique code keys.

Step 4: Filling missing NCO codes using the new lookup table...

--- Final Report (After Second Pass) ---
Total jobs still missing nco_clean1: 122696 out of 3352428
Total jobs still missing nco_clean2: 213870 out of 3352428

Of the remaining unmatched jobs, 108734 instances belong to declining sectors.
----------------------------------------

✅ Successfully completed the second pass and saved the final data to 'job_similarity_matrix_with_nco_v2.parquet'


In [None]:
import pandas as pd

try:
    # --- Step 1: Read the output of the second NCO matching pass ---
    print("Step 1: Loading the final updated Parquet file...")
    final_df = pd.read_parquet('job_similarity_matrix_with_nco_v2.parquet')

    # --- Step 2: Collect, for each side of a pair, the declining-sector jobs with no NCO code ---
    print("Step 2: Filtering for unmatched jobs in declining sectors...")
    declining_tags = ['Declining Sectors', 'Declining Sectors with net zero transition']

    # Both sides are projected onto the same three column names so they can be stacked.
    unified_columns = ['Uniquecode', 'JobRole', 'Tag']
    unmatched_by_side = []
    for side in ('1', '2'):
        tag_col = f'Tags_{side}'
        in_declining_sector = final_df[tag_col].isin(declining_tags)
        has_no_code = final_df[f'nco_clean{side}'].isnull()
        side_rows = final_df.loc[
            in_declining_sector & has_no_code,
            [f'Uniquecode{side}', f'jobRole{side}', tag_col],
        ]
        side_rows.columns = unified_columns
        unmatched_by_side.append(side_rows)

    # --- Step 3: Stack both sides and reduce to distinct jobs ---
    print("Step 3: Combining and deduplicating the list of jobs...")

    all_unmatched_df = pd.concat(unmatched_by_side, ignore_index=True)

    # The same job appears in many pairs; keep one row per distinct
    # (Uniquecode, JobRole, Tag) combination, ordered by job role.
    deduplicated = all_unmatched_df.drop_duplicates()
    unique_unmatched_jobs_df = deduplicated.sort_values(by='JobRole')

    # --- Step 4: Write the list to CSV ---
    output_filename = 'unmatched_declining_jobs.csv'
    unique_unmatched_jobs_df.to_csv(output_filename, index=False)

    print(f"\n✅ Found {len(unique_unmatched_jobs_df)} unique jobs in declining sectors that are still unmatched.")
    print(f"A CSV file with this list has been saved as '{output_filename}'")

except FileNotFoundError:
    print("❌ Error: The file 'job_similarity_matrix_with_nco_v2.parquet' was not found.")
    print("Please ensure the previous step was run successfully.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Step 1: Loading the final updated Parquet file...
Step 2: Filtering for unmatched jobs in declining sectors...
Step 3: Combining and deduplicating the list of jobs...

✅ Found 40 unique jobs in declining sectors that are still unmatched.
A CSV file with this list has been saved as 'unmatched_declining_jobs.csv'


In [None]:
import pandas as pd

try:
    # --- Step 1: Read the similarity matrix and the prominent-occupation table ---
    print("Step 1: Loading the data files...")
    sim_df = pd.read_parquet('job_similarity_matrix_with_nco_v2.parquet')
    prominent_df = pd.read_csv('prominent occupations.csv')

    # --- Step 2: Derive the leading digit of each NCO code ---
    print("Step 2: Extracting the very first digit from NCO codes...")

    # The code is cast to text and its first character is parsed as a number.
    # Anything that is not a digit (for example the first letter of a missing
    # value's text form) is coerced to <NA>.
    for side in ('1', '2'):
        leading_char = sim_df[f'nco_clean{side}'].astype(str).str[0]
        leading_digit = pd.to_numeric(leading_char, errors='coerce')
        sim_df[f'nco_first{side}'] = leading_digit.astype('Int64')

    # --- Step 3: Build the set of (sector, first digit) pairs that count as prominent ---
    print("Step 3: Preparing the prominent occupations lookup table...")
    prominent_df['nco_first'] = prominent_df['nco_first'].astype('Int64')
    prominent_pairs = {
        (sector, digit)
        for sector, digit in zip(prominent_df['sector'], prominent_df['nco_first'])
    }

    # --- Step 4: Flag each side of every pair as prominent (1) or not (0) ---
    print("Step 4: Matching jobs to determine prominence...")

    def check_prominence_1(row):
        """Return 1 when the first job's (sector, NCO first digit) pair is prominent, else 0."""
        first_digit = row['nco_first1']
        if pd.isna(first_digit):
            # No usable NCO digit means the job cannot be looked up.
            return 0
        return int((row['Correct Sector Matched_1'], first_digit) in prominent_pairs)

    sim_df['prominent_or_not_1'] = sim_df.apply(check_prominence_1, axis=1)

    def check_prominence_2(row):
        """Return 1 when the second job's (sector, NCO first digit) pair is prominent, else 0."""
        first_digit = row['nco_first2']
        if pd.isna(first_digit):
            return 0
        return int((row['Correct Sector Matched_2'], first_digit) in prominent_pairs)

    sim_df['prominent_or_not_2'] = sim_df.apply(check_prominence_2, axis=1)

    # The digit columns were only needed for the lookup above.
    sim_df.drop(columns=['nco_first1', 'nco_first2'], inplace=True)

    # --- Step 5: Persist the flagged matrix ---
    output_filename = 'job_similarity_with_prominence.parquet'
    sim_df.to_parquet(output_filename, index=False)

    print(f"\n✅ Successfully updated data and saved to '{output_filename}'")

    print("\nHere's a preview of the final data with the new 'prominent_or_not' columns:")
    preview_cols = []
    for side in ('1', '2'):
        preview_cols += [
            f'Correct Sector Matched_{side}',
            f'nco_clean{side}',
            f'prominent_or_not_{side}',
        ]
    print(sim_df[preview_cols].head())

except FileNotFoundError as e:
    print(f"❌ Error: A file was not found. Please ensure all required files are in the correct directory.")
    print(f"Missing file: {e.filename}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Step 1: Loading the data files...
Step 2: Extracting the very first digit from NCO codes...
Step 3: Preparing the prominent occupations lookup table...
Step 4: Matching jobs to determine prominence...

✅ Successfully updated data and saved to 'job_similarity_with_prominence.parquet'

Here's a preview of the final data with the new 'prominent_or_not' columns:
  Correct Sector Matched_1 nco_clean1  prominent_or_not_1  \
0            Air transport        nan                   0   
1      Other manufacturing     7223.5                   0   
2      Other manufacturing     7223.5                   0   
3      Other manufacturing     7223.5                   0   
4      Other manufacturing     7223.5                   0   

                            Correct Sector Matched_2 nco_clean2  \
0            Programming and broadcasting activities    1222.01   
1                                Other manufacturing       None   
2  Sports activities and amusement and recreation...    3423.02   
3  O

In [None]:
import pandas as pd

try:
    # --- Step 1: Read the prominence-flagged similarity matrix ---
    print("Step 1: Loading the dataset...")
    df = pd.read_parquet('job_similarity_with_prominence.parquet')

    # --- Step 2: Re-orient every pair so the prominent declining job is the 'source' ---
    print("Step 2: Isolating prominent jobs from declining sectors...")
    declining_tags = ['Declining Sectors', 'Declining Sectors with net zero transition']
    normalized_columns = ['Source JobRole', 'Source Sector', 'Source Tag', 'Target Tag', 'Similarity']

    # A pair is taken once for each side that holds a prominent declining job,
    # with the opposite side supplying the target tag.
    oriented_frames = []
    for source_side, target_side in (('1', '2'), ('2', '1')):
        is_prominent_declining = (
            df[f'Tags_{source_side}'].isin(declining_tags)
            & (df[f'prominent_or_not_{source_side}'] == 1)
        )
        oriented = df.loc[
            is_prominent_declining,
            [
                f'jobRole{source_side}',
                f'Correct Sector Matched_{source_side}',
                f'Tags_{source_side}',
                f'Tags_{target_side}',
                'Composite Similarity',
            ],
        ].copy()
        oriented.columns = normalized_columns
        oriented_frames.append(oriented)

    analysis_df = pd.concat(oriented_frames, ignore_index=True)
    print(f"Created a normalized dataset with {len(analysis_df)} comparisons to analyze.")

    # --- Step 3: Mean / median similarity per source job ---
    print("Step 3: Calculating mean and median similarity scores...")

    source_keys = ['Source JobRole', 'Source Sector', 'Source Tag']

    def summarize_similarity(rows, label):
        """Return per-source-job mean and median of 'Similarity', columns prefixed with *label*."""
        stats = rows.groupby(source_keys)['Similarity'].agg(['mean', 'median'])
        stats.columns = [f'{label} Mean', f'{label} Median']
        return stats

    # Against every target, then restricted to green and to expanding targets.
    overall_stats = summarize_similarity(analysis_df, 'Overall')
    green_stats = summarize_similarity(
        analysis_df[analysis_df['Target Tag'] == 'Green Sectors'], 'Green Sector'
    )
    expanding_stats = summarize_similarity(
        analysis_df[analysis_df['Target Tag'] == 'Expanding Sectors'], 'Expanding Sector'
    )

    # --- Step 4: Assemble the report ---
    print("Step 4: Combining results into the final report...")

    # Left joins keep every source job even when it has no green/expanding comparisons.
    final_report = overall_stats.join(green_stats, how='left')
    final_report = final_report.join(expanding_stats, how='left')

    # Number of distinct prominent source jobs per declining tag, repeated on each row of that tag.
    prominent_job_counts = analysis_df.groupby('Source Tag')['Source JobRole'].nunique().to_dict()
    source_tags = final_report.index.get_level_values('Source Tag')
    final_report['Unique Prominent Jobs in Category'] = source_tags.map(prominent_job_counts)

    # Turn the group keys back into columns, then replace missing statistics
    # (e.g. a job with no comparisons to Green sectors) with 0.
    final_report = final_report.reset_index().fillna(0)


    # --- Step 5: Write the report ---
    output_filename = 'prominent_declining_similarity_analysis.csv'
    final_report.to_csv(output_filename, index=False)

    print(f"\n✅ Analysis complete. Final report saved to '{output_filename}'")
    print("\nHere's a preview of the final data:")
    print(final_report.head())


except FileNotFoundError as e:
    print(f"❌ Error: The required Parquet file was not found.")
    print(f"Missing file: {e.filename}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Step 1: Loading the dataset...
Step 2: Isolating prominent jobs from declining sectors...
Created a normalized dataset with 209699 comparisons to analyze.
Step 3: Calculating mean and median similarity scores...
Step 4: Combining results into the final report...

✅ Analysis complete. Final report saved to 'prominent_declining_similarity_analysis.csv'

Here's a preview of the final data:
             Source JobRole                                Source Sector  \
0            AR VR Producer  Creative, arts and entertainment activities   
1           AR-VR Developer  Creative, arts and entertainment activities   
2   Airline Baggage Handler                                Air transport   
3  Animation Associate - 3D  Creative, arts and entertainment activities   
4        Animation Director  Creative, arts and entertainment activities   

                                   Source Tag  Overall Mean  Overall Median  \
0  Declining Sectors with net zero transition     41.786175       39.58292

In [None]:
import pandas as pd

try:
    # --- Step 1: Load the dataset ---
    print("Step 1: Loading the dataset...")
    df = pd.read_parquet('job_similarity_with_prominence.parquet')

    # --- Step 2: Normalize data, now including target job details ---
    # Each pair is re-oriented so that the prominent declining job is the
    # "source" and the other job of the pair is the "target".
    print("Step 2: Isolating prominent jobs from declining sectors...")
    declining_tags = ['Declining Sectors', 'Declining Sectors with net zero transition']
    normalized_columns = ['Source JobRole', 'Source Sector', 'Source Tag', 'Target Tag', 'Similarity', 'Target Uniquecode', 'Target JobRole', 'Target Sector']

    # Part 1: Prominent declining job is in the '..._1' columns
    cols1 = ['jobRole1', 'Correct Sector Matched_1', 'Tags_1', 'Tags_2', 'Composite Similarity', 'Uniquecode2', 'jobRole2', 'Correct Sector Matched_2']
    source1_renamed = df.loc[
        (df['Tags_1'].isin(declining_tags)) &
        (df['prominent_or_not_1'] == 1),
        cols1
    ].copy()
    source1_renamed.columns = normalized_columns

    # Part 2: Prominent declining job is in the '..._2' columns
    cols2 = ['jobRole2', 'Correct Sector Matched_2', 'Tags_2', 'Tags_1', 'Composite Similarity', 'Uniquecode1', 'jobRole1', 'Correct Sector Matched_1']
    source2_renamed = df.loc[
        (df['Tags_2'].isin(declining_tags)) &
        (df['prominent_or_not_2'] == 1),
        cols2
    ].copy()
    source2_renamed.columns = normalized_columns

    analysis_df = pd.concat([source1_renamed, source2_renamed], ignore_index=True)
    print(f"Created a normalized dataset with {len(analysis_df)} comparisons to analyze.")

    # --- Step 3: Calculate Statistics ---
    print("Step 3: Calculating mean and median similarity scores...")

    source_keys = ['Source JobRole', 'Source Sector', 'Source Tag']

    grouped = analysis_df.groupby(source_keys)
    overall_stats = grouped['Similarity'].agg(['mean', 'median']).rename(columns={'mean': 'Overall Mean', 'median': 'Overall Median'})

    green_df = analysis_df[analysis_df['Target Tag'] == 'Green Sectors']
    green_stats = green_df.groupby(source_keys)['Similarity'].agg(['mean', 'median']).rename(columns={'mean': 'Green Sector Mean', 'median': 'Green Sector Median'})

    expanding_df = analysis_df[analysis_df['Target Tag'] == 'Expanding Sectors']
    expanding_stats = expanding_df.groupby(source_keys)['Similarity'].agg(['mean', 'median']).rename(columns={'mean': 'Expanding Sector Mean', 'median': 'Expanding Sector Median'})

    # --- Step 4: Find the single closest job in the Expanding category ---
    print("Step 4: Identifying the closest matching job from the Expanding Sector...")

    # Row label of the highest-similarity expanding-sector comparison per source job.
    idx = expanding_df.groupby(source_keys)['Similarity'].idxmax()
    # Take an explicit copy: a column is added below, and writing to an object
    # derived from a filtered frame is unreliable without one.
    closest_jobs = expanding_df.loc[idx].copy()

    # Format the details into a single string:
    # "<job role> (<unique code>) | Sector: <sector>"
    closest_jobs['closest_job_expanding'] = (
        closest_jobs['Target JobRole'] + " (" +
        closest_jobs['Target Uniquecode'] + ") | Sector: " +
        closest_jobs['Target Sector']
    )

    # Keep only the source job keys (as the index, for joining) and the formatted string.
    closest_job_details = closest_jobs[source_keys + ['closest_job_expanding']].set_index(source_keys)


    # --- Step 5: Combine all results and add job counts ---
    print("Step 5: Combining all results into the final report...")

    # Left joins keep every source job, even one with no green/expanding comparisons.
    final_report = overall_stats.join(green_stats, how='left') \
                                .join(expanding_stats, how='left') \
                                .join(closest_job_details, how='left')

    # Number of distinct prominent source jobs per declining tag.
    prominent_job_counts = analysis_df.groupby('Source Tag')['Source JobRole'].nunique().to_dict()
    final_report['Unique Prominent Jobs in Category'] = final_report.index.get_level_values('Source Tag').map(prominent_job_counts)

    final_report.reset_index(inplace=True)

    # Fill any NaN values with 0 for numeric columns and 'N/A' for the text column.
    # The result is assigned back to the column: calling `fillna(..., inplace=True)`
    # on `final_report[col]` acts on a temporary object, which pandas warns about
    # and which stops updating the frame under copy-on-write.
    for col in final_report.columns:
        if pd.api.types.is_numeric_dtype(final_report[col]):
            final_report[col] = final_report[col].fillna(0)
    final_report['closest_job_expanding'] = final_report['closest_job_expanding'].fillna('N/A')


    # --- Step 6: Save to CSV ---
    output_filename = 'prominent_declining_similarity_analysis.csv'
    final_report.to_csv(output_filename, index=False)

    print(f"\n✅ Analysis complete. Final report saved to '{output_filename}'")
    print("\nHere's a preview of the final data:")
    print(final_report.head())


except FileNotFoundError as e:
    print(f"❌ Error: The required Parquet file was not found.")
    print(f"Missing file: {e.filename}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Step 1: Loading the dataset...
Step 2: Isolating prominent jobs from declining sectors...
Created a normalized dataset with 209699 comparisons to analyze.
Step 3: Calculating mean and median similarity scores...
Step 4: Identifying the closest matching job from the Expanding Sector...
Step 5: Combining all results into the final report...

✅ Analysis complete. Final report saved to 'prominent_declining_similarity_analysis.csv'

Here's a preview of the final data:
             Source JobRole                                Source Sector  \
0            AR VR Producer  Creative, arts and entertainment activities   
1           AR-VR Developer  Creative, arts and entertainment activities   
2   Airline Baggage Handler                                Air transport   
3  Animation Associate - 3D  Creative, arts and entertainment activities   
4        Animation Director  Creative, arts and entertainment activities   

                                   Source Tag  Overall Mean  Overall Median

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_report[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_report['closest_job_expanding'].fillna('N/A', inplace=True)


4th part of the Analysis

In [None]:
import pandas as pd

try:
    # Step 1: Load the Parquet file into a DataFrame.
    # The file name lives in one variable so that the "not found" message
    # below always names the file that was actually requested.
    input_filename = 'job_similarity_matrix_updated.parquet'
    df = pd.read_parquet(input_filename)

    # Step 2: Identify the declining sector tags.
    declining_tags = ['Declining Sectors', 'Declining Sectors with net zero transition']

    # Step 3: Extract the comparisons whose first job ('jobRole1') carries a
    # declining tag, and give the columns side-neutral names.
    # NOTE: only the first side of each pair is used here; rows where the
    # declining job appears only as 'jobRole2' (via 'Tags_2') are not included.
    part1 = df[df['Tags_1'].isin(declining_tags)][['jobRole1', 'subsector_name1', 'Tags_1', 'Composite Similarity']]
    part1.columns = ['Job Role', 'Subsector', 'Tag', 'Composite Similarity']

    # Step 4: Use those rows as the full set of comparisons to aggregate.
    all_declining_jobs_df = part1

    # Step 5: Calculate the median composite similarity for each unique job.
    # We group by the job role (plus its subsector and tag) to aggregate all
    # of its comparison scores into a single median value.
    job_transition_scores = all_declining_jobs_df.groupby(
        ['Job Role', 'Subsector', 'Tag']
    )['Composite Similarity'].median().reset_index()

    # Step 6: Sort the results to rank jobs.
    # Jobs with higher median similarity are considered easier to transition from.
    job_transition_scores = job_transition_scores.sort_values(
        by='Composite Similarity', ascending=False
    )

    # Rename the similarity column for clarity in the final output.
    job_transition_scores = job_transition_scores.rename(
        columns={'Composite Similarity': 'Median Composite Similarity'}
    )

    # Step 7: Save the final ranked list to a CSV file.
    output_filename = 'declining_job_transition_scores.csv'
    job_transition_scores.to_csv(output_filename, index=False)

    print(f"✅ Successfully created the analysis file: '{output_filename}'")
    print("\nHere's a preview of the jobs ranked from easiest to hardest to transition:")
    print(job_transition_scores.head())

except FileNotFoundError:
    # Report the file this cell actually tried to open.
    print(f"❌ Error: '{input_filename}' not found.")
    print("Please make sure the file is uploaded to your Colab environment.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

✅ Successfully created the analysis file: 'declining_job_transition_scores.csv'

Here's a preview of the jobs ranked from easiest to hardest to transition:
                                              Job Role  \
781  Telecom Customer Care Executive - Call Center/...   
271         Cutting and Threading Operator (Metalware)   
108                        Automotive Accessory Fitter   
130                     Automotive Machining Assistant   
513                      Land Transportation Associate   

                      Subsector                Tag  \
781            Service Provider  Declining Sectors   
271                   Metalware  Declining Sectors   
108  Automotive Vehicle Service  Declining Sectors   
130  Automotive Vehicle Service  Declining Sectors   
513         Land Transportation  Declining Sectors   

     Median Composite Similarity  
781                    58.995535  
271                    58.414871  
108                    58.242354  
130                    58.1912

In [None]:
import pandas as pd

try:
    # Load the pairwise job-similarity table.
    df = pd.read_parquet('job_similarity_matrix_with_tags.parquet')

    # Tags that mark a job as belonging to a declining sector.
    declining_tags = ['Declining Sectors', 'Declining Sectors with net zero transition']

    # Side-neutral column names used once both sides are stacked together.
    generic_columns = [
        'Unique Code', 'Job Role', 'Sector ID', 'Sector Name', 'Subsector ID', 'Subsector Name',
        'Correct Sector Matched', 'Tag', 'Composite Similarity'
    ]

    # Source columns for the first and second job of a pair, listed in the
    # same order as generic_columns so they can be renamed position by position.
    columns_to_keep_1 = [
        'Uniquecode1', 'jobRole1', 'sector_id1', 'sector_name1', 'subsector_id1', 'subsector_name1',
        'Correct Sector Matched_1', 'Tags_1', 'Composite Similarity'
    ]
    columns_to_keep_2 = [
        'Uniquecode2', 'jobRole2', 'sector_id2', 'sector_name2', 'subsector_id2', 'subsector_name2',
        'Correct Sector Matched_2', 'Tags_2', 'Composite Similarity'
    ]

    # Rows where the first job is declining, then rows where the second is.
    first_is_declining = df['Tags_1'].isin(declining_tags)
    part1 = df.loc[first_is_declining, columns_to_keep_1].rename(
        columns=dict(zip(columns_to_keep_1, generic_columns))
    )

    second_is_declining = df['Tags_2'].isin(declining_tags)
    part2 = df.loc[second_is_declining, columns_to_keep_2].rename(
        columns=dict(zip(columns_to_keep_2, generic_columns))
    )

    # Stack both selections into one long table of declining-job comparisons.
    all_declining_jobs_df = pd.concat([part1, part2], ignore_index=True)

    # Median score per job (grouping on every identifying column keeps them
    # in the output), ranked from highest to lowest median, with the score
    # column renamed for the report.
    identifying_columns = generic_columns[:-1]
    job_transition_scores = (
        all_declining_jobs_df
        .groupby(identifying_columns)['Composite Similarity']
        .median()
        .reset_index()
        .sort_values(by='Composite Similarity', ascending=False)
        .rename(columns={'Composite Similarity': 'Median Composite Similarity'})
    )

    # Write the ranked list to disk and show the top of it.
    output_filename = 'Analysis_4.csv'
    job_transition_scores.to_csv(output_filename, index=False)

    print(f"✅ Successfully created the analysis file: '{output_filename}'")
    print("\nHere's a preview of the jobs ranked from easiest to hardest to transition:")
    print(job_transition_scores.head())

except FileNotFoundError:
    print("❌ Error: 'job_similarity_matrix_with_tags.parquet' not found.")
    print("Please make sure the file is uploaded to your Colab environment.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

✅ Successfully created the analysis file: 'Analysis_4.csv'

Here's a preview of the jobs ranked from easiest to hardest to transition:
             Unique Code                                           Job Role  \
20   2022/AUT/ASDC/06558                     Automotive Machining Assistant   
808            TEL/Q0100  Telecom Customer Care Executive - Call Center/...   
233            HCS/Q2911         Cutting and Threading Operator (Metalware)   
841            THC/Q3303                                     Kitchen Helper   
324            LSC/Q1001                      Land Transportation Associate   

     Sector ID             Sector Name  Subsector ID  \
20           1              Automotive           106   
808         36                 Telecom          3601   
233         17  Handicrafts and Carpet          1703   
841         38   Tourism & Hospitality          3802   
324         22               Logistics          2202   

                 Subsector Name  \
20   Automotive Ve

Analysis 5 and 6

In [None]:
import pandas as pd

try:
    # Read the jobs table and the per-subsector category lookup.
    jobs_df = pd.read_csv("processed_jobs_dataset_v4_direct.csv")
    categories_df = pd.read_csv("Subsector_categories.csv")

    # Left join on 'subsector_id': every job row is kept and gains whatever
    # columns categories_df carries for its subsector (expected to include
    # 'Correct Sector Matched' and 'Tags', which the preview below selects).
    updated_jobs_df = jobs_df.merge(categories_df, how='left', on='subsector_id')

    # Write to a separate file so the input CSV is left untouched.
    output_filename = "processed_jobs_dataset_with_tags.csv"
    updated_jobs_df.to_csv(output_filename, index=False)

    print("✅ Successfully merged the files!")
    print(f"The updated data has been saved to '{output_filename}'")

    # Quick visual check: the original job columns plus the two added ones.
    print("\nHere's a preview of the updated data with the new columns:")
    preview_cols = [*jobs_df.columns, 'Correct Sector Matched', 'Tags']
    print(updated_jobs_df[preview_cols].head())


except FileNotFoundError as e:
    print("❌ Error: A file was not found. Please ensure both CSV files are in the correct directory.")
    print(f"Missing file: {e.filename}")
except KeyError as e:
    print(f"❌ Error: A required column was not found. Please check your CSV files for the column: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

✅ Successfully merged the files!
The updated data has been saved to 'processed_jobs_dataset_with_tags.csv'

Here's a preview of the updated data with the new columns:
                       Uniquecode  \
0            2022/IS/IISSSC/06814   
1     NG-04-AG-02549-2024-V1-ASCI   
2    QG-5.5-AU-03589-2025-V2-ASDC   
3    QG-5.5-AU-03589-2025-V2-ASDC   
4  QG-2.5-AU-00698-2023-V1.1-ASDC   

                                             jobRole  \
0                        EOT Overhead Crane Operator   
1  Repair and maintenance of solar powered farm e...   
2           Automotive Customer Relationship Manager   
3           Automotive Customer Relationship Manager   
4                 Electric Vehicle Service Assistant   

                                         jobRoleDesc  sector_id  \
0  The individual in this role operating overhead...         20   
1  This OS unit is about repair and maintenance o...         13   
2  A Customer Relationship Manager is responsible...          1   
3  A 

In [None]:
import pandas as pd
import json
import warnings

# Silence warnings whose message starts with the text below.
warnings.filterwarnings(
    "ignore",
    message="Passing a string to a function is deprecated",
)


def extract_l2_skills(json_string):
    """
    Return the distinct 'l2id' values found in a skill_hierarchy JSON string.

    The decoded JSON is walked as a sequence of L1 entries; for every entry
    with a non-empty 'children' value, the 'l2id' of each child that has one
    is collected. Duplicates are removed and the order of the returned list
    is not specified.

    An empty list is returned when the input cannot be decoded as JSON or
    when walking it raises a TypeError (for example a non-string input).
    """
    collected = set()
    try:
        hierarchy = json.loads(json_string)
        for parent in hierarchy:
            # Skip L1 entries that have no children to inspect.
            if 'children' not in parent or not parent['children']:
                continue
            for child in parent['children']:
                if 'l2id' in child:
                    collected.add(child['l2id'])
    except (json.JSONDecodeError, TypeError):
        return []
    return list(collected)

try:
    # --- Data Preparation ---
    print("Step 1: Loading and preparing data...")

    jobs_df = pd.read_csv("processed_jobs_dataset_with_tags.csv")
    with open('skills_schema.json', 'r') as f:
        skills_schema = json.load(f)
    skills_df = pd.DataFrame(skills_schema)

    # Lookup from skill id to display name, restricted to level-2 schema rows.
    level_2_rows = skills_df[skills_df['level'] == 2]
    l2_skills_map = level_2_rows.set_index('id')['levelName'].to_dict()

    # One row per (job row, L2 skill id).
    jobs_df['l2_skill_ids'] = jobs_df['skill_hierarchy'].apply(extract_l2_skills)
    exploded_df = jobs_df.explode('l2_skill_ids')

    # Discard rows with no skill id or no tag, and blank skill ids.
    exploded_df.dropna(subset=['l2_skill_ids', 'Tags'], inplace=True)
    has_skill_id = exploded_df['l2_skill_ids'] != ''
    exploded_df = exploded_df[has_skill_id]

    # Translate ids to names; ids missing from the lookup are dropped.
    exploded_df['Skill Name'] = exploded_df['l2_skill_ids'].map(l2_skills_map)
    exploded_df.dropna(subset=['Skill Name'], inplace=True)
    print("Data preparation complete.")

    # --- Analysis 1: Skill Count for Each Category ---
    print("\nStep 2: Performing Analysis 1 - Top 10 Percentile Skills per Category...")

    # Number of exploded rows for every (category, skill) combination.
    skill_counts = (
        exploded_df
        .groupby(['Tags', 'Skill Name'])
        .size()
        .reset_index(name='count')
    )

    # Every category present in the data is processed, whatever its name.
    categories = skill_counts['Tags'].unique()

    top_skills_list = []
    for category in categories:
        in_category = skill_counts[skill_counts['Tags'] == category]
        # Keep the skills whose count reaches the category's 90th percentile.
        cutoff = in_category['count'].quantile(0.90)
        top_skills_list.append(
            in_category[in_category['count'] >= cutoff].sort_values('count', ascending=False)
        )

    top_skills_df = pd.concat(top_skills_list)
    output_filename_1 = "top_10_percentile_skills.csv"
    top_skills_df.to_csv(output_filename_1, index=False)
    print(f"✅ Analysis 1 complete. Results saved to '{output_filename_1}'")


    # --- Analysis 2: Green vs. Declining Skills (Improved) ---
    print("\nStep 3: Performing Analysis 2 - Green vs. Declining Skills...")

    # Skills as rows, categories as columns, counts as cells (0 where absent).
    skill_pivot = pd.pivot_table(
        skill_counts,
        index='Skill Name',
        columns='Tags',
        values='count',
        fill_value=0,
    )

    # Make sure both declining columns exist before summing them.
    declining_cols = ['Declining Sectors', 'Declining Sectors with net zero transition']
    absent_declining = [col for col in declining_cols if col not in skill_pivot.columns]
    for col in absent_declining:
        skill_pivot[col] = 0

    skill_pivot['Total Declining Count'] = skill_pivot[declining_cols].sum(axis=1)

    # Same safeguard for the green column used as the primary sort key.
    if 'Green Sectors' not in skill_pivot.columns:
        skill_pivot['Green Sectors'] = 0

    # Highest green count first; ties broken by the lowest declining total.
    final_cols = ['Green Sectors', 'Total Declining Count', *declining_cols]
    green_vs_declining_df = (
        skill_pivot
        .sort_values(by=['Green Sectors', 'Total Declining Count'], ascending=[False, True])
        [final_cols]
        .reset_index()
    )

    output_filename_2 = "green_vs_declining_skills.csv"
    green_vs_declining_df.to_csv(output_filename_2, index=False)
    print(f"✅ Analysis 2 complete. Results saved to '{output_filename_2}'")

    print("\nScript finished successfully!")

except FileNotFoundError as e:
    print("❌ Error: A file was not found. Please ensure all required files are in the correct directory.")
    print(f"Missing file: {e.filename}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Step 1: Loading and preparing data...
Data preparation complete.

Step 2: Performing Analysis 1 - Top 10 Percentile Skills per Category...
✅ Analysis 1 complete. Results saved to 'top_10_percentile_skills.csv'

Step 3: Performing Analysis 2 - Green vs. Declining Skills...
✅ Analysis 2 complete. Results saved to 'green_vs_declining_skills.csv'

Script finished successfully!


In [None]:
import pandas as pd
import json
import warnings

# Ignore warnings whose message begins with the text below.
warnings.filterwarnings(
    "ignore",
    message="Passing a string to a function is deprecated",
)


def extract_l2_skills(json_string):
    """
    Collect the unique L2 skill IDs ('l2id') from a skill_hierarchy JSON string.

    Each element of the decoded JSON is treated as an L1 entry. Children are
    read from its 'children' value when that key is present and non-empty,
    and every child carrying an 'l2id' key contributes that value.

    Returns a list without duplicates (order unspecified), or an empty list
    if decoding fails or a TypeError occurs while walking the structure.
    """
    try:
        decoded = json.loads(json_string)
        unique_ids = {
            l2_entry['l2id']
            for l1_entry in decoded
            if 'children' in l1_entry and l1_entry['children']
            for l2_entry in l1_entry['children']
            if 'l2id' in l2_entry
        }
    except (json.JSONDecodeError, TypeError):
        return []
    return list(unique_ids)

try:
    # --- Data Preparation ---
    print("Step 1: Loading and preparing data...")

    jobs_df = pd.read_csv("processed_jobs_dataset_with_tags.csv")
    with open('skills_schema.json', 'r') as f:
        skills_schema = json.load(f)
    skills_df = pd.DataFrame(skills_schema)

    # --- Count and Print Number of Jobs per Category ---
    print("\n--- Number of Unique Jobs per Category ---")
    # Drop rows where 'Tags' is missing so they are not part of any count.
    jobs_df.dropna(subset=['Tags'], inplace=True)
    job_counts = jobs_df.groupby('Tags')['Uniquecode'].nunique()
    print(job_counts)
    print("----------------------------------------")


    # Lookup from skill id to display name, restricted to level-2 schema rows.
    l2_skills_map = skills_df[skills_df['level'] == 2].set_index('id')['levelName'].to_dict()

    # One row per (job row, L2 skill id).
    jobs_df['l2_skill_ids'] = jobs_df['skill_hierarchy'].apply(extract_l2_skills)
    exploded_df = jobs_df.explode('l2_skill_ids')

    exploded_df.dropna(subset=['l2_skill_ids', 'Tags'], inplace=True)
    exploded_df = exploded_df[exploded_df['l2_skill_ids'] != '']

    # Translate ids to names; ids missing from the lookup are dropped.
    exploded_df['Skill Name'] = exploded_df['l2_skill_ids'].map(l2_skills_map)
    exploded_df.dropna(subset=['Skill Name'], inplace=True)
    print("\nStep 1: Data preparation complete.")

    # --- Analysis 1: Top 20 Skills per Category ---
    print("\nStep 2: Performing Analysis 1 - Top 20 Skills per Category...")

    # Number of distinct jobs ('Uniquecode') per (category, skill); the
    # resulting count column is named 'job_count'.
    skill_counts = exploded_df.groupby(['Tags', 'Skill Name'])['Uniquecode'].nunique().reset_index(name='job_count')
    top_skills_list = []
    categories = skill_counts['Tags'].unique()

    for category in categories:
        category_skills = skill_counts[skill_counts['Tags'] == category].copy()

        # Rank by 'job_count' (the column created above; sorting on 'count'
        # raised a KeyError) and keep the 20 highest-ranked skills.
        top_20_skills = category_skills.sort_values('job_count', ascending=False).head(20)
        top_skills_list.append(top_20_skills)

    top_20_skills_df = pd.concat(top_skills_list)
    output_filename_1 = "top_20_skills_per_category_n.csv"
    top_20_skills_df.to_csv(output_filename_1, index=False)
    print(f"✅ Analysis 1 complete. Results saved to '{output_filename_1}'")


    # --- Analysis 2: Green vs. Declining Skills ---
    print("\nStep 3: Performing Analysis 2 - Green vs. Declining Skills...")

    # Skills as rows, categories as columns, job counts as cells (0 where
    # absent). The values column must match the name used in skill_counts.
    skill_pivot = skill_counts.pivot_table(
        index='Skill Name',
        columns='Tags',
        values='job_count',
        fill_value=0
    )

    # Make sure both declining columns exist before summing them.
    declining_cols = ['Declining Sectors', 'Declining Sectors with net zero transition']
    for col in declining_cols:
        if col not in skill_pivot.columns:
            skill_pivot[col] = 0

    skill_pivot['Total Declining Count'] = skill_pivot[declining_cols].sum(axis=1)

    if 'Green Sectors' not in skill_pivot.columns:
        skill_pivot['Green Sectors'] = 0

    # Highest green count first; ties broken by the lowest declining total.
    green_vs_declining_df = skill_pivot.sort_values(
        by=['Green Sectors', 'Total Declining Count'],
        ascending=[False, True]
    )

    final_cols = ['Green Sectors', 'Total Declining Count'] + declining_cols
    green_vs_declining_df = green_vs_declining_df[final_cols].reset_index()

    # Actually write the file so the success message below is true.
    output_filename_2 = "green_vs_declining_skills_n.csv"
    green_vs_declining_df.to_csv(output_filename_2, index=False)
    print(f"✅ Analysis 2 complete. Results saved to '{output_filename_2}'")

    print("\nScript finished successfully!")

except FileNotFoundError as e:
    print(f"❌ Error: A file was not found. Please ensure all required files are in the correct directory.")
    print(f"Missing file: {e.filename}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Step 1: Loading and preparing data...

--- Number of Unique Jobs per Category ---
Tags
Declining Sectors                              623
Declining Sectors with net zero transition     207
Expanding Sectors                             1193
Green Sectors                                   56
Not Found                                        1
Name: Uniquecode, dtype: int64
----------------------------------------

Step 1: Data preparation complete.

Step 2: Performing Analysis 1 - Top 20 Skills per Category...
An unexpected error occurred: 'count'


In [None]:
import pandas as pd
import json
import warnings

# Filter out warnings whose message starts with the text below.
warnings.filterwarnings(
    "ignore",
    message="Passing a string to a function is deprecated",
)


def extract_l2_skills(json_string):
    """
    Parse a skill_hierarchy JSON string and list its distinct 'l2id' values.

    The decoded value is iterated as L1 entries. An entry is inspected only
    when it has a truthy 'children' value, and each child that contains an
    'l2id' key adds that id to the result.

    Returns the ids as a de-duplicated list in no particular order. Returns
    an empty list when the string is not valid JSON or when a TypeError is
    raised while reading it (e.g. the input is not a string).
    """
    seen_ids = set()
    try:
        for l1_entry in json.loads(json_string):
            # Entries without usable children contribute nothing.
            if 'children' not in l1_entry or not l1_entry['children']:
                continue
            for l2_entry in l1_entry['children']:
                if 'l2id' in l2_entry:
                    seen_ids.add(l2_entry['l2id'])
    except (json.JSONDecodeError, TypeError):
        return []
    return list(seen_ids)

try:
    # --- Data Preparation ---
    print("Step 1: Loading and preparing data...")

    jobs_df = pd.read_csv("processed_jobs_dataset_with_tags.csv")
    with open('skills_schema.json', 'r') as f:
        skills_schema = json.load(f)
    skills_df = pd.DataFrame(skills_schema)

    # --- Count and Print Number of Jobs per Category ---
    print("\n--- Number of Unique Jobs per Category ---")
    # Untagged rows are removed first so they are not part of any count.
    jobs_df.dropna(subset=['Tags'], inplace=True)
    job_counts = jobs_df.groupby('Tags')['Uniquecode'].nunique()
    print(job_counts)
    print("----------------------------------------")

    # Lookup from skill id to display name, restricted to level-2 schema rows.
    level_2_rows = skills_df[skills_df['level'] == 2]
    l2_skills_map = level_2_rows.set_index('id')['levelName'].to_dict()

    # One row per (job row, L2 skill id).
    jobs_df['l2_skill_ids'] = jobs_df['skill_hierarchy'].apply(extract_l2_skills)
    exploded_df = jobs_df.explode('l2_skill_ids')

    # Discard rows with no skill id or no tag, and blank skill ids.
    exploded_df.dropna(subset=['l2_skill_ids', 'Tags'], inplace=True)
    has_skill_id = exploded_df['l2_skill_ids'] != ''
    exploded_df = exploded_df[has_skill_id]

    # Translate ids to names; ids missing from the lookup are dropped.
    exploded_df['Skill Name'] = exploded_df['l2_skill_ids'].map(l2_skills_map)
    exploded_df.dropna(subset=['Skill Name'], inplace=True)
    print("\nStep 1: Data preparation complete.")

    # --- Analysis 1: Top Skills per Category ---
    print("\nStep 2: Performing Analysis 1 - Top 20 Skills per Category...")

    # Number of distinct jobs ('Uniquecode') per (category, skill).
    skill_counts = (
        exploded_df
        .groupby(['Tags', 'Skill Name'])['Uniquecode']
        .nunique()
        .reset_index(name='job_count')
    )

    categories = skill_counts['Tags'].unique()

    # NOTE(review): the messages and file name say "top 20", but 85 rows are
    # kept per category here; confirm which is intended.
    top_skills_list = [
        skill_counts[skill_counts['Tags'] == category]
        .sort_values('job_count', ascending=False)
        .head(85)
        for category in categories
    ]

    top_20_skills_df = pd.concat(top_skills_list)
    output_filename_1 = "top_20_skills_per_category_n.csv"
    top_20_skills_df.to_csv(output_filename_1, index=False)
    print(f"✅ Analysis 1 complete. Results saved to '{output_filename_1}'")


    # --- Analysis 2: Green vs. Declining Skills ---
    print("\nStep 3: Performing Analysis 2 - Green vs. Declining Skills...")

    # Skills as rows, categories as columns, job counts as cells (0 where absent).
    skill_pivot = pd.pivot_table(
        skill_counts,
        index='Skill Name',
        columns='Tags',
        values='job_count',
        fill_value=0,
    )

    # Make sure both declining columns exist before summing them.
    declining_cols = ['Declining Sectors', 'Declining Sectors with net zero transition']
    absent_declining = [col for col in declining_cols if col not in skill_pivot.columns]
    for col in absent_declining:
        skill_pivot[col] = 0

    skill_pivot['Total Declining Count'] = skill_pivot[declining_cols].sum(axis=1)

    # Same safeguard for the green column used as the primary sort key.
    if 'Green Sectors' not in skill_pivot.columns:
        skill_pivot['Green Sectors'] = 0

    # Highest green count first; ties broken by the lowest declining total.
    final_cols = ['Green Sectors', 'Total Declining Count', *declining_cols]
    green_vs_declining_df = (
        skill_pivot
        .sort_values(by=['Green Sectors', 'Total Declining Count'], ascending=[False, True])
        [final_cols]
        .reset_index()
    )

    output_filename_2 = "green_vs_declining_skills_n.csv"
    green_vs_declining_df.to_csv(output_filename_2, index=False)
    print(f"✅ Analysis 2 complete. Results saved to '{output_filename_2}'")

    print("\nScript finished successfully!")

except FileNotFoundError as e:
    print("❌ Error: A file was not found. Please ensure all required files are in the correct directory.")
    print(f"Missing file: {e.filename}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Step 1: Loading and preparing data...

--- Number of Unique Jobs per Category ---
Tags
Declining Sectors                              623
Declining Sectors with net zero transition     207
Expanding Sectors                             1193
Green Sectors                                   56
Not Found                                        1
Name: Uniquecode, dtype: int64
----------------------------------------

Step 1: Data preparation complete.

Step 2: Performing Analysis 1 - Top 20 Skills per Category...
✅ Analysis 1 complete. Results saved to 'top_20_skills_per_category_n.csv'

Step 3: Performing Analysis 2 - Green vs. Declining Skills...
✅ Analysis 2 complete. Results saved to 'green_vs_declining_skills_n.csv'

Script finished successfully!


Once this is done, I then want to group the rows in this Parquet file by Tags1 and then by Correct Sector Matched1.
After this, I want to have a function which can take 2 Correct Sector Matched values and calculate the Mean and Median between all the rows (each row giving one value i.e. Composite Similarity) where Correct Sector Matched1 is the first argument and Correct Sector Matched2 is the second argument. This function searches for all the pairs and whichever has the same 2 column values, is being used to find the mean and median. This returns in the format (mean, median). Now, I want to use this function in my analysis, for all the rows having Correct Sector Matched1 as "Declining Sectors with net zero transition" or "Declining Sectors" and Correct Sector Matched2 as "Green Sectors" or "Expanding Sectors" (interchanged as well meaning Correct Sector Matched1 as "Green Sectors" or "Expanding Sectors" and Correct Sector Matched2 as "Declining Sectors with net zero transition" or "Declining Sectors"), for all these rows I want to use and then make a csv file which contains columns Tags1, Correct Sector Matchedx, Correct Sector Matchedy, Tags2, Average Similarity, Median Similarity. Here, I should be having

I have a csv named "sector_subsector_all.csv" which contains 2643 rows and a column named "sectors_subsectors" which contains datapoints in the below format i.e. """sectors_subsectors

{"sectorID": "39", "sectorName": "IT-ITeS", "subSectors": [{"subSectorID": "3907", "subSectorName": "Future Skills"}]}

{"sectorID": "32", "sectorName": "Retail", "subSectors": [{"subSectorID": "3203", "subSectorName": "Retail Business"}]}

{"sectorID": "24", "sectorName": "Media & Entertainment", "subSectors": [{"subSectorID": "2407", "subSectorName": "Advertising"}]}

{"sectorID": "38", "sectorName": "Tourism & Hospitality", "subSectors": [{"subSectorID": "3804", "subSectorName": "Hotels"}]}

{"sectorID": "24", "sectorName": "Media & Entertainment", "subSectors": [{"subSectorID": "2405", "subSectorName": "Television"}, {"subSectorID": "2410", "subSectorName": "Print"}, {"subSectorID": "2406", "subSectorName": "Radio"}, {"subSectorID": "2411", "subSectorName": "Digital"}, {"subSectorID": "2412", "subSectorName": "Out-of-home"}]}

"""

I want to add 6 new columns to it: Sector Name (the sectorName extracted from each datapoint), sectorID, subSectorID, and subSectorName (these 4 are simply extracted from the column given above).
The next 2 columns are "Correct Sector Matched" and "Tags", which are taken from a different CSV named SUBSECTOR CATEGORIES.csv. That file has a "subsector_id" column (used to match against subSectorID in our main CSV) along with "Correct Sector Matched" and "Tags" columns, whose values should be copied for each matching row. Give me the final Python code to do this.

In [None]:
import pandas as pd
import json

# --- 1. Load the CSV files ---
try:
    # main_df carries the raw 'sectors_subsectors' JSON strings;
    # lookup_df is the category/tag lookup table.
    main_df = pd.read_csv('sector_subsector_all.csv')
    lookup_df = pd.read_csv('SUBSECTOR CATEGORIES.csv')
except FileNotFoundError as e:
    print(f"❌ Error: {e}. Please ensure both CSV files are in the correct directory.")
    exit()
else:
    print("✅ Files loaded successfully.")

# --- 2. Parse the JSON data from the 'sectors_subsectors' column ---
# One output record is produced per sub-sector, so an input row listing
# several sub-sectors becomes several records and a row with no
# sub-sectors produces none.
processed_rows = []

for index, row in main_df.iterrows():
    try:
        data = json.loads(row['sectors_subsectors'])

        # Sector-level values shared by every sub-sector of this row.
        sector_fields = {
            'sectorID': data.get('sectorID'),
            'Sector Name': data.get('sectorName'),
        }

        # Each record is the original row's data plus the extracted fields.
        for sub_sector in data.get('subSectors') or []:
            processed_rows.append({
                **row.to_dict(),
                **sector_fields,
                'subSectorID': sub_sector.get('subSectorID'),
                'subSectorName': sub_sector.get('subSectorName'),
            })
    except (json.JSONDecodeError, TypeError):
        # The cell was empty, NaN, or not in the expected format.
        print(f"⚠️ Warning: Could not parse data in row {index}. Skipping.")

# Build the "exploded" frame: one row per sub-sector.
expanded_df = pd.DataFrame(processed_rows)
print(f"📊 Data has been expanded. Original rows: {len(main_df)}, New rows: {len(expanded_df)}.")


# --- 3. Merge with the lookup table to get the final two columns ---

# Cast both join keys to str so the match compares values of the same type.
expanded_df['subSectorID'] = expanded_df['subSectorID'].astype(str)
lookup_df['subsector_id'] = lookup_df['subsector_id'].astype(str)

# Left join: every expanded row is kept and gains 'Correct Sector Matched'
# and 'Tags' where its sub-sector id is found in the lookup. The lookup's
# own key column is dropped afterwards because it duplicates 'subSectorID'.
lookup_columns = lookup_df[['subsector_id', 'Correct Sector Matched', 'Tags']]
final_df = (
    expanded_df
    .merge(lookup_columns, how='left', left_on='subSectorID', right_on='subsector_id')
    .drop(columns=['subsector_id'])
)


# --- 4. Save the result to a new CSV file ---

# Column order for the output: the original columns first, followed by the
# 6 new ones (skipping any new name that already exists in the original).
original_cols = main_df.columns.tolist()
new_cols = ['sectorID', 'Sector Name', 'subSectorID', 'subSectorName', 'Correct Sector Matched', 'Tags']
final_columns = original_cols + [col for col in new_cols if col not in original_cols]

# Reorder the dataframe and write it out. The output goes to a separate
# file (the input CSV is not overwritten), and the same name is reused in
# the message below so it always reports the file that was written.
final_df = final_df[final_columns]
output_filename = 'sector_subsector_all_no_skill_hierarchy.csv'
final_df.to_csv(output_filename, index=False)

print(f"\n🎉 Success! The data with the 6 new columns has been saved to '{output_filename}'.")
print("\nHere's a preview of the final data:")
print(final_df.head())

✅ Files loaded successfully.
📊 Data has been expanded. Original rows: 2642, New rows: 3261.

🎉 Success! The file 'sector_subsector_all.csv' has been updated with the 6 new columns.

Here's a preview of the final data:
  nco_clean                                            sectors  nco_3digit  \
0       5.5        {"sectorID": "39", "sectorName": "IT-ITeS"}         6.0   
1    1120.2  {"sectorID": "32", "sectorName": "Retail", "su...       112.0   
2   1120.34  {"sectorID": "24", "sectorName": "Media & Ente...       112.0   
3    1120.3  {"sectorID": "38", "sectorName": "Tourism & Ho...       112.0   
4   1120.34  {"sectorID": "24", "sectorName": "Media & Ente...       112.0   

          qpCode                                         occupation  version  \
0  NIE/SSC/Q1801  {"occupationID": "70", "occupationDesc": "Bioi...      1.0   
1      RAS/Q0201  {"occupationID": "2", "occupationDesc": "Consu...      4.0   
2      MES/Q0207  {"occupationID": "2", "occupationDesc": "Ad Sa...      

SyntaxError: invalid syntax (ipython-input-680513432.py, line 1)

In [None]:
import pandas as pd

# Input/output paths are named once so that every message below refers to the
# file that is actually read or written.
input_filename = 'job_similarity_matrix_with_tags.parquet'
output_filename = 'job_similarity_matrix_updated.parquet'

# Columns describing the first job in each pair.
cols_1 = [
    'Uniquecode1', 'jobRole1', 'jobRoleDesc1', 'sector_id1', 'sector_name1',
    'subsector_id1', 'subsector_name1', 'nsqfLevel1',
    'Correct Sector Matched_1', 'Tags_1'
]

# Columns describing the second job in each pair; positionally aligned with
# cols_1 so that the i-th entries are swapped with each other.
cols_2 = [
    'Uniquecode2', 'jobRole2', 'jobRoleDesc2', 'sector_id2', 'sector_name2',
    'subsector_id2', 'subsector_name2', 'nsqfLevel2',
    'Correct Sector Matched_2', 'Tags_2'
]

declining_tags = ['Declining Sectors', 'Declining Sectors with net zero transition']


def swap_column_groups(frame, mask, left_cols, right_cols):
    """Exchange two aligned groups of columns on the rows selected by *mask*.

    Args:
        frame: DataFrame to modify in place.
        mask: Boolean Series (aligned with ``frame``) selecting rows to swap.
        left_cols: Column names of the first group.
        right_cols: Column names of the second group, in the same order as
            ``left_cols``; the i-th column of each group is exchanged.

    Returns:
        None. ``frame`` is mutated in place.
    """
    # Snapshot both sides before writing, otherwise the second assignment
    # would read values the first one has already overwritten.
    left_values = frame.loc[mask, left_cols].to_numpy(copy=True)
    right_values = frame.loc[mask, right_cols].to_numpy(copy=True)

    frame.loc[mask, left_cols] = right_values
    frame.loc[mask, right_cols] = left_values


try:
    # Step 1: Load the Parquet file.
    df = pd.read_parquet(input_filename)

    # Step 2: Decide which rows need their two jobs interchanged.

    # Condition 1: a declining job is in the second position and a
    # non-declining job is in the first.
    condition1 = (
        df['Tags_2'].isin(declining_tags) &
        ~df['Tags_1'].isin(declining_tags)
    )

    # Condition 2: both are declining but in the wrong order; after the swap
    # 'Declining Sectors' sits in Tags_1 and
    # 'Declining Sectors with net zero transition' in Tags_2.
    condition2 = (
        (df['Tags_1'] == 'Declining Sectors with net zero transition') &
        (df['Tags_2'] == 'Declining Sectors')
    )

    # NOTE: the original request was phrased in reverse; the logic here was
    # deliberately adjusted so the 'higher-order' declining sector ends up in
    # the Tags_1 column.
    rows_to_swap = condition1 | condition2

    print(f"Found {rows_to_swap.sum()} rows to update based on your rules.")

    # Step 3: Perform the swap on the selected rows only.
    if rows_to_swap.any():
        swap_column_groups(df, rows_to_swap, cols_1, cols_2)
        print("Successfully interchanged the data.")
    else:
        print("No rows met the conditions for swapping.")

    # Step 4: Save the updated DataFrame to a new Parquet file.
    df.to_parquet(output_filename, index=False)

    print(f"\n✅ Successfully saved the updated data to '{output_filename}'")

except FileNotFoundError:
    # Report the file that was actually requested (the message previously
    # named a different file than the one passed to read_parquet).
    print(f"❌ Error: '{input_filename}' not found.")
    print("Please make sure the file is available.")
except KeyError as e:
    print(f"❌ Error: A required column was not found in the file: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Found 874027 rows to update based on your rules.
Successfully interchanged the data.

✅ Successfully saved the updated data to 'job_similarity_matrix_updated.parquet'
