In [9]:
import json

with open("../skill_mappings/skill_mapping.json") as f1:
    json1 = json.load(f1)

with open("../skill_mappings/final_processed.json") as f2:
    json2 = json.load(f2)

keys1 = set(json1.keys())
keys2 = set(json2.keys())

only_in_1 = keys1 - keys2
only_in_2 = keys2 - keys1
common_keys = keys1 & keys2

print("✅ Common Keys:", common_keys)
print("❌ Only in JSON 1:", only_in_1)
print("❌ Only in JSON 2:", only_in_2)


✅ Common Keys: {'financial reporting', 'administrative tasks', 'relational database', 'nodejs', 'identity and access management', 'risk management framework (rmf)', 'hardware procurement', 'software asset management', 'security frameworks', 'performance tuning', 'nosql databases', 'problem identification', 'cyber security', 'business process analysis', 'network virtualization', 'service delivery', 'api testing', 'infrastructure as code', 'it infrastructure', 'sap bw', 'software design principles', 'software development life cycle (sdlc)', 'rest apis', 'build tools', 'microsoft office suite', 'authentication and authorization', 'software design and implementation', 'cisco ise', 'network storage', 'healthcare information systems', 'network protocols', 'software engineering tools', 'business intelligence (bi)', 'business intelligence tools', 'databases', 'semantic web', 'remote monitoring', 'remote sensing', 'requirements elicitation', 'software development lifecycle', 'erp software', 'in

In [10]:


non_common_json = {}     # unique + identical pairs
json1_conflicts = {}    # only conflicting keys from file1
json2_conflicts = {}    # only conflicting keys from file2

# ✅ Keys only in file1
for k in keys1 - keys2:
    non_common_json[k] = json1[k]

# ✅ Keys only in file2
for k in keys2 - keys1:
    non_common_json[k] = json2[k]

# ✅ Keys in both
for k in keys1 & keys2:
    if json1[k] == json2[k]:
        # ✅ identical key-value → goes to non_common_keys.json
        non_common_json[k] = json1[k]
    else:
        # ✅ conflicting key-value → stays in separate files
        json1_conflicts[k] = json1[k]
        json2_conflicts[k] = json2[k]

# ✅ Save results
with open("../skill_mappings/merged_with_final.json", "w") as f:
    json.dump(non_common_json, f, indent=4)

with open("../skill_mappings/nm1_common.json", "w") as f:
    json.dump(json1_conflicts, f, indent=4)

with open("../skill_mappings/nm2_common.json", "w") as f:
    json.dump(json2_conflicts, f, indent=4)

print("✅ Created:")
print(" - non_common_keys.json  (unique + identical pairs)")
print(" - file1_common.json    (only conflicting keys)")
print(" - file2_common.json    (only conflicting keys)")


✅ Created:
 - non_common_keys.json  (unique + identical pairs)
 - file1_common.json    (only conflicting keys)
 - file2_common.json    (only conflicting keys)


In [5]:
import json

# Load input JSON
with open("../skill_mappings/skill_mapping.json") as f:
    data = json.load(f)

matched = {}      # Will contain a:b and b:x together
remaining = {}    # Everything else

keys = set(data.keys())
used_keys = set()  # Track keys already placed in matched

for k, v in data.items():
    # Condition: k -> v AND v is also a key (forming a chain)
    if isinstance(v, str) and v in keys:
        matched[k] = data[k]     # a:b
        matched[v] = data[v]     # b:x
        used_keys.add(k)
        used_keys.add(v)

# Everything not used in matched goes to remaining
for k, v in data.items():
    if k not in used_keys:
        remaining[k] = v

# ✅ Save outputs
with open("chained_mappings.json", "w") as f:
    json.dump(matched, f, indent=4)

with open("skill_mapping_final.json", "w") as f:
    json.dump(remaining, f, indent=4)

print("✅ Created:")
print(" - chained_mappings.json   (a:b and b:x together)")
print(" - remaining_mappings.json (normal key-value pairs)")


✅ Created:
 - chained_mappings.json   (a:b and b:x together)
 - remaining_mappings.json (normal key-value pairs)


In [10]:
import json

# Load input JSON
# with open("input.json") as f:
#     data = json.load(f)

file_name = "../skill_mappings/skill_mapping.json"
# Load JSON
with open(file_name, "r") as f:
    data = json.load(f)
    
matched = {}      # Will contain a:b and b:x together
remaining = {}    # Everything else

keys = set(data.keys())
used_keys = set()

# Detect chained mappings
for k, v in data.items():
    if isinstance(v, str) and v in keys:
        matched[k] = data[k]     # a:b
        matched[v] = data[v]     # b:x
        used_keys.add(k)
        used_keys.add(v)

# Everything not used goes to remaining
for k, v in data.items():
    if k not in used_keys:
        remaining[k] = v

# ✅ SAVE CHAINED AS JSONL (ONE PER LINE)
with open("chained_mappings.jsonl", "w") as f:
    for k, v in matched.items():
        line = {k: v}
        f.write(json.dumps(line) + "\n")

# ✅ SAVE REMAINING AS NORMAL JSON
with open("remaining_mappings.json", "w") as f:
    json.dump(remaining, f, indent=4)

print("✅ Created:")
print(" - chained_mappings.jsonl  (one mapping per line)")
print(" - remaining_mappings.json (normal JSON)")


✅ Created:
 - chained_mappings.jsonl  (one mapping per line)
 - remaining_mappings.json (normal JSON)


In [5]:
import json
#key==value removed (redundant mappings)
file_name = "../skill_mappings/nm2_common.json"
# Load JSON
with open(file_name, "r") as f:
    data = json.load(f)

# Remove key-value pairs where key == value
data = {k: v for k, v in data.items() if k != v}

# Save back to the same file
with open(file_name, "w") as f:
    json.dump(data, f, indent=4)

print(f"✅ Updated {file_name}: removed key==value pairs")


✅ Updated ../skill_mappings/nm2_common.json: removed key==value pairs


In [None]:
import json

# Load JSON
file_name = "../skill_mappings/skill_mapping.json"
# Load JSON
with open(file_name, "r") as f:
    data = json.load(f)

keys = set(data.keys())
used = set()
chains = []
remaining = {}

# Build chains efficiently
for k in data:
    if k in used:
        continue
    chain = []
    current = k
    while isinstance(data.get(current), str) and data[current] in keys and current not in used:
        chain.append((current, data[current]))
        used.add(current)
        current = data[current]
    if chain:
        chains.append(chain)
    elif k not in used:
        remaining[k] = data[k]

# Add anything left to remaining
for k, v in data.items():
    if k not in used:
        remaining[k] = v

# Save chains in readable format
with open("chained_mappings.json", "w") as f:
    f.write("{\n")
    for chain in chains:
        for k, v in chain:
            f.write(f'    "{k}": "{v}",\n')
        f.write("\n")
    f.write("}\n")

# Save remaining normally
with open("remaining_mappings.json", "w") as f:
    json.dump(remaining, f, indent=4)

print("✅ Fast JSON files created with grouped chains")


In [None]:
import json

# Load JSON
file_name = "../skill_mappings/skill_mapping.json"
# Load JSON
with open(file_name, "r") as f:
    data = json.load(f)

# Separate irrelevant keys
irrelevant_keys = ["irrelevant_skill", "irrelevant_skills"]
irrelevant = {}
remaining = {}

for k, v in data.items():
    if v in irrelevant_keys:
        irrelevant[k] = v
    else:
        remaining[k] = v

# Save irrelevant keys to new JSON
with open("../skill_mappings/irrelevant.json", "w") as f:
    json.dump(irrelevant, f, indent=4)

# Save the updated original JSON without irrelevant keys
with open("../skill_mappings/skill_mapping_final.json", "w") as f:
    json.dump(remaining, f, indent=4)

print(f"✅ Created 'irrelevant.json' ({len(irrelevant)} entries)")
print(f"✅ Created 'updated_input.json' without irrelevant keys")


✅ Created 'irrelevant.json' (1423 entries)
✅ Created 'updated_input.json' without irrelevant keys


In [8]:
import json

# Load JSON
file_name = "../skill_mappings/skill_mapping_final.json"
# Load JSON
with open(file_name, "r") as f:
    data = json.load(f)

# Prepare dictionaries
matching = {}
remaining = {}

# Case-insensitive search
for k, v in data.items():
    if ("degree" in k.lower() or "education" in k.lower() or
        (isinstance(v, str) and ("degree" in v.lower() or "education" in v.lower()))):
        matching[k] = v
    else:
        remaining[k] = v


# Save irrelevant keys to new JSON
with open("../skill_mappings/education_degree.json", "w") as f:
    json.dump(matching, f, indent=4)

# Save the updated original JSON without irrelevant keys
with open("../skill_mappings/skill_mapping.json", "w") as f:
    json.dump(remaining, f, indent=4)

print(f"✅ Created 'irrelevant.json' ({len(irrelevant)} entries)")
print(f"✅ Created 'updated_input.json' without irrelevant keys")


✅ Created 'irrelevant.json' (1423 entries)
✅ Created 'updated_input.json' without irrelevant keys


In [None]:
strings=list(df_tech_bin_filtered.columns)[:]
from collections import Counter

first_words = [s.split()[0] for s in strings]
counts = Counter(first_words)

print(counts)


prefix = "project"
map_value = "project management"

matches = [s for s in strings if s.startswith(prefix)]

print(f"Strings starting with '{prefix}':")
for m in matches:
    print(" -", m)

mapping = {s: map_value for s in matches if s.startswith(prefix)}

mapping
# Mapping dictionary
data_mapping1 = {
  "project": "project management",
  "it": "it"

}

data_mapping=ADDITIONAL_SKILLS3
# Generate mapping for strings that start with any key in data_mapping
mapped_strings = {}

for s in strings:
    for key, value in data_mapping.items():
        if s.lower().startswith(key.lower()):  # case-insensitive match
            mapped_strings[s] = value
            # break  # stop after first match

# Output
# mapped_strings
with open("new_mappings4.json", "w") as f:
    json.dump(mapped_strings, f, indent=4)

print("Dictionary saved as data_mapping.json")
import pandas as pd
import json

# Your dictionary
skill_mappings = NEW_MAPPINGS
file_name = "new_mappings3.json"

# Create DataFrame with all operations
df_final = (pd.DataFrame(list(skill_mappings.items()), columns=['skill', 'mapping'])
            .drop_duplicates()
            .sort_values(by='skill')
            .reset_index(drop=True))

print("Final DataFrame:")
print(df_final)

# Convert to dictionary format {skill: mapping}
skill_mapping_dict = df_final.set_index('skill')['mapping'].to_dict()

# Save to JSON
with open(file_name, 'w') as f:
    json.dump(skill_mapping_dict, f, indent=2)

print(f'\n✓ JSON file saved as {file_name}')

# # Display the JSON content
# with open(file_name, 'r') as f:
#     print("\nJSON Content:")
#     print(f.read())


In [None]:
strings=list(df_tech_bin_filtered.columns)[:]
from collections import Counter

first_words = [s.split()[0] for s in strings]
counts = Counter(first_words)

print(counts)



In [None]:


prefix = "project"
map_value = "project management"

matches = [s for s in strings if s.startswith(prefix)]

print(f"Strings starting with '{prefix}':")
for m in matches:
    print(" -", m)

mapping = {s: map_value for s in matches if s.startswith(prefix)}

mapping

In [None]:
# Mapping dictionary
data_mapping1 = {
  "project": "project management",
  "it": "it"

}

data_mapping=ADDITIONAL_SKILLS3
# Generate mapping for strings that start with any key in data_mapping
mapped_strings = {}

for s in strings:
    for key, value in data_mapping.items():
        if s.lower().startswith(key.lower()):  # case-insensitive match
            mapped_strings[s] = value
            # break  # stop after first match

# Output
# mapped_strings
with open("new_mappings4.json", "w") as f:
    json.dump(mapped_strings, f, indent=4)

print("Dictionary saved as data_mapping.json")

In [None]:
import pandas as pd
import json

# Your dictionary
skill_mappings = NEW_MAPPINGS
file_name = "new_mappings3.json"

# Create DataFrame with all operations
df_final = (pd.DataFrame(list(skill_mappings.items()), columns=['skill', 'mapping'])
            .drop_duplicates()
            .sort_values(by='skill')
            .reset_index(drop=True))

print("Final DataFrame:")
print(df_final)

# Convert to dictionary format {skill: mapping}
skill_mapping_dict = df_final.set_index('skill')['mapping'].to_dict()

# Save to JSON
with open(file_name, 'w') as f:
    json.dump(skill_mapping_dict, f, indent=2)

print(f'\n✓ JSON file saved as {file_name}')

# # Display the JSON content
# with open(file_name, 'r') as f:
#     print("\nJSON Content:")
#     print(f.read())

In [None]:
with open("new_mapping4.json", "r") as f:
    json1 = json.load(f)

with open("incorrect_ones.json", "r") as f:
    json2 = json.load(f)

In [None]:
import json

# Your original dictionaries
# json1 = {"a": 1, "b": 2, "c": 3, "d": 4}
# json2 = {"b": 5, "d": 6}

# Remove keys from json1 that exist in json2
filtered_json1 = {key: value for key, value in json1.items() if key not in json2}

# Combine filtered_json1 and json2
json3 = {**filtered_json1, **json2}

print(json3)  # Output: {'a': 1, 'c': 3, 'b': 5, 'd': 6}

# Save to a file
with open('new_mapping4_fixed.json', 'w') as f:
    json.dump(json3, f, indent=4)

print("Saved to new_mapping4_fixed.json")

In [None]:
# Role mapping dictionary
role_mapping = {
    'Software Engineer': ['software engineer', 'software developer', 'application developer'],
    'Backend Engineer': ['backend engineer', 'backend developer', 'backend software engineer', 'server-side engineer'],
    'Frontend Engineer': ['frontend engineer', 'frontend developer', 'web frontend developer', 'ui engineer'],
    'Full Stack Engineer': ['full-stack engineer', 'full-stack developer', 'full stack developer'],
    'Mobile Engineer': ['mobile engineer', 'mobile developer', 'mobile app developer', 'app developer', 'ios developer', 'android developer'],
    'DevOps Engineer': ['devops', 'devops engineer', 'devops developer', 'devops specialist', 'devops consultant'],
    'Site Reliability Engineer': ['sre', 'site reliability engineer', 'reliability engineer', 'cloud sre'],
    'Cloud Engineer': ['cloud engineer', 'cloud developer', 'cloud platform engineer', 'cloud solutions engineer', 'cloud infrastructure engineer', 'cloud systems engineer', 'cloud infrastructure architect', 'cloud system architect', 'cloud solutions architect', 'cloud architect'],
    'Data Engineer': ['data engineer', 'big data engineer'],
    'Data Scientist': ['data scientist', 'data analyst', 'business analyst', 'business intelligence analyst'],
    'ML Engineer': ['ml engineer', 'machine learning engineer', 'ml developer', 'mlops engineer'],
    'ML Researcher': ['ml researcher', 'ml research engineer', 'research scientist ml', 'ai researcher'],
    'AI Engineer': ['ai engineer', 'artificial intelligence engineer', 'applied ai engineer', 'applied scientist'],
    'Computer Vision Engineer': ['computer vision engineer', 'cv engineer', 'computer vision developer', 'image processing engineer'],
    'NLP Engineer': ['nlp engineer'],
    'Deep Learning Engineer': ['deep learning engineer'],
    'Research Scientist': ['research scientist'],
    'Data Product Manager': ['data product manager'],
    'Platform Engineer': ['platform engineer', 'platform developer', 'platform operations engineer'],
    'Infrastructure Engineer': ['infrastructure engineer', 'it infrastructure engineer'],
    'Systems Engineer': ['systems engineer', 'systems administrator'],
    'Embedded Software Engineer': ['embedded software engineer', 'embedded robotics engineer'],
    'Robotics Engineer': ['robotics engineer', 'robotics software engineer', 'autonomous systems engineer'],
    'Security Engineer': ['security engineer', 'cybersecurity engineer', 'cybersecurity analyst', 'information security engineer', 'it security engineer', 'security analyst'],
    'Network Engineer': ['network engineer'],
    'Game Developer': ['game developer', 'game software engineer', 'game programmer', 'unity developer', 'unreal engine developer'],
    'Simulation Engineer': ['simulation engineer', 'simulation software engineer', 'simulation developer', 'modeling engineer'],
    'AR/VR Engineer': ['ar/vr engineer'],
    'Blockchain Engineer': ['blockchain engineer'],
    'API Developer': ['api developer'],
    'React Developer': ['react developer'],
    'Angular Developer': ['angular developer'],
    'Product Manager': ['product manager'],
    'Technical Program Manager': ['technical program manager'],
    'Quantitative Analyst': ['quantitative analyst', 'operations analyst'],
    'IT Support Engineer': ['it support engineer', 'it support specialist', 'technical support engineer', 'desktop support engineer', 'it helpdesk'],
    'Automation Engineer': ['automation engineer']
}

In [None]:
# Create reverse mapping for quick lookup
title_to_consolidated = {}
for consolidated_role, variants in role_mapping.items():
    for variant in variants:
        title_to_consolidated[variant] = consolidated_role

# Map job titles to consolidated roles
skills_job_df['class_label'] = skills_job_df['job_title'].str.lower().map(title_to_consolidated)

# Remove rows that couldn't be mapped
skills_job_df_mapped = skills_job_df.dropna(subset=['class_label']).copy()

# Get skill columns (all columns except job_title and consolidated_role)
skill_columns = [col for col in skills_job_df_mapped.columns if col not in ['job_title', 'class_label']]

# Reorder columns: consolidated_role first, then skills
final_df = skills_job_df_mapped[['class_label'] + skill_columns].copy()

# Print summary
print(f"Total rows mapped: {len(final_df)}")
print(f"Total unique consolidated roles: {final_df['class_label'].nunique()}")
print(f"\nRole distribution:")
print(final_df['class_label'].value_counts())