In [28]:
import pandas as pd

# Read the three input tables from /content into DataFrames
skills_pivoted = pd.read_csv(filepath_or_buffer='/content/skills_pivoted.csv')
Users_Lang_Skills = pd.read_csv(filepath_or_buffer='/content/Users_Lang_Skills.csv')
staffing_inverted = pd.read_csv(filepath_or_buffer='/content/staffing_inverted.csv')

In [29]:
!pip install fuzzywuzzy python-Levenshtein



# Matching Required Skills, Preferred Skills, and Languages with FuzzyWuzzy, Using a Level Multiplier from JSON to Scale the Scores and Calculate a Weight

In [34]:
# Final Code Block
import pandas as pd
import json
from fuzzywuzzy import process

# Global configuration
JSON_FILE_PATH = "/content/Scrum_Master_features.json"  # Set your exact file path here

# Module-level state shared by the matching and scoring functions below.
# A `global` statement at module scope is a no-op and binds nothing, so the
# names are initialised explicitly here. prepare_required_skill_mapping()
# resets and repopulates all six of them on every call.
required_matches = []       # matched required skills with their levels
preferred_matches = []      # matched preferred skills with their levels
language_matches = []       # matched languages with their levels
required_skills_weight = 0  # sum of the levels of matched required skills
preferred_skills_weight = 0  # sum of the levels of matched preferred skills
language_weight = 0         # sum of the levels of matched languages

def _fuzzy_match_entries(entries, name_field, choices, score_cutoff, matched_key):
    """
    Fuzzy-match the name of each JSON entry against a list of known names.

    Args:
        entries: List of dicts read from the JSON file; each must contain
            ``name_field`` and, when it matches, ``"level"``.
        name_field: Key holding the raw name to match ("skill" or "language").
        choices: Candidate names to match against.
        score_cutoff: Minimum fuzzywuzzy score for a match to be kept.
        matched_key: Key under which the matched name is stored in the result.

    Returns:
        A list of ``{matched_key: <matched name>, "level": <entry level>}``
        dicts, one per entry that reached the cutoff, in input order.
    """
    matches = []
    for entry in entries:
        best = process.extractOne(entry[name_field], choices, score_cutoff=score_cutoff)
        if best:
            # extractOne yields (choice, score) here; only the choice is kept.
            matches.append({matched_key: best[0], "level": entry["level"]})
    return matches


def prepare_required_skill_mapping(known_skills: list) -> pd.DataFrame:
    """
    Fuzzy-match the entries of the JSON file at JSON_FILE_PATH and store the
    results in the module-level match lists and weights.

    Thresholds:
    - 90 for "Required Skills" (matched against ``known_skills``)
    - 85 for "Preferred Skills" (matched against ``known_skills``)
    - 90 for "Languages" (matched against the ``*_level`` column names of
      Users_Lang_Skills with "_level" removed)

    Each weight is the sum of the ``level`` values of the matched entries in
    its section. A section missing from the JSON is treated as empty.

    Args:
        known_skills: Candidate skill names for the required/preferred matching.

    Returns:
        DataFrame of the required-skill matches with columns
        ``matched_skill`` and ``level`` (empty, but with those columns, when
        nothing matched).

    Raises:
        FileNotFoundError: If JSON_FILE_PATH does not exist.
        KeyError: If an entry lacks its name key, or a matched entry lacks
            ``level``.
    """
    global required_matches, preferred_matches, language_matches
    global required_skills_weight, preferred_skills_weight, language_weight

    # Reset all globals up front so a failed call never leaves results from a
    # previous run behind.
    required_matches, preferred_matches, language_matches = [], [], []
    required_skills_weight = preferred_skills_weight = language_weight = 0

    try:
        with open(JSON_FILE_PATH, "r", encoding="utf-8") as f:
            parsed = json.load(f)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"JSON file not found at: {JSON_FILE_PATH}") from err

    # Process Required Skills (90 threshold)
    required_matches = _fuzzy_match_entries(
        parsed.get("Required Skills", []), "skill", known_skills, 90, "matched_skill"
    )
    required_skills_weight = sum(m["level"] for m in required_matches)

    # Process Preferred Skills (85 threshold)
    preferred_matches = _fuzzy_match_entries(
        parsed.get("Preferred Skills", []), "skill", known_skills, 85, "matched_skill"
    )
    preferred_skills_weight = sum(m["level"] for m in preferred_matches)

    # Process Languages (90 threshold)
    known_langs = [col.replace("_level", "") for col in Users_Lang_Skills.columns
                   if col.endswith("_level")]
    language_matches = _fuzzy_match_entries(
        parsed.get("Languages", []), "language", known_langs, 90, "matched_lang"
    )
    language_weight = sum(m["level"] for m in language_matches)

    # Explicit columns keep the schema stable when no required skill matched.
    return pd.DataFrame(required_matches, columns=["matched_skill", "level"])

def add_extracted_skill_scores_to_users() -> pd.DataFrame:
    """
    Build the per-user scoring table for required skills and languages.

    Starts from the USER_ID column of Users_Lang_Skills, adds the three
    aggregate weights as constant columns, then one score column per matched
    required skill (``req_<skill>_score``) and per matched language
    (``lang_<language>_score``). Each score is the user's ``<name>_level``
    value (missing values treated as 0) multiplied by the level stored with
    the match.

    Reads the module-level match lists and weights that
    prepare_required_skill_mapping() populates.
    """
    scores = Users_Lang_Skills[["USER_ID"]].copy()

    # Constant columns carrying the aggregate weights
    for column, weight in (
        ("weight_required_skills", required_skills_weight),
        ("weight_preferred_skills", preferred_skills_weight),
        ("weight_language", language_weight),
    ):
        scores[column] = weight

    # Required skills first, then languages, which fixes the column order
    score_groups = (
        (required_matches, "matched_skill", "req"),
        (language_matches, "matched_lang", "lang"),
    )
    for matches, name_key, prefix in score_groups:
        for entry in matches:
            name = entry[name_key]
            user_level = Users_Lang_Skills[f"{name}_level"].fillna(0)
            scores[f"{prefix}_{name}_score"] = user_level * entry["level"]

    return scores

def add_extracted_preferred_skill_scores_to_users(score_df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``score_df`` extended with preferred-skill scores.

    For every entry in the module-level ``preferred_matches`` list a column
    ``pref_<skill>_score`` is added, holding the user's ``<skill>_level``
    value from Users_Lang_Skills (missing values treated as 0) multiplied by
    the level stored with the match. The input frame is not modified.
    """
    enriched = score_df.copy()

    for entry in preferred_matches:
        skill_name = entry["matched_skill"]
        user_level = Users_Lang_Skills[f"{skill_name}_level"].fillna(0)
        enriched[f"pref_{skill_name}_score"] = user_level * entry["level"]

    return enriched

# Function Trial Use

In [35]:
# Known skill names: every column ending in "_level", with "_level" stripped out
known_skills = [
    column.replace("_level", "")
    for column in Users_Lang_Skills.columns
    if column.endswith("_level")
]

# Run the stages in order: fuzzy matching -> base scores -> preferred-skill scores
mapping_df = prepare_required_skill_mapping(known_skills)
base_scores = add_extracted_skill_scores_to_users()
final_scores = add_extracted_preferred_skill_scores_to_users(base_scores)

# Preview the first rows of the result
print(final_scores.head())

   USER_ID  weight_required_skills  weight_preferred_skills  weight_language  \
0  2843838                       9                        0                5   
1  2479537                       9                        0                5   
2  2533337                       9                        0                5   
3  2446382                       9                        0                5   
4  2433124                       9                        0                5   

   req_R_score  req_Atlassian JIRA Software_score  \
0          0.0                                0.0   
1          0.0                                0.0   
2          0.0                                0.0   
3         80.0                                0.0   
4          0.0                                0.0   

   req_Microsoft Power BI_score  lang_French_score  lang_English_score  
0                           0.0              300.0               200.0  
1                           0.0              300.0    

In [36]:
# Show the first five rows as the cell's rich output
final_scores.head(n=5)

Unnamed: 0,USER_ID,weight_required_skills,weight_preferred_skills,weight_language,req_R_score,req_Atlassian JIRA Software_score,req_Microsoft Power BI_score,lang_French_score,lang_English_score
0,2843838,9,0,5,0.0,0.0,0.0,300.0,200.0
1,2479537,9,0,5,0.0,0.0,0.0,300.0,180.0
2,2533337,9,0,5,0.0,0.0,0.0,300.0,100.0
3,2446382,9,0,5,80.0,0.0,160.0,300.0,120.0
4,2433124,9,0,5,0.0,0.0,0.0,300.0,200.0
