In [50]:
import pandas as pd

# Read the three input tables (skills matrix, user language/skill levels,
# monthly staffing) into module-level DataFrames used by the cells below.
skills_pivoted, Users_Lang_Skills, staffing_inverted = map(
    pd.read_csv,
    (
        '/content/skills_pivoted.csv',
        '/content/Users_Lang_Skills.csv',
        '/content/staffing_inverted.csv',
    ),
)

In [7]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


# Fuzzy Wuzzy Matching

In [136]:
import json
import pandas as pd
from fuzzywuzzy import process

def prepare_required_skill_mapping(selected_job: str, known_skills: list, threshold: int = 90) -> pd.DataFrame:
    """
    Build the required-skill lookup for a job and publish it as the global `mapping`.

    Reads "<selected_job with spaces as underscores>_features.json" from the
    working directory, normalises every entry of its "Required Skills" list
    (lower-case, trimmed, "-" and "_" turned into spaces) and keeps the best
    fuzzy match among `known_skills` whenever its score reaches `threshold`.

    Parameters:
        selected_job (str): Job title used to derive the JSON file name.
        known_skills (list): Candidate skill names to match against.
        threshold (int): Minimum fuzzy score for a match to be kept (default: 90).

    Returns:
        pd.DataFrame: Columns "Extracted Skill" and "Matched Skill", one row per kept match.

    Raises:
        FileNotFoundError: If the job's features JSON file does not exist.

    Side effects:
        Rebinds the module-level `mapping` dict ({normalised skill: matched skill});
        it is set to {} when the JSON lists no required skills.
    """
    global mapping

    result_columns = ["Extracted Skill", "Matched Skill"]

    file_path = f"{selected_job.replace(' ', '_')}_features.json"
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            parsed = json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"No JSON file found for selected job: {file_path}")

    required_skills = parsed.get("Required Skills", [])
    if not required_skills:
        print(f"⚠️ No 'Required Skills' found in JSON for {selected_job}")
        mapping = {}
        return pd.DataFrame(columns=result_columns)

    # Normalise all names up front so a malformed entry fails before `mapping` is reset.
    normalized_skills = [
        raw_skill.lower().strip().replace("-", " ").replace("_", " ")
        for raw_skill in required_skills
    ]

    mapping = {}
    for skill in normalized_skills:
        best = process.extractOne(skill, known_skills)
        if not best:
            continue
        candidate, score = best
        if score >= threshold:
            mapping[skill] = candidate

    return pd.DataFrame(list(mapping.items()), columns=result_columns)


# Extracting matched skill scores

In [137]:
def add_extracted_skill_scores_to_users() -> pd.DataFrame:
    """
    Build the base scoring frame from the global `Users_Lang_Skills` and `mapping`.

    The result starts with USER_ID and LAST_NAME and gains one
    "Req_<matched_skill>_score" column per matched skill in `mapping`.
    Each column copies "<matched_skill>_level" from Users_Lang_Skills with
    NaNs replaced by 0; when that level column does not exist the score
    column is filled with 0.

    Returns:
        pd.DataFrame: USER_ID, LAST_NAME and the required-skill score columns.
    """
    scores = Users_Lang_Skills[["USER_ID", "LAST_NAME"]].copy()
    available_columns = Users_Lang_Skills.columns

    # Only the matched (right-hand) side of the mapping is needed here.
    for matched_skill in mapping.values():
        level_col = f"{matched_skill}_level"
        score_col = f"Req_{matched_skill}_score"
        has_level = level_col in available_columns
        scores[score_col] = Users_Lang_Skills[level_col].fillna(0) if has_level else 0

    return scores




# Language Scores

In [138]:
import json
from fuzzywuzzy import process

def add_extracted_language_scores_to_users(score_df: pd.DataFrame, selected_job: str) -> pd.DataFrame:
    """
    Append one "<language>_score" column per required language of a job.

    The job's "Languages" list is read from its features JSON file, each
    name is lower-cased and trimmed, and then fuzzy-matched against every
    "*_level" column of the global Users_Lang_Skills (suffix removed,
    lower-cased). Matches scoring at least 85 are kept.

    Parameters:
        score_df (pd.DataFrame): Scoring frame; must contain a USER_ID column.
        selected_job (str): Job title used to derive the JSON file name.

    Returns:
        pd.DataFrame: A new frame with the added columns (missing users and
        NaN levels become 0). The input `score_df` itself is returned
        unchanged when there are no languages, no "*_level" columns, or no
        accepted match.

    Raises:
        FileNotFoundError: If the job's features JSON file does not exist.
    """
    file_path = f"{selected_job.replace(' ', '_')}_features.json"
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            features = json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"JSON file for '{selected_job}' not found.")

    required_langs = features.get("Languages", [])
    if not required_langs:
        return score_df

    wanted_langs = [entry.lower().strip() for entry in required_langs]

    # Lower-cased base name -> original "<name>_level" column.
    level_columns = {}
    for column in Users_Lang_Skills.columns:
        if column.endswith("_level"):
            level_columns[column.replace("_level", "").lower()] = column
    if not level_columns:
        return score_df

    # Keep only confident fuzzy matches (score of 85 or more).
    accepted = {}
    for language in wanted_langs:
        best = process.extractOne(language, level_columns.keys())
        if best and best[1] >= 85:
            accepted[language] = level_columns[best[0]]
    if not accepted:
        return score_df

    # Align on USER_ID so users absent from Users_Lang_Skills end up with 0.
    scored = score_df.copy().set_index("USER_ID")
    levels_by_user = Users_Lang_Skills.set_index("USER_ID")
    for language, level_col in accepted.items():
        scored[f"{language}_score"] = levels_by_user[level_col].reindex(scored.index).fillna(0)

    return scored.reset_index()



# Experience Score

In [139]:
import json

def add_extracted_experience_score_to_users(score_df: pd.DataFrame, selected_job: str) -> pd.DataFrame:
    """
    Adds an 'experience_score' column to score_df based on the experience required
    for the selected job, loaded from a JSON features file.

    The requirement is read from the "Experience Required" key. Numbers are used
    as-is; any other value is converted to text and its first number is used
    (e.g. "5+ years" -> 5). A missing key or a value with no number counts as 0.

    Scoring bands, with delta = user's ANNEES_XP - required years:
        - delta >= 0          → 100  (meets or exceeds the requirement)
        - -2 < delta < 0      → 50
        - -4 < delta <= -2    → 30
        - delta <= -4         → 10
        - missing experience  → 0    (NaN or non-numeric ANNEES_XP, or a user
                                      absent from Users_Lang_Skills)

    Experience above the requirement is not penalised: every non-negative delta scores 100.

    Parameters:
        score_df (pd.DataFrame): Scoring frame; must contain a USER_ID column.
        selected_job (str): Job title used to derive the JSON file name.

    Returns:
        pd.DataFrame: A copy of score_df with the 'experience_score' column added.

    Raises:
        FileNotFoundError: If the job's features JSON file does not exist.
    """
    import re  # function-scope import: `re` is not imported at module level in the visible notebook

    file_path = f"{selected_job.replace(' ', '_')}_features.json"
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            features = json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"No JSON file found for job '{selected_job}'")

    def parse_required_years(value):
        # Real numbers pass through; bool is excluded because it is an int subclass.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return float(value)
        # Textual requirements such as "3 years" or "5+ years": take the first number.
        found = re.search(r"\d+(?:\.\d+)?", str(value))
        return float(found.group(0)) if found else 0.0

    required_exp = parse_required_years(features.get("Experience Required", 0))

    def calculate_score(user_exp):
        delta = user_exp - required_exp
        if pd.isna(delta):
            # Missing user experience (or a NaN requirement) cannot be ranked.
            return 0
        if delta >= 0:
            return 100
        if delta > -2:
            return 50
        if delta > -4:
            return 30
        return 10

    # Coerce ANNEES_XP to numbers so stray text becomes NaN (score 0) instead of raising.
    experience_series = pd.to_numeric(
        Users_Lang_Skills.set_index("USER_ID")["ANNEES_XP"], errors="coerce"
    )

    df = score_df.copy().set_index("USER_ID")
    # reindex aligns on USER_ID; users without a row get NaN and therefore 0.
    df["experience_score"] = experience_series.reindex(df.index).apply(calculate_score)

    return df.reset_index()


# Preferred Skills

In [140]:
import json
from fuzzywuzzy import process

def add_extracted_preferred_skill_scores_to_users(score_df: pd.DataFrame, selected_job: str) -> pd.DataFrame:
    """
    Append one "Pref_<skill>_score" column per preferred skill of a job.

    The job's "Preferred Skills" list is read from its features JSON file and
    each name is normalised (lower-case, trimmed, "-" and "_" turned into
    spaces). Names are then fuzzy-matched against every "*_level" column of
    the global Users_Lang_Skills (suffix removed, lower-cased); matches
    scoring at least 85 are kept. `<skill>` in the new column name is the
    matched column's name without its "_level" suffix.

    Parameters:
        score_df (pd.DataFrame): Scoring frame; must contain a USER_ID column.
        selected_job (str): Job title used to derive the JSON file name.

    Returns:
        pd.DataFrame: A new frame with the added columns (missing users and
        NaN levels become 0). The input `score_df` itself is returned
        unchanged when there are no preferred skills, no "*_level" columns,
        or no accepted match.

    Raises:
        FileNotFoundError: If the job's features JSON file does not exist.
    """
    file_path = f"{selected_job.replace(' ', '_')}_features.json"
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            features = json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"No JSON file found for job '{selected_job}'")

    preferred_skills = features.get("Preferred Skills", [])
    if not preferred_skills:
        return score_df

    wanted_skills = [
        raw_skill.lower().strip().replace("-", " ").replace("_", " ")
        for raw_skill in preferred_skills
    ]

    # Lower-cased base name -> original "<name>_level" column.
    level_columns = {}
    for column in Users_Lang_Skills.columns:
        if column.endswith("_level"):
            level_columns[column.replace("_level", "").lower()] = column
    if not level_columns:
        return score_df

    # Keep only confident fuzzy matches (score of 85 or more).
    accepted = {}
    candidate_names = list(level_columns.keys())
    for skill in wanted_skills:
        best = process.extractOne(skill, candidate_names)
        if best and best[1] >= 85:
            accepted[skill] = level_columns[best[0]]
    if not accepted:
        return score_df

    # Align on USER_ID so users absent from Users_Lang_Skills end up with 0.
    scored = score_df.copy().set_index("USER_ID")
    levels_by_user = Users_Lang_Skills.set_index("USER_ID")
    for level_col in accepted.values():
        base_name = level_col.replace("_level", "")
        scored[f"Pref_{base_name}_score"] = levels_by_user[level_col].reindex(scored.index).fillna(0)

    return scored.reset_index()


# Availability Score

In [141]:
def add_availability_score_to_users(score_df: pd.DataFrame, selected_job: str) -> pd.DataFrame:
    """
    Add an 'availability_score' column based on the job's duration and each
    user's MONTH_1..MONTH_12 values in the global `staffing_inverted` frame.

    The duration (in months) is parsed from the "Duration" entry of the JSON's
    "Additional Notes" object using the pattern "<number> month", matched
    case-insensitively. If "Additional Notes" is not an object or no duration
    is found, the duration is 0.

    Per-user score, using the first staffing row found for the USER_ID:
        - 100 if some run of `duration` consecutive months is all 100;
        - otherwise the highest average over any run of `duration` consecutive
          months that contains no 0;
        - 0 if no such run exists (including a duration longer than 12 months);
        - 100 when the duration is 0 (no constraint to check);
        - 0 for users without any staffing row.

    Parameters:
        score_df (pd.DataFrame): Scoring frame; must contain a USER_ID column.
        selected_job (str): Job title used to derive the JSON file name.

    Returns:
        pd.DataFrame: A copy of score_df with the 'availability_score' column added.

    Raises:
        FileNotFoundError: If the job's features JSON file does not exist.
    """
    import json
    import re

    # === Step 1: Load job features JSON ===
    file_path = f"{selected_job.replace(' ', '_')}_features.json"
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            features = json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"No JSON file found for job '{selected_job}'")

    # === Step 2: Extract duration from nested 'Additional Notes' ===
    notes = features.get("Additional Notes")
    # Anything other than an object (null, string, list) is treated as "no duration given".
    duration_text = str(notes.get("Duration", "")) if isinstance(notes, dict) else ""
    # Case-insensitive so "6 Months" / "6 MONTHS" are parsed as well as "6 months".
    found = re.search(r"(\d+)\s*month", duration_text, flags=re.IGNORECASE)
    duration = int(found.group(1)) if found else 0

    # === Step 3: Define scoring logic ===
    def best_block_score(values):
        if duration == 0:
            # No duration to satisfy: every staffed user counts as fully available.
            return 100
        best = 0
        for start in range(len(values) - duration + 1):
            window = values[start:start + duration]
            if all(v == 100 for v in window):
                return 100
            if 0 not in window:
                best = max(best, sum(window) / duration)
        return best

    # === Step 4: Score each user ===
    month_cols = [f"MONTH_{i}" for i in range(1, 13)]

    # One pass over the staffing table; setdefault keeps the FIRST row per USER_ID.
    months_by_user = {}
    month_rows = staffing_inverted[month_cols].values.tolist()
    for staff_user_id, month_values in zip(staffing_inverted["USER_ID"], month_rows):
        months_by_user.setdefault(staff_user_id, month_values)

    scores = []
    for user_id in score_df["USER_ID"]:
        month_values = months_by_user.get(user_id)
        scores.append(0 if month_values is None else best_block_score(month_values))

    # === Step 5: Merge scores into final DataFrame ===
    result = score_df.copy()
    result["availability_score"] = scores

    return result



# Function trial use

In [142]:
# Candidate skill names: skills_pivoted column labels with any "_level" text removed.
known_skills = [column.replace("_level", "") for column in skills_pivoted.columns]

# Fills the global `mapping` that add_extracted_skill_scores_to_users() reads.
prepare_required_skill_mapping("Data Engineer", known_skills)

# Required skills -> languages -> experience -> preferred skills, each step
# returning the frame with its score columns added.
score_df = (
    add_extracted_skill_scores_to_users()
    .pipe(add_extracted_language_scores_to_users, selected_job="Data Engineer")
    .pipe(add_extracted_experience_score_to_users, selected_job="Data Engineer")
    .pipe(add_extracted_preferred_skill_scores_to_users, "Data Engineer")
)

# Availability is added last and kept under a job-specific name.
score_df_Data_Engineer = add_availability_score_to_users(score_df, "Data Engineer")

In [131]:
# Write the Data Engineer scores to CSV without the DataFrame index.
score_df_Data_Engineer.to_csv(path_or_buf="Data_Engineer_scored.csv", index=False)

In [132]:
# Candidate skill names: skills_pivoted column labels with any "_level" text removed.
known_skills = [column.replace("_level", "") for column in skills_pivoted.columns]

# Fills the global `mapping` that add_extracted_skill_scores_to_users() reads.
prepare_required_skill_mapping("Data Analyst", known_skills)

# Required skills -> languages -> experience -> preferred skills, each step
# returning the frame with its score columns added.
score_df = (
    add_extracted_skill_scores_to_users()
    .pipe(add_extracted_language_scores_to_users, selected_job="Data Analyst")
    .pipe(add_extracted_experience_score_to_users, selected_job="Data Analyst")
    .pipe(add_extracted_preferred_skill_scores_to_users, "Data Analyst")
)

# Availability is added last and kept under a job-specific name.
score_df_Data_Analyst = add_availability_score_to_users(score_df, "Data Analyst")

In [133]:
# Write the Data Analyst scores to CSV without the DataFrame index.
score_df_Data_Analyst.to_csv(path_or_buf="Data_Analyst_scored.csv", index=False)


In [134]:
# Candidate skill names: skills_pivoted column labels with any "_level" text removed.
known_skills = [column.replace("_level", "") for column in skills_pivoted.columns]

# Fills the global `mapping` that add_extracted_skill_scores_to_users() reads.
prepare_required_skill_mapping("Scrum Master", known_skills)

# Required skills -> languages -> experience -> preferred skills, each step
# returning the frame with its score columns added.
score_df = (
    add_extracted_skill_scores_to_users()
    .pipe(add_extracted_language_scores_to_users, selected_job="Scrum Master")
    .pipe(add_extracted_experience_score_to_users, selected_job="Scrum Master")
    .pipe(add_extracted_preferred_skill_scores_to_users, "Scrum Master")
)

# Availability is added last and kept under a job-specific name.
score_df_Scrum_Master = add_availability_score_to_users(score_df, "Scrum Master")

In [135]:
# Write the Scrum Master scores to CSV without the DataFrame index.
score_df_Scrum_Master.to_csv(path_or_buf="Scrum_Master_scored.csv", index=False)