In [14]:
import pandas as pd
import numpy as np
from transformers import pipeline, logging
import torch
import os

# Suppress transformer logs and disable tokenizers parallelism
logging.set_verbosity_error()
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load datasets
associates = pd.read_csv("associates_with_grouped_skills.csv")
projects = pd.read_csv("projects_with_grouped_skills.csv")
learning_content = pd.read_csv("learning_content.csv")
preferences = pd.read_csv("learning_preferences.csv")

# Compare project IDs as strings in both DataFrames so matching is consistent.
# An associate's project_id is split on "," further down, so it may hold
# several IDs in one string.
associates["project_id"] = associates["project_id"].astype(str)
projects["project_id"] = projects["project_id"].astype(str)

# Print columns for debugging
print("Columns in associates_with_grouped_skills.csv:", associates.columns.tolist())
print("Columns in projects_with_grouped_skills.csv:", projects.columns.tolist())
print("Columns in learning_preferences.csv:", preferences.columns.tolist())

# Debug project_id matching. De-duplicate AFTER splitting: .unique() on the
# combined strings still repeats an ID that occurs in several combinations.
print("Unique project_ids in associates (as lists):",
      sorted({pid.strip() for combined in associates["project_id"].unique() for pid in combined.split(",")}))
print("Unique project_ids in projects:", sorted(projects["project_id"].unique()))

# Skill level labels mapped to comparable integers; the inverse mapping is
# derived so the two dictionaries cannot drift apart.
skill_levels = {"L1": 1, "L2": 2, "L3": 3, "L4": 4}
inverse_skill_levels = {rank: label for label, rank in skill_levels.items()}

# Every associates column that is not listed as metadata is treated as a skill
metadata_columns_assoc = ['associate_id', 'name', 'role', 'years_of_experience', 'grade', 'project_id',
                          'certifications_obtained', 'certifications_in_progress', 'available_learning_time',
                          'learning_time_utilized_previous_year', 'skills_recommended_by_manager',
                          'skills_recommended_by_department', 'skills_selected_for_own', 'L1', 'L2', 'L3', 'L4']
skill_columns_assoc = [col for col in associates.columns if col not in metadata_columns_assoc]

# Every projects column that is not listed as metadata is treated as a skill
metadata_columns_proj = ['project_id', 'project_name', 'Project_status']
skill_columns_proj = [col for col in projects.columns if col not in metadata_columns_proj]

# NLP for course recommendation
nlp = pipeline("feature-extraction", model="distilbert-base-uncased", framework="pt")
print("NLP pipeline loaded")

def get_embedding(text):
    """Embed ``text`` as a single vector.

    Runs the module-level ``nlp`` feature-extraction pipeline on ``text``,
    takes the first element of its output and averages it along axis 0
    (presumably one row per token -- confirm against the pipeline output).
    """
    # Inference only, so gradient tracking is switched off for the call.
    with torch.no_grad():
        features = nlp(text)
    token_vectors = features[0]
    return np.mean(token_vectors, axis=0)

# Embed every course once, from its 'Title', so later lookups can reuse the vectors
title_embeddings = learning_content["Title"].map(get_embedding)
learning_content["embedding"] = title_embeddings
print("Course embeddings computed")

def recommend_course_nlp(skill, desired_level, delivery_mode=None):
    """Return the title of the best-matching course for a skill and level.

    Courses in ``learning_content`` are first restricted to ``desired_level``
    (and to ``delivery_mode`` when one is given). The remaining courses are
    ranked by a case-insensitive substring match of the skill name in the
    title, with cosine similarity between the title embedding and the skill
    embedding as the tie-breaker.

    Args:
        skill: Skill name to find a course for.
        desired_level: Value compared for equality with the ``Level`` column.
        delivery_mode: Optional value compared for equality with the
            ``Format`` column. ``None``, an empty value or a missing value
            (NaN) disables the format filter.

    Returns:
        The ``Title`` of the top-ranked course, or the string
        ``"No suitable course found"`` when no course passes the filters.
    """
    # A missing preference read from CSV arrives as NaN, which is truthy and
    # never equals anything; treat it as "no preference" instead of filtering
    # every course out.
    has_delivery_mode = bool(delivery_mode) and pd.notna(delivery_mode)

    selected = learning_content["Level"] == desired_level
    if has_delivery_mode:
        selected = selected & (learning_content["Format"] == delivery_mode)

    # Filter first and copy, so similarity is only computed for real
    # candidates and new columns are not written onto a slice.
    candidates = learning_content[selected].copy()
    if candidates.empty:
        return "No suitable course found"

    skill_embedding = get_embedding(skill)
    skill_norm = np.linalg.norm(skill_embedding)
    candidates["similarity"] = candidates["embedding"].apply(
        lambda emb: np.dot(emb, skill_embedding) / (np.linalg.norm(emb) * skill_norm)
    )

    # Add a basic keyword match to improve relevance
    skill_lower = skill.lower()
    candidates["keyword_match"] = candidates["Title"].apply(
        lambda title: 1 if skill_lower in title.lower() else 0
    )
    ranked = candidates.sort_values(by=["keyword_match", "similarity"], ascending=[False, False])
    return ranked.iloc[0]["Title"]

# Identify skill gaps and create learning paths (only project-based)
def create_learning_path(row):
    """Build the project-based course recommendations for one associate.

    For every skill column of the projects the associate is assigned to, the
    highest valid level (a key of ``skill_levels``) required across those
    projects is compared with the associate's own level. A course is
    recommended for each skill where the associate is below the requirement.
    A missing or unrecognised associate level counts as level 0.

    Args:
        row: One row of ``associates``. It must provide ``associate_id``,
            ``project_id`` (a string of comma-separated IDs) and the columns
            listed in ``skill_columns_assoc``.

    Returns:
        A ``pd.Series`` with one entry, ``project_recommendations``: a list
        of ``{"skill": ..., "course": ...}`` dicts. The list is empty when
        none of the associate's project IDs is found in ``projects`` or when
        there is no gap.
    """
    associate_id = row["associate_id"]
    project_recommendations = []

    # Get associate's skills
    assoc_skills = {skill: row[skill] for skill in skill_columns_assoc}

    # Get preferred format from learning_preferences.csv
    assoc_prefs = preferences[preferences["associate_id"] == associate_id]
    delivery_mode = assoc_prefs["preferred_format"].iloc[0] if not assoc_prefs.empty else None
    # An empty CSV cell is read as NaN; pass None so the course lookup does
    # not try to filter on a format that can never match.
    if pd.isna(delivery_mode):
        delivery_mode = None

    # Split the project_id string into a list of individual project IDs
    assoc_project_ids = [pid.strip() for pid in row["project_id"].split(",")]

    # Find all projects that match the individual project IDs
    assoc_projects = projects[projects["project_id"].isin(assoc_project_ids)]

    # Debugging: Check if projects are found
    if assoc_projects.empty:
        print(f"No projects found for associate {associate_id} with project_ids {assoc_project_ids}")
        return pd.Series({"project_recommendations": project_recommendations})

    # For each skill, determine the highest required level across all projects.
    # NaN and any other value that is not a known level label are ignored.
    max_required_levels = {}
    for skill in skill_columns_proj:
        valid_levels = [skill_levels[level] for level in assoc_projects[skill] if level in skill_levels]
        if valid_levels:
            highest = max(valid_levels)
            max_required_levels[skill] = {
                "level": inverse_skill_levels[highest],
                "level_int": highest,
            }

    # Compare associate's skills against the highest required levels
    for req_skill, req_info in max_required_levels.items():
        req_level = req_info["level"]
        req_level_int = req_info["level_int"]
        current_level = assoc_skills.get(req_skill, None)
        # Missing (None/NaN) and unrecognised levels all fall back to level 0
        current_level_int = skill_levels.get(current_level, 0)

        if current_level_int < req_level_int:
            course = recommend_course_nlp(req_skill, req_level, delivery_mode)
            project_recommendations.append({"skill": req_skill, "course": course})

    return pd.Series({"project_recommendations": project_recommendations})

# Build a learning path per associate row and attach the resulting column
recommendations = associates.apply(create_learning_path, axis=1)
associates["project_recommendations"] = recommendations["project_recommendations"]
print("Project-based learning paths assigned")

# Export only the identifying columns together with the recommendations
output_columns = ["associate_id", "role", "project_recommendations"]
filtered_associates = associates[output_columns]
filtered_associates.to_csv(path_or_buf="project_learning_pathways.csv", index=False)

print("✅ Project-based learning pathways saved successfully!")

Columns in associates_with_grouped_skills.csv: ['associate_id', 'name', 'role', 'years_of_experience', 'grade', 'project_id', 'certifications_obtained', 'certifications_in_progress', 'available_learning_time', 'learning_time_utilized_previous_year', 'System Design', 'Cloud Architecture', 'Microservices Architecture', 'Performance Optimization', 'Scalability Design', 'CI/CD Pipeline Design', 'Event-Driven Architecture', 'CQRS Pattern', 'Hexagonal Architecture', 'Domain-Driven Design', 'Service Mesh', 'API Gateway Patterns', 'EDA Patterns', 'Saga Pattern', 'Circuit Breaker', 'BFF Pattern', 'Strangler Pattern', 'Anti-Corruption Layer', 'Modular Monolith', 'Quantum Computing Architecture', 'AI System Design', 'Blockchain Architecture', 'Edge Computing Architecture', 'Data Mesh Design', 'Digital Twin Architecture', 'Compliance by Design', 'Sustainability Architecture', 'Chaos Engineering Design', 'Cognitive Architecture', 'Git', 'Agile Methodologies', 'Problem Solving', 'Communication', 'Te

In [19]:
import pandas as pd
import numpy as np
from transformers import pipeline, logging
import torch
import os

# Suppress transformer logs and disable tokenizers parallelism
logging.set_verbosity_error()
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load datasets
associates = pd.read_csv("associates_with_grouped_skills.csv")
projects = pd.read_csv("projects_with_grouped_skills.csv")
learning_content = pd.read_csv("learning_content.csv")
preferences = pd.read_csv("learning_preferences.csv")

# Compare project IDs as strings in both DataFrames so matching is consistent.
# An associate's project_id is split on "," further down, so it may hold
# several IDs in one string.
associates["project_id"] = associates["project_id"].astype(str)
projects["project_id"] = projects["project_id"].astype(str)

# Print columns for debugging
print("Columns in associates_with_grouped_skills.csv:", associates.columns.tolist())
print("Columns in projects_with_grouped_skills.csv:", projects.columns.tolist())
print("Columns in learning_preferences.csv:", preferences.columns.tolist())

# Debug project_id matching. De-duplicate AFTER splitting: .unique() on the
# combined strings still repeats an ID that occurs in several combinations.
print("Unique project_ids in associates (as lists):",
      sorted({pid.strip() for combined in associates["project_id"].unique() for pid in combined.split(",")}))
print("Unique project_ids in projects:", sorted(projects["project_id"].unique()))

# Skill level labels mapped to comparable integers; the inverse mapping is
# derived so the two dictionaries cannot drift apart.
skill_levels = {"L1": 1, "L2": 2, "L3": 3, "L4": 4}
inverse_skill_levels = {rank: label for label, rank in skill_levels.items()}

# Every associates column that is not listed as metadata is treated as a skill
metadata_columns_assoc = ['associate_id', 'name', 'role', 'years_of_experience', 'grade', 'project_id',
                          'certifications_obtained', 'certifications_in_progress', 'available_learning_time',
                          'learning_time_utilized_previous_year', 'skills_recommended_by_manager',
                          'skills_recommended_by_department', 'skills_selected_for_own', 'L1', 'L2', 'L3', 'L4']
skill_columns_assoc = [col for col in associates.columns if col not in metadata_columns_assoc]

# Every projects column that is not listed as metadata is treated as a skill
metadata_columns_proj = ['project_id', 'project_name', 'Project_status']
skill_columns_proj = [col for col in projects.columns if col not in metadata_columns_proj]

# NLP for course recommendation
nlp = pipeline("feature-extraction", model="distilbert-base-uncased", framework="pt")
print("NLP pipeline loaded")

def get_embedding(text):
    """Embed ``text`` as a single vector.

    Runs the module-level ``nlp`` feature-extraction pipeline on ``text``,
    takes the first element of its output and averages it along axis 0
    (presumably one row per token -- confirm against the pipeline output).
    """
    # Inference only, so gradient tracking is switched off for the call.
    with torch.no_grad():
        features = nlp(text)
    token_vectors = features[0]
    return np.mean(token_vectors, axis=0)

# Embed every course once, from its 'Title', so later lookups can reuse the vectors
title_embeddings = learning_content["Title"].map(get_embedding)
learning_content["embedding"] = title_embeddings
print("Course embeddings computed")

def recommend_course_nlp(skill, desired_level, delivery_mode=None):
    """Return the title of the best-matching course for a skill and level.

    Courses in ``learning_content`` are first restricted to ``desired_level``
    (and to ``delivery_mode`` when one is given). The remaining courses are
    ranked by a case-insensitive substring match of the skill name in the
    title, with cosine similarity between the title embedding and the skill
    embedding as the tie-breaker.

    Args:
        skill: Skill name to find a course for.
        desired_level: Value compared for equality with the ``Level`` column.
        delivery_mode: Optional value compared for equality with the
            ``Format`` column. ``None``, an empty value or a missing value
            (NaN) disables the format filter.

    Returns:
        The ``Title`` of the top-ranked course, or the string
        ``"No suitable course found"`` when no course passes the filters.
    """
    # A missing preference read from CSV arrives as NaN, which is truthy and
    # never equals anything; treat it as "no preference" instead of filtering
    # every course out.
    has_delivery_mode = bool(delivery_mode) and pd.notna(delivery_mode)

    selected = learning_content["Level"] == desired_level
    if has_delivery_mode:
        selected = selected & (learning_content["Format"] == delivery_mode)

    # Filter first and copy, so similarity is only computed for real
    # candidates and new columns are not written onto a slice.
    candidates = learning_content[selected].copy()
    if candidates.empty:
        return "No suitable course found"

    skill_embedding = get_embedding(skill)
    skill_norm = np.linalg.norm(skill_embedding)
    candidates["similarity"] = candidates["embedding"].apply(
        lambda emb: np.dot(emb, skill_embedding) / (np.linalg.norm(emb) * skill_norm)
    )

    # Add a basic keyword match to improve relevance
    skill_lower = skill.lower()
    candidates["keyword_match"] = candidates["Title"].apply(
        lambda title: 1 if skill_lower in title.lower() else 0
    )
    ranked = candidates.sort_values(by=["keyword_match", "similarity"], ascending=[False, False])
    return ranked.iloc[0]["Title"]

# Identify skill gaps and create learning paths (only project-based)
def create_learning_path(row):
    """Build the project-based course recommendations for one associate.

    For every skill column of the projects the associate is assigned to, the
    highest valid level (a key of ``skill_levels``) required across those
    projects is compared with the associate's own level. Skills for which the
    associate has no recorded level (missing/NaN) are skipped. A recorded but
    unrecognised level counts as level 0. Each comparison is printed for
    debugging.

    Args:
        row: One row of ``associates``. It must provide ``associate_id``,
            ``project_id`` (a string of comma-separated IDs) and the columns
            listed in ``skill_columns_assoc``.

    Returns:
        A ``pd.Series`` with one entry, ``project_recommendations``: a list
        of ``{"skill": ..., "course": ...}`` dicts. The list is empty when
        none of the associate's project IDs is found in ``projects`` or when
        there is no gap.
    """
    associate_id = row["associate_id"]
    project_recommendations = []

    # Get associate's skills
    assoc_skills = {skill: row[skill] for skill in skill_columns_assoc}

    # Get preferred format from learning_preferences.csv
    assoc_prefs = preferences[preferences["associate_id"] == associate_id]
    delivery_mode = assoc_prefs["preferred_format"].iloc[0] if not assoc_prefs.empty else None
    # An empty CSV cell is read as NaN; pass None so the course lookup does
    # not try to filter on a format that can never match.
    if pd.isna(delivery_mode):
        delivery_mode = None

    # Split the project_id string into a list of individual project IDs
    assoc_project_ids = [pid.strip() for pid in row["project_id"].split(",")]

    # Find all projects that match the individual project IDs
    assoc_projects = projects[projects["project_id"].isin(assoc_project_ids)]

    # Debugging: Check if projects are found
    if assoc_projects.empty:
        print(f"No projects found for associate {associate_id} with project_ids {assoc_project_ids}")
        return pd.Series({"project_recommendations": project_recommendations})

    # For each skill, determine the highest required level across all projects.
    # NaN and any other value that is not a known level label are ignored.
    max_required_levels = {}
    for skill in skill_columns_proj:
        valid_levels = [skill_levels[level] for level in assoc_projects[skill] if level in skill_levels]
        if valid_levels:
            highest = max(valid_levels)
            max_required_levels[skill] = {
                "level": inverse_skill_levels[highest],
                "level_int": highest,
            }

    # Compare associate's skills against the highest required levels
    for req_skill, req_info in max_required_levels.items():
        req_level = req_info["level"]
        req_level_int = req_info["level_int"]
        current_level = assoc_skills.get(req_skill, None)

        # Skip recommendation if the associate's skill level is NaN
        if pd.isna(current_level):
            continue

        # A recorded but unrecognised level falls back to level 0
        current_level_int = skill_levels.get(current_level, 0)

        # Debug: Print skill comparison
        print(f"Associate {associate_id}, Skill: {req_skill}, Current: {current_level} ({current_level_int}), Required: {req_level} ({req_level_int})")

        if current_level_int < req_level_int:
            course = recommend_course_nlp(req_skill, req_level, delivery_mode)
            project_recommendations.append({"skill": req_skill, "course": course})

    return pd.Series({"project_recommendations": project_recommendations})

# Build a learning path per associate row and attach the resulting column
recommendations = associates.apply(create_learning_path, axis=1)
associates["project_recommendations"] = recommendations["project_recommendations"]
print("Project-based learning paths assigned")

# Export only the identifying columns together with the recommendations
output_columns = ["associate_id", "role", "project_recommendations"]
filtered_associates = associates[output_columns]
filtered_associates.to_csv(path_or_buf="project_learning_pathways_new.csv", index=False)

print("✅ Project-based learning pathways saved successfully!")

Columns in associates_with_grouped_skills.csv: ['associate_id', 'name', 'role', 'years_of_experience', 'grade', 'project_id', 'certifications_obtained', 'certifications_in_progress', 'available_learning_time', 'learning_time_utilized_previous_year', 'System Design', 'Cloud Architecture', 'Microservices Architecture', 'Performance Optimization', 'Scalability Design', 'CI/CD Pipeline Design', 'Event-Driven Architecture', 'CQRS Pattern', 'Hexagonal Architecture', 'Domain-Driven Design', 'Service Mesh', 'API Gateway Patterns', 'EDA Patterns', 'Saga Pattern', 'Circuit Breaker', 'BFF Pattern', 'Strangler Pattern', 'Anti-Corruption Layer', 'Modular Monolith', 'Quantum Computing Architecture', 'AI System Design', 'Blockchain Architecture', 'Edge Computing Architecture', 'Data Mesh Design', 'Digital Twin Architecture', 'Compliance by Design', 'Sustainability Architecture', 'Chaos Engineering Design', 'Cognitive Architecture', 'Git', 'Agile Methodologies', 'Problem Solving', 'Communication', 'Te

In [23]:
import pandas as pd
import numpy as np
from transformers import pipeline, logging
import torch
import os

# Suppress transformer logs and disable tokenizers parallelism
logging.set_verbosity_error()
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load datasets
associates = pd.read_csv("associates_with_grouped_skills.csv")
projects = pd.read_csv("projects_with_grouped_skills.csv")
learning_content = pd.read_csv("learning_content.csv")
preferences = pd.read_csv("learning_preferences.csv")

# Compare project IDs as strings in both DataFrames so matching is consistent.
# An associate's project_id is split on "," further down, so it may hold
# several IDs in one string.
associates["project_id"] = associates["project_id"].astype(str)
projects["project_id"] = projects["project_id"].astype(str)

# Print columns for debugging
print("Columns in associates_with_grouped_skills.csv:", associates.columns.tolist())
print("Columns in projects_with_grouped_skills.csv:", projects.columns.tolist())
print("Columns in learning_preferences.csv:", preferences.columns.tolist())

# Debug project_id matching. De-duplicate AFTER splitting: .unique() on the
# combined strings still repeats an ID that occurs in several combinations.
print("Unique project_ids in associates (as lists):",
      sorted({pid.strip() for combined in associates["project_id"].unique() for pid in combined.split(",")}))
print("Unique project_ids in projects:", sorted(projects["project_id"].unique()))

# Skill level labels mapped to comparable integers; the inverse mapping is
# derived so the two dictionaries cannot drift apart.
skill_levels = {"L1": 1, "L2": 2, "L3": 3, "L4": 4}
inverse_skill_levels = {rank: label for label, rank in skill_levels.items()}

# Every associates column that is not listed as metadata is treated as a skill
metadata_columns_assoc = ['associate_id', 'name', 'role', 'years_of_experience', 'grade', 'project_id',
                          'certifications_obtained', 'certifications_in_progress', 'available_learning_time',
                          'learning_time_utilized_previous_year', 'skills_recommended_by_manager',
                          'skills_recommended_by_department', 'skills_selected_for_own', 'L1', 'L2', 'L3', 'L4']
skill_columns_assoc = [col for col in associates.columns if col not in metadata_columns_assoc]

# Every projects column that is not listed as metadata is treated as a skill
metadata_columns_proj = ['project_id', 'project_name', 'Project_status']
skill_columns_proj = [col for col in projects.columns if col not in metadata_columns_proj]

# NLP for course recommendation
nlp = pipeline("feature-extraction", model="distilbert-base-uncased", framework="pt")
print("NLP pipeline loaded")

def get_embedding(text):
    """Embed ``text`` as a single vector.

    Runs the module-level ``nlp`` feature-extraction pipeline on ``text``,
    takes the first element of its output and averages it along axis 0
    (presumably one row per token -- confirm against the pipeline output).
    """
    # Inference only, so gradient tracking is switched off for the call.
    with torch.no_grad():
        features = nlp(text)
    token_vectors = features[0]
    return np.mean(token_vectors, axis=0)

# Embed every course once, from its 'Title', so later lookups can reuse the vectors
title_embeddings = learning_content["Title"].map(get_embedding)
learning_content["embedding"] = title_embeddings
print("Course embeddings computed")

def recommend_course_nlp(skill, desired_level, delivery_mode=None):
    """Return the title of the best-matching course for a skill and level.

    Courses in ``learning_content`` are first restricted to ``desired_level``
    (and to ``delivery_mode`` when one is given). The remaining courses are
    ranked by a case-insensitive substring match of the skill name in the
    title, with cosine similarity between the title embedding and the skill
    embedding as the tie-breaker.

    Args:
        skill: Skill name to find a course for.
        desired_level: Value compared for equality with the ``Level`` column.
        delivery_mode: Optional value compared for equality with the
            ``Format`` column. ``None``, an empty value or a missing value
            (NaN) disables the format filter.

    Returns:
        The ``Title`` of the top-ranked course, or the string
        ``"No suitable course found"`` when no course passes the filters.
    """
    # A missing preference read from CSV arrives as NaN, which is truthy and
    # never equals anything; treat it as "no preference" instead of filtering
    # every course out.
    has_delivery_mode = bool(delivery_mode) and pd.notna(delivery_mode)

    selected = learning_content["Level"] == desired_level
    if has_delivery_mode:
        selected = selected & (learning_content["Format"] == delivery_mode)

    # Filter first and copy, so similarity is only computed for real
    # candidates and new columns are not written onto a slice.
    candidates = learning_content[selected].copy()
    if candidates.empty:
        return "No suitable course found"

    skill_embedding = get_embedding(skill)
    skill_norm = np.linalg.norm(skill_embedding)
    candidates["similarity"] = candidates["embedding"].apply(
        lambda emb: np.dot(emb, skill_embedding) / (np.linalg.norm(emb) * skill_norm)
    )

    # Add a basic keyword match to improve relevance
    skill_lower = skill.lower()
    candidates["keyword_match"] = candidates["Title"].apply(
        lambda title: 1 if skill_lower in title.lower() else 0
    )
    ranked = candidates.sort_values(by=["keyword_match", "similarity"], ascending=[False, False])
    return ranked.iloc[0]["Title"]

# Identify skill gaps and create learning paths (only project-based)
def create_learning_path(row):
    """Build project-based course recommendations and skill gaps for one associate.

    For every skill column of the projects the associate is assigned to, the
    highest valid level (a key of ``skill_levels``) required across those
    projects is compared with the associate's own level. Skills for which the
    associate has no recorded level (missing/NaN) are skipped. A recorded but
    unrecognised level counts as level 0.

    Args:
        row: One row of ``associates``. It must provide ``associate_id``,
            ``project_id`` (a string of comma-separated IDs) and the columns
            listed in ``skill_columns_assoc``.

    Returns:
        A ``pd.Series`` with two entries. ``project_recommendations`` is a
        list of ``{"skill": ..., "course": ...}`` dicts. ``skill_gaps`` is a
        parallel list of ``{"skill": ..., "gap": "<current> -> <required>"}``
        dicts. Both lists are empty when none of the associate's project IDs
        is found in ``projects`` or when there is no gap.
    """
    associate_id = row["associate_id"]
    project_recommendations = []
    skill_gaps = []

    # Get associate's skills
    assoc_skills = {skill: row[skill] for skill in skill_columns_assoc}

    # Get preferred format from learning_preferences.csv
    assoc_prefs = preferences[preferences["associate_id"] == associate_id]
    delivery_mode = assoc_prefs["preferred_format"].iloc[0] if not assoc_prefs.empty else None
    # An empty CSV cell is read as NaN; pass None so the course lookup does
    # not try to filter on a format that can never match.
    if pd.isna(delivery_mode):
        delivery_mode = None

    # Split the project_id string into a list of individual project IDs
    assoc_project_ids = [pid.strip() for pid in row["project_id"].split(",")]

    # Find all projects that match the individual project IDs
    assoc_projects = projects[projects["project_id"].isin(assoc_project_ids)]

    # Debugging: Check if projects are found
    if assoc_projects.empty:
        print(f"No projects found for associate {associate_id} with project_ids {assoc_project_ids}")
        return pd.Series({"project_recommendations": project_recommendations, "skill_gaps": skill_gaps})

    # For each skill, determine the highest required level across all projects.
    # NaN and any other value that is not a known level label are ignored.
    max_required_levels = {}
    for skill in skill_columns_proj:
        valid_levels = [skill_levels[level] for level in assoc_projects[skill] if level in skill_levels]
        if valid_levels:
            highest = max(valid_levels)
            max_required_levels[skill] = {
                "level": inverse_skill_levels[highest],
                "level_int": highest,
            }

    # Compare associate's skills against the highest required levels
    for req_skill, req_info in max_required_levels.items():
        req_level = req_info["level"]
        req_level_int = req_info["level_int"]
        current_level = assoc_skills.get(req_skill, None)

        # Skip recommendation if the associate's skill level is NaN
        if pd.isna(current_level):
            continue

        # A recorded but unrecognised level falls back to level 0
        current_level_int = skill_levels.get(current_level, 0)

        if current_level_int < req_level_int:
            course = recommend_course_nlp(req_skill, req_level, delivery_mode)
            project_recommendations.append({"skill": req_skill, "course": course})
            skill_gaps.append({"skill": req_skill, "gap": f"{current_level} -> {req_level}"})

    return pd.Series({"project_recommendations": project_recommendations, "skill_gaps": skill_gaps})

# Build a learning path per associate row and attach both resulting columns
recommendations = associates.apply(create_learning_path, axis=1)
associates["project_recommendations"] = recommendations["project_recommendations"]
associates["skill_gaps"] = recommendations["skill_gaps"]
print("Project-based learning paths and skill gaps assigned")

# Export only the identifying columns together with the recommendations and gaps
output_columns = ["associate_id", "role", "project_recommendations", "skill_gaps"]
filtered_associates = associates[output_columns]
filtered_associates.to_csv(path_or_buf="project_learning_pathways_NewWithSkillGap.csv", index=False)

print("✅ Project-based learning pathways with skill gaps saved successfully!")

Columns in associates_with_grouped_skills.csv: ['associate_id', 'name', 'role', 'years_of_experience', 'grade', 'project_id', 'certifications_obtained', 'certifications_in_progress', 'available_learning_time', 'learning_time_utilized_previous_year', 'System Design', 'Cloud Architecture', 'Microservices Architecture', 'Performance Optimization', 'Scalability Design', 'CI/CD Pipeline Design', 'Event-Driven Architecture', 'CQRS Pattern', 'Hexagonal Architecture', 'Domain-Driven Design', 'Service Mesh', 'API Gateway Patterns', 'EDA Patterns', 'Saga Pattern', 'Circuit Breaker', 'BFF Pattern', 'Strangler Pattern', 'Anti-Corruption Layer', 'Modular Monolith', 'Quantum Computing Architecture', 'AI System Design', 'Blockchain Architecture', 'Edge Computing Architecture', 'Data Mesh Design', 'Digital Twin Architecture', 'Compliance by Design', 'Sustainability Architecture', 'Chaos Engineering Design', 'Cognitive Architecture', 'Git', 'Agile Methodologies', 'Problem Solving', 'Communication', 'Te