In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from transformers import pipeline, logging
from sklearn.cluster import KMeans
import torch
import os

# Keep the run quiet: single-threaded tokenizers, transformers logging at error level only
os.environ["TOKENIZERS_PARALLELISM"] = "false"
logging.set_verbosity_error()

# Input tables (paths are relative to the notebook's working directory)
associates = pd.read_csv("associates_with_grouped_skills.csv")
projects = pd.read_csv("projects_with_grouped_skills.csv")
learning_content = pd.read_csv("learning_content.csv")
preferences = pd.read_csv("learning_preferences.csv")

# Schema dump, kept for debugging
print("Columns in associates_with_grouped_skills.csv:", associates.columns.tolist())
print("Columns in projects_with_grouped_skills.csv:", projects.columns.tolist())
print("Columns in learning_preferences.csv:", preferences.columns.tolist())

# Skill-level labels and their integer ranks; the inverse map is derived from the forward one
skill_levels = {"L1": 1, "L2": 2, "L3": 3, "L4": 4}
inverse_skill_levels = {rank: label for label, rank in skill_levels.items()}

# Associates table: every column that is not listed as metadata is treated as a skill column
metadata_columns_assoc = [
    'associate_id', 'name', 'role', 'years_of_experience', 'grade', 'project_id',
    'certifications_obtained', 'certifications_in_progress', 'available_learning_time',
    'learning_time_utilized_previous_year', 'skills_recommended_by_manager',
    'skills_recommended_by_department', 'skills_selected_for_own',
]
skill_columns_assoc = [
    name for name in associates.columns if name not in metadata_columns_assoc
]

# Projects table: same rule, with its own metadata list
metadata_columns_proj = ['project_id', 'project_name', 'Project_status']
skill_columns_proj = [
    name for name in projects.columns if name not in metadata_columns_proj
]

# Prepare ML data for skill gap prediction
def prepare_ml_data(seed=42):
    """Build the long-format training table for the skill-level model.

    One record is produced per (associate, skill) pair whose level is not
    missing in ``associates``. The target ``desired_level`` is synthetic: the
    current level plus a random increment of 1 or 2, capped at 4.

    Args:
        seed: Seed for the local random generator that draws the increments.
            Defaults to 42, the same value used for ``random_state`` elsewhere
            in this notebook, so repeated runs yield the same table. Pass
            ``None`` for non-deterministic draws.

    Returns:
        pandas.DataFrame with columns ``associate_id``, ``skill``,
        ``current_level``, ``desired_level`` and ``years_of_experience``.
        The columns are present even when there are no records.
    """
    columns = ["associate_id", "skill", "current_level", "desired_level", "years_of_experience"]
    # A local generator keeps the draw reproducible without touching NumPy's global state.
    rng = np.random.default_rng(seed)
    data = []
    for _, row in associates.iterrows():
        for skill in skill_columns_assoc:
            current_level = row[skill]
            if pd.isna(current_level):
                continue
            # Labels that are not in skill_levels are counted as level 1.
            current_level_int = skill_levels.get(current_level, 1)
            # integers(1, 3) draws from {1, 2}; the upper bound is exclusive.
            desired_level = min(current_level_int + int(rng.integers(1, 3)), 4)
            data.append({
                "associate_id": row["associate_id"],
                "skill": skill,
                "current_level": current_level_int,
                "desired_level": desired_level,
                "years_of_experience": row["years_of_experience"]
            })
    return pd.DataFrame(data, columns=columns)

# Build the training table and show its schema
ml_data = prepare_ml_data()
print("Columns in ml_data:", ml_data.columns.tolist())

# Fit the RandomForest level predictor; with no training rows, `model` is left undefined
if ml_data.empty:
    print("No data to train ML model; proceeding with rule-based gaps")
else:
    X = ml_data[["current_level", "years_of_experience"]]
    y = ml_data["desired_level"]
    # 80/20 hold-out split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    print(f"Model accuracy: {model.score(X_test, y_test):.2f}")

# Predict skill gaps
def calculate_skill_gap_ml(row, skill):
    """Return the progression gap for one skill of one associate, if any.

    Args:
        row: Associate record (a pandas Series) holding the skill's level label
            and ``years_of_experience``.
        skill: Name of the skill column to evaluate.

    Returns:
        ``{"skill", "current_level", "desired_level"}`` when the target level is
        above the current one, otherwise ``None``. ``None`` is also returned
        when the skill is absent from ``row`` or its level is missing.
    """
    current_level = row[skill] if skill in row and pd.notna(row[skill]) else None
    if current_level is None:
        return None
    # Labels that are not in skill_levels are counted as level 1.
    current_level_int = skill_levels.get(current_level, 1)
    top_level = max(inverse_skill_levels)

    # `model` only exists when the training step had data; a None value is treated the same way.
    trained_model = globals().get("model")
    if trained_model is not None:
        features = pd.DataFrame([[current_level_int, row["years_of_experience"]]],
                                columns=["current_level", "years_of_experience"])
        desired_level_int = int(trained_model.predict(features)[0])
    else:
        # Rule-based fallback: aim one level up, capped at the top level so the
        # inverse_skill_levels lookup below cannot raise KeyError.
        desired_level_int = min(current_level_int + 1, top_level)

    if desired_level_int > current_level_int:
        return {"skill": skill, "current_level": current_level,
                "desired_level": inverse_skill_levels[desired_level_int]}
    return None

# Text-embedding backend for course recommendation: a transformers
# feature-extraction pipeline over distilbert-base-uncased (PyTorch framework)
nlp = pipeline(
    task="feature-extraction",
    model="distilbert-base-uncased",
    framework="pt",
)
print("NLP pipeline loaded")

# Per-text memo of embeddings. Re-running this definition resets it, which is
# what you want after swapping the `nlp` pipeline for a different model.
_EMBEDDING_CACHE = {}


def get_embedding(text):
    """Return one fixed-size vector for ``text`` from the ``nlp`` pipeline.

    The first element of the pipeline output (presumably the per-token vectors
    of the single input; confirm against the pipeline's output format) is
    averaged over axis 0.

    String inputs are memoized, because the same skill names are embedded again
    for every recommendation. A copy is returned so callers cannot mutate the
    cached vector.

    Args:
        text: Text to embed.

    Returns:
        numpy array holding the mean over axis 0 of ``nlp(text)[0]``.
    """
    cacheable = isinstance(text, str)
    if cacheable and text in _EMBEDDING_CACHE:
        return _EMBEDDING_CACHE[text].copy()

    with torch.no_grad():
        token_vectors = nlp(text)[0]
    vector = np.asarray(np.mean(token_vectors, axis=0))

    if cacheable:
        _EMBEDDING_CACHE[text] = vector
        return vector.copy()
    return vector

# Embed every course once, keyed on its 'Title' text, and keep the vectors on the frame
course_titles = learning_content["Title"]
learning_content["embedding"] = course_titles.map(get_embedding)
print("Course embeddings computed")

def recommend_course_nlp(skill, desired_level, delivery_mode=None):
    """Pick the course whose title embedding is closest to the skill name.

    Courses are first restricted to ``Level == desired_level`` and, when a
    delivery mode is given, to ``Format == delivery_mode``. Cosine similarity
    against the skill embedding is then computed for the survivors only.

    Args:
        skill: Skill name to embed and match.
        desired_level: Value compared against the ``Level`` column.
        delivery_mode: Optional value compared against the ``Format`` column.
            ``None``, an empty string or a missing value (NaN) means
            "no format filter".

    Returns:
        The ``Title`` of the best-matching course, or the string
        ``"No suitable course found"`` when no course passes the filters.
    """
    candidates = learning_content[learning_content["Level"] == desired_level]
    # NaN is truthy in Python, so a missing preference has to be ruled out explicitly.
    has_mode = (
        delivery_mode is not None
        and not pd.isna(delivery_mode)
        and bool(delivery_mode)
    )
    if has_mode:
        candidates = candidates[candidates["Format"] == delivery_mode]
    if candidates.empty:
        # Nothing to rank, so skip the (expensive) skill embedding altogether.
        return "No suitable course found"

    skill_embedding = np.asarray(get_embedding(skill), dtype=float)
    skill_norm = np.linalg.norm(skill_embedding)  # invariant across candidates

    def cosine_to_skill(course_embedding):
        vector = np.asarray(course_embedding, dtype=float)
        denominator = np.linalg.norm(vector) * skill_norm
        if denominator == 0 or not np.isfinite(denominator):
            # Undefined similarity ranks last instead of producing NaN.
            return -np.inf
        return float(np.dot(vector, skill_embedding) / denominator)

    scores = candidates["embedding"].apply(cosine_to_skill).to_numpy(dtype=float)
    # Positional lookup returns a single title even if the index has duplicates.
    return candidates["Title"].iloc[int(np.argmax(scores))]

# Identify skill gaps and create learning paths
def _known_level_int(level):
    """Integer rank of a level label; 0 when the associate has no level for the skill."""
    if pd.isna(level):
        return 0
    # Labels that are present but not in skill_levels are counted as level 1.
    return skill_levels.get(level, 1)


def _next_level_entries(raw_skills, source, assoc_skills, delivery_mode):
    """Learning-path entries for a comma-separated list of recommended skills.

    Each named skill is targeted at the level just above the associate's
    current one; skills already at the top level yield no entry.
    """
    entries = []
    if pd.isna(raw_skills):
        return entries
    top_level = max(inverse_skill_levels)
    for token in str(raw_skills).split(","):
        skill = token.strip()
        if not skill:
            # Blank tokens come from stray or trailing commas.
            continue
        current_level_int = _known_level_int(assoc_skills.get(skill))
        if current_level_int >= top_level:
            continue
        desired_level = inverse_skill_levels[current_level_int + 1]
        course = recommend_course_nlp(skill, desired_level, delivery_mode)
        entries.append({"type": source, "skill": skill, "course": course})
    return entries


def create_learning_path(row):
    """Assemble the list of recommended courses for one associate.

    Entries are collected from three sources, in this order:
      1. "project": skills the associate's project requires at a higher level
         than the associate holds.
      2. "manager" / "department": skills named in
         ``skills_recommended_by_manager`` / ``skills_recommended_by_department``,
         each targeted one level above the current one.
      3. "progression": gaps reported by ``calculate_skill_gap_ml`` for skills
         the associate already has.

    A skill the associate has no level for counts as level 0, so an L1
    requirement is flagged and the first recommended course is the L1 one.

    Args:
        row: One record of ``associates`` (a pandas Series).

    Returns:
        List of dicts with keys ``type``, ``skill`` and ``course``.
    """
    associate_id = row["associate_id"]
    learning_path = []

    # Associate's level label per skill column (NaN where the skill is not held).
    assoc_skills = {skill: row[skill] for skill in skill_columns_assoc}

    # Preferred delivery format; a missing preference means "no format filter".
    assoc_prefs = preferences[preferences["associate_id"] == associate_id]
    delivery_mode = None
    if not assoc_prefs.empty:
        preferred_format = assoc_prefs["preferred_format"].iloc[0]
        if pd.notna(preferred_format):
            delivery_mode = preferred_format

    # 1. Project-based skill gaps
    assoc_projects = projects[projects["project_id"] == row["project_id"]]
    for _, proj in assoc_projects.iterrows():
        for req_skill in skill_columns_proj:
            req_level = proj[req_skill]
            if pd.isna(req_level):
                continue
            req_level_int = skill_levels.get(req_level, 1)
            current_level_int = _known_level_int(assoc_skills.get(req_skill))
            if current_level_int < req_level_int:
                course = recommend_course_nlp(req_skill, req_level, delivery_mode)
                learning_path.append({"type": "project", "skill": req_skill, "course": course})

    # 2. Manager and department recommendations
    learning_path.extend(_next_level_entries(
        row["skills_recommended_by_manager"], "manager", assoc_skills, delivery_mode))
    learning_path.extend(_next_level_entries(
        row["skills_recommended_by_department"], "department", assoc_skills, delivery_mode))

    # 3. Skill progression for skills the associate already holds
    for skill, level in assoc_skills.items():
        if pd.isna(level):
            continue
        gap = calculate_skill_gap_ml(row, skill)
        if gap:
            course = recommend_course_nlp(gap["skill"], gap["desired_level"], delivery_mode)
            learning_path.append({"type": "progression", "skill": gap["skill"], "course": course})

    return learning_path

# One learning path (list of course entries) per associate, computed row by row
associates["learning_path"] = associates.apply(create_learning_path, axis=1)
print("Learning paths assigned")

# Personalisation clusters: encode each skill level as an integer (0 when the
# label is not in skill_levels) and group associates into three KMeans clusters
skill_matrix = associates.loc[:, skill_columns_assoc].map(
    lambda level: skill_levels.get(level, 0)
)
kmeans = KMeans(n_clusters=3, random_state=42)
associates["cluster"] = kmeans.fit_predict(skill_matrix)
print("Clusters assigned")

# Persist only the identifying columns plus the computed path and cluster
filtered_associates = associates.loc[
    :, ["associate_id", "role", "learning_path", "cluster"]
]
filtered_associates.to_csv("learning_pathways.csv", index=False)

print("✅ AI-enhanced learning pathways saved successfully!")

Columns in associates_with_grouped_skills.csv: ['associate_id', 'name', 'role', 'years_of_experience', 'grade', 'project_id', 'certifications_obtained', 'certifications_in_progress', 'available_learning_time', 'learning_time_utilized_previous_year', 'System Design', 'Cloud Architecture', 'Microservices Architecture', 'Performance Optimization', 'Scalability Design', 'CI/CD Pipeline Design', 'Event-Driven Architecture', 'CQRS Pattern', 'Hexagonal Architecture', 'Domain-Driven Design', 'Service Mesh', 'API Gateway Patterns', 'EDA Patterns', 'Saga Pattern', 'Circuit Breaker', 'BFF Pattern', 'Strangler Pattern', 'Anti-Corruption Layer', 'Modular Monolith', 'Quantum Computing Architecture', 'AI System Design', 'Blockchain Architecture', 'Edge Computing Architecture', 'Data Mesh Design', 'Digital Twin Architecture', 'Compliance by Design', 'Sustainability Architecture', 'Chaos Engineering Design', 'Cognitive Architecture', 'Git', 'Agile Methodologies', 'Problem Solving', 'Communication', 'Te

In [18]:
# Debug aid: list the column labels of the learning-preferences table
print("Columns in learning_preferences.csv:", list(preferences.columns))

Columns in learning_preferences.csv: ['preference_id', 'associate_id', 'preferred_format', 'weekly_hours', 'preferred_domain']
