<a href="https://colab.research.google.com/github/bgsw404notfound/SkiSphe/blob/main/NLP_Based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline, logging
import torch
import os
from multiprocessing import Pool

# Quiet the transformers logger and switch off the tokenizers' own parallelism
logging.set_verbosity_error()
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Input tables (read from the mounted Google Drive folder)
associates = pd.read_csv(
    "/content/drive/MyDrive/SkillSphe/associates_with_grouped_skills.csv"
)
projects = pd.read_csv(
    "/content/drive/MyDrive/SkillSphe/projects_with_grouped_skills.csv"
)
learning_content = pd.read_csv(
    "/content/drive/MyDrive/SkillSphe/learning_content.csv"
)
preferences = pd.read_csv(
    "/content/drive/MyDrive/SkillSphe/learning_preferences.csv"
)

# project_id is handled as text in both tables
associates["project_id"] = associates["project_id"].astype(str)
projects["project_id"] = projects["project_id"].astype(str)

# Level label <-> number lookups ("L1".."L4" <-> 1..4)
skill_levels = {f"L{number}": number for number in range(1, 5)}
inverse_skill_levels = {number: label for label, number in skill_levels.items()}

# Every associate column that is not listed as metadata is treated as a skill
metadata_columns_assoc = [
    'associate_id', 'name', 'role', 'years_of_experience', 'grade',
    'project_id', 'certifications_obtained', 'certifications_in_progress',
    'available_learning_time', 'learning_time_utilized_previous_year',
    'skills_recommended_by_manager', 'skills_recommended_by_department',
    'skills_selected_for_own', 'L1', 'L2', 'L3', 'L4',
]
skill_columns_assoc = associates.columns[
    ~associates.columns.isin(metadata_columns_assoc)
].tolist()

# Same rule for the project table
metadata_columns_proj = ['project_id', 'project_name', 'Project_status']
skill_columns_proj = projects.columns[
    ~projects.columns.isin(metadata_columns_proj)
].tolist()

# Feature-extraction pipeline used to embed skill names and course titles
nlp = pipeline(task="feature-extraction", model="distilbert-base-uncased", framework="pt")
print("NLP pipeline loaded")

def get_embedding(text):
    """Embed ``text`` with the module-level ``nlp`` pipeline.

    The first element of the pipeline output is averaged over its leading
    axis and the resulting array is returned.
    NOTE(review): that element is presumably the per-token vectors of the
    single input string, which would make this mean pooling — confirm
    against the pipeline's output format.
    """
    with torch.no_grad():
        first_output = nlp(text)[0]
    return np.asarray(first_output).mean(axis=0)

# Embed every course title once and keep the vectors in an "embedding" column
print("Precomputing course embeddings...")
learning_content["embedding"] = learning_content["Title"].map(get_embedding)

# Batch recommendation function
def batch_recommend_courses(skills_levels, delivery_mode=None):
    """Pick one course title for each ``(skill, desired_level)`` pair.

    Candidates are the rows of ``learning_content`` whose ``Level`` equals
    ``desired_level`` (and whose ``Format`` equals ``delivery_mode`` when one
    is given). They are ranked first by whether the title contains the skill
    name (case-insensitive, literal substring) and then by cosine similarity
    between the precomputed title embedding and the skill embedding.

    Args:
        skills_levels: iterable of ``(skill, desired_level)`` tuples.
        delivery_mode: optional value to match against the ``Format`` column;
            falsy values disable the filter.

    Returns:
        dict mapping each ``(skill, desired_level)`` tuple to the best course
        title, or to ``"No suitable course found"`` when the skill is empty
        or no candidate survives the filters.
    """
    no_course = "No suitable course found"
    skill_embeddings = {skill: get_embedding(skill) for skill, _ in skills_levels if skill}
    results = {}
    for skill, desired_level in skills_levels:
        if not skill:
            results[(skill, desired_level)] = no_course
            continue

        candidates = learning_content[learning_content["Level"] == desired_level]
        if delivery_mode:
            candidates = candidates[candidates["Format"] == delivery_mode]
        if candidates.empty:
            results[(skill, desired_level)] = no_course
            continue
        candidates = candidates.copy()

        # Cosine similarity is only needed for the rows that passed the filters.
        skill_vec = skill_embeddings[skill]
        skill_norm = np.linalg.norm(skill_vec)
        candidates["similarity"] = candidates["embedding"].apply(
            lambda emb: np.dot(emb, skill_vec) / (np.linalg.norm(emb) * skill_norm)
        )

        # regex=False: skill names such as "C++" or ".NET" must be matched
        # literally, not interpreted as regular expressions.
        # na=False: a missing title simply counts as "no keyword match".
        candidates["keyword_match"] = (
            candidates["Title"]
            .str.lower()
            .str.contains(skill.lower(), regex=False, na=False)
            .astype(int)
        )

        candidates = candidates.sort_values(
            by=["keyword_match", "similarity"], ascending=[False, False]
        )
        results[(skill, desired_level)] = candidates.iloc[0]["Title"]
    return results

# Function to convert 'L<number>' to integer
def level_to_int(value):
    """Convert a level such as ``"L3"`` (or a plain number) to an int.

    A leading ``"L"`` is stripped from strings before conversion. Anything
    that ``int()`` cannot convert (``None``, NaN, infinities, empty or
    non-numeric text) yields ``0``.

    Args:
        value: a level label, a number, or a missing value.

    Returns:
        The integer level, or ``0`` when the value cannot be interpreted.
    """
    if isinstance(value, str) and value.startswith('L'):
        value = value[1:]
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "no level"; interrupts and
        # other unrelated exceptions are allowed to propagate.
        return 0

# Preprocess recommended skills
def _parse_skill_level_entries(entries):
    """Turn a list of ``"<skill>-<level>"`` strings into ``{skill: level_int}``."""
    parsed = {}
    for entry in entries:
        # Split on the LAST hyphen so that hyphenated skill names
        # (e.g. "Scikit-learn-L3") are kept instead of being discarded.
        name, separator, level = entry.rpartition("-")
        if not separator:
            continue  # no hyphen at all: not a skill-level entry
        parsed[name.strip()] = level_to_int(level.strip())
    return parsed


def preprocess_recommended_skills(df, columns):
    """Add a ``<col>_dict`` column of parsed skill levels for each given column.

    Each source cell is expected to hold comma-separated ``"<skill>-<level>"``
    entries. Missing cells are treated as empty strings and entries without a
    hyphen are ignored. Levels are converted with ``level_to_int``.

    Args:
        df: DataFrame to extend; it is modified in place.
        columns: names of the columns to parse.

    Returns:
        The same ``df`` object, with one extra ``<col>_dict`` column per entry
        of ``columns``; every cell of those columns is a ``{skill: int}`` dict.
    """
    for col in columns:
        split_cells = df[col].fillna("").str.split(",")
        df[col + "_dict"] = split_cells.apply(_parse_skill_level_entries)
    return df

# Columns holding "<skill>-<level>" recommendations; each gets a parsed "<col>_dict" twin
recommended_cols = [
    'skills_recommended_by_manager',
    'skills_recommended_by_department',
    'skills_selected_for_own',
]
associates = preprocess_recommended_skills(df=associates, columns=recommended_cols)

# Optimized learning path function
def _project_skill_plan(assoc_skills, project_ids, delivery_mode):
    """Recommend courses for skills the associate's projects require at a higher level.

    Returns four parallel lists: ``{"skill", "course"}`` dicts,
    ``{"skill", "gap"}`` dicts, ``"<skill> - L<current>"`` labels and
    ``"<skill> - L<required>"`` labels. All four are empty when the associate
    has no matching project or no gap.
    """
    recommendations, gaps, current_labels, required_labels = [], [], [], []

    assoc_projects = projects[projects["project_id"].isin(project_ids)]
    if assoc_projects.empty:
        return recommendations, gaps, current_labels, required_labels

    # Highest valid level demanded for each skill across the matching projects.
    max_required = {}
    for skill in skill_columns_proj:
        valid_levels = [
            skill_levels[level]
            for level in assoc_projects[skill]
            if pd.notna(level) and level in skill_levels
        ]
        if valid_levels:
            max_required[skill] = max(valid_levels)

    def current_level(skill):
        # Any value that is not a key of skill_levels (missing skill, NaN, ...)
        # counts as level 0.
        return skill_levels.get(assoc_skills.get(skill, "L0"), 0)

    skills_to_raise = [
        (skill, inverse_skill_levels[required])
        for skill, required in max_required.items()
        if current_level(skill) < required
    ]
    if not skills_to_raise:
        return recommendations, gaps, current_labels, required_labels

    courses = batch_recommend_courses(skills_to_raise, delivery_mode)
    for (skill, level), course in courses.items():
        have = current_level(skill)
        need = skill_levels.get(level, 0)
        recommendations.append({"skill": skill, "course": course})
        gaps.append({"skill": skill, "gap": f"L{have} → L{need}"})
        current_labels.append(f"{skill} - L{have}")
        required_labels.append(f"{skill} - L{need}")
    return recommendations, gaps, current_labels, required_labels


def _recommended_skill_plan(row, assoc_skills, delivery_mode):
    """Recommend courses for skills suggested by manager, department or the associate.

    Returns four parallel lists: ``{"skill", "course"}`` dicts,
    ``"<skill> (L<current> → L<recommended>)"`` gap strings,
    ``"<skill> - L<current>"`` labels and ``"<skill> - L<recommended>"`` labels.
    """
    recommendations, gaps, current_labels, recommended_labels = [], [], [], []

    # When several sources recommend the same skill, keep the highest level.
    all_recommended = {}
    for col in recommended_cols:
        for skill, level in row[col + "_dict"].items():
            all_recommended[skill] = max(level, all_recommended.get(skill, 0))

    skills_to_raise = [
        (skill, inverse_skill_levels[rec_level])
        for skill, rec_level in sorted(all_recommended.items())
        if rec_level > level_to_int(assoc_skills.get(skill, 0))
    ]
    if not skills_to_raise:
        return recommendations, gaps, current_labels, recommended_labels

    courses = batch_recommend_courses(skills_to_raise, delivery_mode)
    for (skill, level), course in courses.items():
        have = level_to_int(assoc_skills.get(skill, 0))
        want = skill_levels.get(level, 0)
        recommendations.append({"skill": skill, "course": course})
        gaps.append(f"{skill} (L{have} → L{want})")
        current_labels.append(f"{skill} - L{have}")
        recommended_labels.append(f"{skill} - L{want}")
    return recommendations, gaps, current_labels, recommended_labels


def _collaborative_plan(role, assoc_skills, delivery_mode):
    """Recommend courses for the three largest gaps to the best same-role peer.

    Returns a ``"<skill>: <course>"`` string joined with ``", "``, or an empty
    string when the associate already matches the peer maximum everywhere.
    """
    same_role_associates = associates[associates["role"] == role]

    gaps = []
    for skill in skill_columns_assoc:
        max_level = same_role_associates[skill].apply(level_to_int).max()
        if max_level > 0:
            have = level_to_int(assoc_skills.get(skill, 0))
            if have < max_level:
                gaps.append((skill, max_level, max_level - have))

    # Largest gap first; the sort is stable, so ties keep column order.
    gaps.sort(key=lambda gap: gap[2], reverse=True)
    targets = [(skill, inverse_skill_levels[target]) for skill, target, _ in gaps[:3]]
    if not targets:
        return ""

    courses = batch_recommend_courses(targets, delivery_mode)
    return ", ".join(f"{skill}: {course}" for (skill, _), course in courses.items())


def create_learning_path_and_trainings(row):
    """Build all learning-path outputs for one associate row.

    Three kinds of recommendation are produced:
      * project based - skills whose highest level required by the
        associate's projects exceeds the associate's current level;
      * recommendation based - skills proposed in the ``recommended_cols``
        columns at a level above the current one;
      * collaborative - the three skills with the largest gap to the highest
        level found among associates sharing the same role.

    Courses are chosen with ``batch_recommend_courses``, filtered by the
    associate's ``preferred_format`` from ``preferences`` when a row exists
    for this associate.

    Args:
        row: one row of ``associates`` (including the ``*_dict`` columns
            added by ``preprocess_recommended_skills``).

    Returns:
        ``pd.Series`` with the raw recommendation lists plus comma-joined
        summary strings for every output column.
    """
    associate_id = row["associate_id"]
    assoc_skills = {skill: row[skill] for skill in skill_columns_assoc}

    assoc_prefs = preferences[preferences["associate_id"] == associate_id]
    delivery_mode = assoc_prefs["preferred_format"].iloc[0] if not assoc_prefs.empty else None

    # project_id may list several comma-separated ids.
    assoc_project_ids = [pid.strip() for pid in row["project_id"].split(",")]

    (project_recommendations,
     project_skill_gaps,
     current_project_levels,
     recommended_project_levels) = _project_skill_plan(
        assoc_skills, assoc_project_ids, delivery_mode
    )

    (training_recommendations,
     training_skill_gaps,
     current_levels,
     recommended_levels) = _recommended_skill_plan(row, assoc_skills, delivery_mode)

    collaborative_trainings_str = _collaborative_plan(row["role"], assoc_skills, delivery_mode)

    project_skill_gaps_str = ", ".join(
        f"{gap['skill']} ({gap['gap']})" for gap in project_skill_gaps
    )
    trainings_by_project = ", ".join(
        f"{rec['skill']}: {rec['course']}" for rec in project_recommendations
    )
    trainings_by_manager_dept_self = ", ".join(
        f"{rec['skill']}: {rec['course']}" for rec in training_recommendations
    )

    return pd.Series({
        "project_recommendations": project_recommendations,
        "project_skill_gaps": project_skill_gaps_str,
        "training_recommendations": training_recommendations,
        "skill_gaps_basedonrecommendation": ", ".join(training_skill_gaps),
        "current_skill_levels": ", ".join(current_levels),
        "recommended_skill_levels": ", ".join(recommended_levels),
        "current_skill_levels_forproject": ", ".join(current_project_levels),
        "recommended_skill_levels_forproject": ", ".join(recommended_project_levels),
        "trainings_recommended_by_project": trainings_by_project,
        "trainings_recommended_by_manager_department_self": trainings_by_manager_dept_self,
        "collaborative_training_recommendation": collaborative_trainings_str
    })

# Parallel execution
def parallel_apply(df, func, num_processes=4):
    """Apply ``func`` to every row of ``df`` in a process pool.

    Args:
        df: DataFrame whose rows (as ``pd.Series``) are passed to ``func``.
        func: picklable callable taking one row; its return values are
            assembled with ``pd.DataFrame(results)``.
        num_processes: number of worker processes.

    Returns:
        DataFrame with one row per input row, in input order, carrying the
        same index as ``df`` so it can be aligned with ``df`` directly
        (e.g. ``pd.concat([df, result], axis=1)``).

    NOTE(review): workers run ``func`` against whatever module state they
    receive from the parent process; confirm that any loaded model is safe
    to use from pool workers in the target environment.
    """
    rows = [row for _, row in df.iterrows()]
    if not rows:
        # Nothing to do: avoid starting worker processes for an empty frame.
        return pd.DataFrame(index=df.index)

    with Pool(num_processes) as pool:
        results = pool.map(func, rows)

    out = pd.DataFrame(results)
    # pool.map preserves order, so the input index can be restored; without
    # this the result would get a fresh 0..n-1 index and misalign with df.
    out.index = df.index
    return out

# Compute the per-associate results in worker processes and attach them as new columns
print("Processing learning paths in parallel...")
results = parallel_apply(associates, create_learning_path_and_trainings)
associates = pd.concat((associates, results), axis="columns")

# Columns written to the output file, in order
output_columns = [
    "associate_id",
    "name",
    "role",
    "project_id",
    "current_skill_levels",
    "recommended_skill_levels",
    "current_skill_levels_forproject",
    "recommended_skill_levels_forproject",
    "project_skill_gaps",
    "skill_gaps_basedonrecommendation",
    "trainings_recommended_by_project",
    "trainings_recommended_by_manager_department_self",
    "collaborative_training_recommendation",
]

# Write the selected columns to CSV, then show the first ten rows
associates.loc[:, output_columns].to_csv("combined_learning_pathways.csv", index=False)

print("✅ Combined project-based learning pathways and training assignments saved successfully!")
print(associates.loc[:, output_columns].head(10))



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

NLP pipeline loaded
Precomputing course embeddings...
Processing learning paths in parallel...
