<a href="https://colab.research.google.com/github/bgsw404notfound/SkiSphe/blob/main/TF_Vectorbased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import requests
import os

# Load datasets
associates = pd.read_csv("/content/drive/MyDrive/SkillSphe/associates_with_grouped_skills.csv")
projects = pd.read_csv("/content/drive/MyDrive/SkillSphe/projects_with_grouped_skills.csv")
learning_content = pd.read_csv("/content/drive/MyDrive/SkillSphe/learning_content.csv")
preferences = pd.read_csv("/content/drive/MyDrive/SkillSphe/learning_preferences.csv")

# Convert project_id to string in both DataFrames to ensure consistent matching
associates["project_id"] = associates["project_id"].astype(str)
projects["project_id"] = projects["project_id"].astype(str)

# Define skill levels
skill_levels = {"L1": 1, "L2": 2, "L3": 3, "L4": 4}
inverse_skill_levels = {1: "L1", 2: "L2", 3: "L3", 4: "L4"}

# Identify skill columns
metadata_columns_assoc = ['associate_id', 'name', 'role', 'years_of_experience', 'grade', 'project_id',
                         'certifications_obtained', 'certifications_in_progress', 'available_learning_time',
                         'learning_time_utilized_previous_year', 'skills_recommended_by_manager',
                         'skills_recommended_by_department', 'skills_selected_for_own', 'L1', 'L2', 'L3', 'L4']
skill_columns_assoc = [col for col in associates.columns if col not in metadata_columns_assoc]

metadata_columns_proj = ['project_id', 'project_name', 'Project_status']
skill_columns_proj = [col for col in projects.columns if col not in metadata_columns_proj]

# Precompute TF-IDF vectors for local course titles
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(learning_content["Title"])
print("TF-IDF matrix computed")

# Function to search external MOOC platforms (simulated with a web query)
def search_external_mooc(skill, desired_level):
    # Simulated search (replace with real API in production)
    # Using a hypothetical free course aggregator API or web search as a placeholder
    query = f"{skill} {desired_level} course site:coursera.org OR site:edx.org OR site:udemy.com"
    try:
        # Placeholder for a real API call or web search
        # Example: Google Custom Search API or Class Central API would go here
        # For now, simulate a response
        response = requests.get(f"https://api.classcentral.com/search?q={query}", timeout=5)  # Hypothetical endpoint
        if response.status_code == 200:
            data = response.json()
            if data.get("results"):
                course = data["results"][0]["title"]
                platform = data["results"][0].get("platform", "External Platform")
                return f"{course} (via {platform})"
    except Exception as e:
        print(f"External search failed for {skill}: {e}")

    # Fallback response if no real API is available
    return f"{skill} {desired_level} Course (via External Search)"

# AI-based course recommendation with fallback to external MOOC platforms
def recommend_course_tfidf(skill, desired_level, delivery_mode=None):
    # Try local dataset first
    skill_vector = tfidf_vectorizer.transform([skill])
    similarities = cosine_similarity(skill_vector, tfidf_matrix).flatten()

    candidates = learning_content.copy()
    candidates["similarity"] = similarities
    candidates = candidates[candidates["Level"] == desired_level]
    if delivery_mode:
        candidates = candidates[candidates["Format"] == delivery_mode]

    candidates = candidates.sort_values(by="similarity", ascending=False)
    if not candidates.empty:
        return candidates.iloc[0]["Title"]

    # Fallback to external MOOC search
    return search_external_mooc(skill, desired_level)

# Convert skill levels to integers
def level_to_int(value):
    if isinstance(value, str) and value.startswith('L'):
        try:
            return int(value[1:])
        except:
            return 0
    try:
        return int(value)
    except:
        return 0

# Parse recommended skills
def parse_recommended_skills(skills_str):
    skills_dict = {}
    if pd.isna(skills_str):
        return skills_dict
    for item in skills_str.split(','):
        item = item.strip()
        if '-' in item:
            parts = item.split('-')
            if len(parts) == 2:
                skill = parts[0].strip()
                level = level_to_int(parts[1].strip())
                if skill:
                    skills_dict[skill] = max(level, skills_dict.get(skill, 0))
    return skills_dict

# Cluster associates based on skill profiles
skill_matrix = associates[skill_columns_assoc].applymap(level_to_int).fillna(0)
kmeans = KMeans(n_clusters=min(5, len(associates)), random_state=42)
associates["cluster"] = kmeans.fit_predict(skill_matrix)
print("Associates clustered based on skill profiles")

# Combined function with AI-based collaborative recommendations
def create_learning_path_and_trainings(row):
    associate_id = row["associate_id"]
    cluster = row["cluster"]
    project_recommendations = []
    project_skill_gaps = []
    training_recommendations = []
    training_skill_gaps = []
    current_levels = []
    recommended_levels = []
    current_project_levels = []
    recommended_project_levels = []
    collaborative_recommendations = []

    # Project-Based Learning Paths
    assoc_skills = {skill: row[skill] for skill in skill_columns_assoc}
    assoc_prefs = preferences[preferences["associate_id"] == associate_id]
    delivery_mode = assoc_prefs["preferred_format"].iloc[0] if not assoc_prefs.empty else None
    assoc_project_ids = [pid.strip() for pid in row["project_id"].split(",")]
    assoc_projects = projects[projects["project_id"].isin(assoc_project_ids)]

    if assoc_projects.empty:
        print(f"No projects found for associate {associate_id} with project_ids {assoc_project_ids}")
    else:
        max_required_levels = {}
        for skill in skill_columns_proj:
            levels = assoc_projects[skill]
            valid_levels = [skill_levels.get(level, 0) for level in levels if pd.notna(level) and level in skill_levels]
            if valid_levels:
                max_required_levels[skill] = {
                    "level": inverse_skill_levels[max(valid_levels)],
                    "level_int": max(valid_levels)
                }

        for req_skill, req_info in max_required_levels.items():
            req_level = req_info["level"]
            req_level_int = req_info["level_int"]
            current_level = assoc_skills.get(req_skill, None)
            if pd.isna(current_level):
                continue
            current_level_int = skill_levels.get(current_level, 0) if current_level in skill_levels else 0

            if current_level_int < req_level_int:
                course = recommend_course_tfidf(req_skill, req_level, delivery_mode)
                project_recommendations.append({"skill": req_skill, "course": course})
                project_skill_gaps.append({"skill": req_skill, "gap": f"L{current_level_int} → L{req_level_int}"})
                current_project_levels.append(f"{req_skill} - L{current_level_int}")
                recommended_project_levels.append(f"{req_skill} - L{req_level_int}")

    # Recommended Skills Training
    all_recommended = {}
    for col in ['skills_recommended_by_manager', 'skills_recommended_by_department', 'skills_selected_for_own']:
        rec_skills = parse_recommended_skills(row.get(col, ''))
        for skill, level in rec_skills.items():
            all_recommended[skill] = max(level, all_recommended.get(skill, 0))

    for skill, rec_level in sorted(all_recommended.items()):
        current_level = level_to_int(row.get(skill, 0))
        if rec_level > current_level:
            course = recommend_course_tfidf(skill, inverse_skill_levels[rec_level], delivery_mode)
            training_recommendations.append({"skill": skill, "course": course})
            training_skill_gaps.append(f"{skill} (L{current_level} → L{rec_level})")
            current_levels.append(f"{skill} - L{current_level}")
            recommended_levels.append(f"{skill} - L{rec_level}")

    # AI-Based Collaborative Training Recommendations
    peers = associates[associates["cluster"] == cluster]
    skill_gaps_collaborative = {}
    for skill in skill_columns_assoc:
        peer_levels = peers[skill].apply(level_to_int)
        centroid_level = int(kmeans.cluster_centers_[cluster][skill_columns_assoc.index(skill)])
        current_level = level_to_int(row.get(skill, 0))
        if centroid_level > current_level:
            gap = centroid_level - current_level
            skill_gaps_collaborative[skill] = {"gap": gap, "desired_level": inverse_skill_levels[centroid_level]}

    sorted_gaps = sorted(skill_gaps_collaborative.items(), key=lambda x: x[1]["gap"], reverse=True)[:3]
    for skill, info in sorted_gaps:
        course = recommend_course_tfidf(skill, info["desired_level"], delivery_mode)
        collaborative_recommendations.append(f"{skill}: {course}")

    # Format outputs
    project_skill_gaps_str = ", ".join(f"{gap['skill']} ({gap['gap']})" for gap in project_skill_gaps)
    trainings_by_project = ", ".join(f"{item['skill']}: {item['course']}" for item in project_recommendations)
    trainings_by_manager_dept_self = ", ".join(f"{item['skill']}: {item['course']}" for item in training_recommendations)
    collaborative_recommendations_str = ", ".join(collaborative_recommendations)

    return pd.Series({
        "project_recommendations": project_recommendations,
        "project_skill_gaps": project_skill_gaps_str if project_skill_gaps else "",
        "training_recommendations": training_recommendations,
        "skill_gaps_basedonrecommendation": ", ".join(training_skill_gaps),
        "current_skill_levels": ", ".join(current_levels),
        "recommended_skill_levels": ", ".join(recommended_levels),
        "current_skill_levels_forproject": ", ".join(current_project_levels) if current_project_levels else "",
        "recommended_skill_levels_forproject": ", ".join(recommended_project_levels) if recommended_project_levels else "",
        "trainings_recommended_by_project": trainings_by_project,
        "trainings_recommended_by_manager_department_self": trainings_by_manager_dept_self,
        "collaborative_training_recommendations": collaborative_recommendations_str
    })

# Apply the combined function
results = associates.apply(create_learning_path_and_trainings, axis=1)
associates = pd.concat([associates, results], axis=1)

# Define the desired column order
output_columns = [
    "associate_id",
    "role",
    "project_skill_gaps",
    "current_skill_levels_forproject",
    "recommended_skill_levels_forproject",
    "skill_gaps_basedonrecommendation",
    "current_skill_levels",
    "recommended_skill_levels",
    "trainings_recommended_by_project",
    "trainings_recommended_by_manager_department_self",
    "collaborative_training_recommendations"
]

# Save results
associates[output_columns].to_csv("combined_learning_pathways.csv", index=False)

print("✅ Combined project-based learning pathways, training assignments, and AI-based collaborative recommendations saved successfully!")
print(associates[output_columns].head(10))

TF-IDF matrix computed


  skill_matrix = associates[skill_columns_assoc].applymap(level_to_int).fillna(0)


Associates clustered based on skill profiles
External search failed for JUnit: HTTPSConnectionPool(host='api.classcentral.com', port=443): Max retries exceeded with url: /search?q=JUnit%20L3%20course%20site:coursera.org%20OR%20site:edx.org%20OR%20site:udemy.com (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x78e9a01abd10>: Failed to resolve 'api.classcentral.com' ([Errno -2] Name or service not known)"))
External search failed for Spring Boot: HTTPSConnectionPool(host='api.classcentral.com', port=443): Max retries exceeded with url: /search?q=Spring%20Boot%20L4%20course%20site:coursera.org%20OR%20site:edx.org%20OR%20site:udemy.com (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x78e99f17fe50>: Failed to resolve 'api.classcentral.com' ([Errno -2] Name or service not known)"))
External search failed for Computer Vision: HTTPSConnectionPool(host='api.classcentral.com', port=443): Max retries exceeded with url: /search?q=Comput