<a href="https://colab.research.google.com/github/bgsw404notfound/SkiSphe/blob/main/SkillSphere.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker

# Single seed shared by every random source used by this script (Faker,
# NumPy and the stdlib `random` module), so it only has to be changed in
# one place.
RANDOM_SEED = 42

# Initialize Faker
fake = Faker()
Faker.seed(RANDOM_SEED)

# Set seeds for reproducibility
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Configuration
# The first three counts share the same value; the last two differ.
num_associates = num_projects = num_skills = 500
num_learning_goals, num_external_content = 20, 100

# Skill rating levels: numeric level (1-4) -> label written into the data
SKILL_LEVELS = dict(enumerate(
    ["L1-Beginner", "L2-Dependant", "L3-Independant", "L4-Expert"],
    start=1,
))

# ======================
# ROLE-SPECIFIC EXPERIENCE RANGES
# ======================
# Maps each role name to a (min_years, max_years) pair. Both bounds are
# inclusive: they are passed straight to random.randint when an associate's
# experience is assigned.
# NOTE: insertion order matters -- the `roles` list is built from these keys
# and is paired position-by-position with `role_weights`.
role_experience_ranges = {
    'Software Developer': (0, 8),
    'Software Tester': (0, 8),
    'Security Engineer': (0, 8),
    'DevOps Engineer': (0, 8),
    'Data Scientist': (3, 10),
    'Cloud Architect': (10, 20),
    'Software Architect': (10, 20),
    'Project Manager': (15, 25),
    'Tech Lead': (8, 15),
    'Module Lead': (5, 12),
    'Group Manager': (10, 20),
    'Database Administrator': (3, 15),
    'UX Designer': (2, 12)
}

# ======================
# ROLE-SPECIFIC SKILLS
# ======================
# Maps a role name to the list of skill names specific to that role.
# - Skill names are not unique across roles (for example 'Python', 'SQL',
#   'Terraform' and 'Kubernetes' each appear under two roles), so code that
#   needs a flat list of distinct skills has to de-duplicate.
# - List order matters wherever these lists are fed to seeded random
#   sampling.
# - NOTE(review): 'Module Lead' and 'Group Manager' exist in
#   role_experience_ranges but have no entry here, so
#   role_skills.get(role, []) yields an empty list for them -- confirm that
#   this is intended.
role_skills = {
    'Software Developer': [
        'Python', 'Java', 'JavaScript', 'TypeScript', 'Spring Boot',
        'React', 'Angular', 'REST API', 'Microservices', 'Unit Testing',
        'Kotlin', 'Swift', 'Flutter', 'Dart', 'GraphQL',
        'WebAssembly', 'RxJS', 'Next.js', 'NestJS', 'Jest',
        'Deno', 'Rust', 'Web3', 'Blockchain', 'Smart Contracts',
        'Three.js', 'Web Components', 'Electron', 'JNI', 'FFI'
    ],

    'Software Tester': [
        'Selenium', 'Cypress', 'JMeter', 'Load Testing', 'Security Testing',
        'Test Automation', 'BDD', 'Cucumber', 'Postman', 'SoapUI',
        'Playwright', 'Karate', 'Robot Framework', 'TestNG', 'JUnit',
        'Allure Reports', 'Gatling', 'BrowserStack', 'Sauce Labs', 'PACT',
        'AI Testing', 'Visual Testing', 'Mutation Testing', 'Contract Testing',
        'Accessibility Testing', 'Chaos Testing', 'Performance Engineering',
        'Test Data Management', 'Service Virtualization', 'Test Observability'
    ],

    'Software Architect': [
        'System Design', 'Cloud Architecture', 'Microservices Architecture',
        'Performance Optimization', 'Scalability Design', 'CI/CD Pipeline Design',
        'Event-Driven Architecture', 'CQRS Pattern', 'Hexagonal Architecture',
        'Domain-Driven Design', 'Service Mesh', 'API Gateway Patterns',
        'EDA Patterns', 'Saga Pattern', 'Circuit Breaker', 'BFF Pattern',
        'Strangler Pattern', 'Anti-Corruption Layer', 'Modular Monolith',
        'Quantum Computing Architecture', 'AI System Design', 'Blockchain Architecture',
        'Edge Computing Architecture', 'Data Mesh Design', 'Digital Twin Architecture',
        'Compliance by Design', 'Sustainability Architecture', 'Chaos Engineering Design',
        'Cognitive Architecture'
    ],

    'Project Manager': [
        'Project Planning', 'Risk Management', 'Resource Allocation',
        'Stakeholder Management', 'Agile Methodologies', 'Scrum',
        'Budget Forecasting', 'JIRA Administration', 'Confluence',
        'Power BI Reporting', 'Stakeholder Mapping', 'Value Stream Mapping',
        'Benefits Realization', 'Portfolio Management', 'Program Management',
        'Agile Transformation', 'Change Management', 'Business Case Development',
        'Digital Transformation', 'AI Project Management', 'Blockchain Project Governance',
        'Remote Team Leadership', 'Crisis Management', 'Merger & Acquisition Integration',
        'Sustainability Project Management', 'Regulatory Compliance Management',
        'Innovation Management', 'Future Trends Analysis'
    ],

    'Data Scientist': [
        'Python', 'Machine Learning', 'TensorFlow', 'PyTorch', 'Data Visualization',
        'Big Data', 'Spark', 'SQL', 'Statistical Analysis',
        'LangChain', 'LLM Fine-Tuning', 'Hugging Face Transformers',
        'Computer Vision', 'Generative AI', 'Prompt Engineering',
        'Feature Engineering', 'Model Deployment', 'MLOps', 'Data Pipelines',
        'Quantum Machine Learning', 'Federated Learning', 'Causal Inference',
        'Graph Machine Learning', 'Time Series Forecasting', 'Anomaly Detection',
        'Reinforcement Learning', 'AI Ethics', 'Explainable AI', 'AI Governance'
    ],

    'DevOps Engineer': [
        'Docker', 'Kubernetes', 'Terraform', 'Jenkins', 'GitLab CI/CD',
        'Prometheus', 'Grafana', 'Linux Administration', 'Cloud Infrastructure',
        'ArgoCD', 'Crossplane', 'Tekton', 'Spinnaker',
        'Chaos Engineering', 'GitHub Actions', 'Observability',
        'Infrastructure as Code', 'Policy as Code', 'GitOps',
        'Platform Engineering', 'Internal Developer Platforms', 'Service Mesh',
        'Edge Computing', 'FinOps', 'Cloud Cost Optimization', 'SBOM Management',
        'Software Supply Chain Security', 'MLOps Pipelines', 'AI Infrastructure'
    ],

    'Cloud Architect': [
        'AWS', 'Azure', 'Google Cloud', 'Terraform', 'Kubernetes',
        'Microservices', 'Serverless', 'Cloud Security', 'Networking',
        'FinOps', 'Cloud Cost Optimization', 'Cloud Migration',
        'Edge Computing', 'Multi-Cloud Strategy', 'Cloud-Native Design',
        'Cloud Governance', 'Cloud Compliance', 'Cloud Automation',
        'AI Cloud Architecture', 'Quantum Cloud Computing', 'Sustainable Cloud',
        'Cloud Carbon Footprint Reduction', 'Cloud Digital Twins', 'Cloud Metaverse',
        'Cloud Robotics', 'Cloud IoT Architecture', 'Cloud Blockchain', 'Cloud AR/VR'
    ],

    'Security Engineer': [
        'Penetration Testing', 'SIEM', 'Firewalls', 'Cryptography',
        'GDPR Compliance', 'Vulnerability Assessment', 'SOC 2',
        'Threat Modeling', 'Zero Trust Architecture', 'MITRE ATT&CK',
        'Cloud Security Posture Management', 'Secure Code Review',
        'Container Security', 'API Security', 'Identity Management',
        'AI Security', 'Quantum Cryptography', 'Blockchain Security',
        'IoT Security', '5G Security', 'Biometric Security',
        'Security Automation', 'Threat Intelligence', 'Red Teaming', 'Blue Teaming'
    ],

    'Tech Lead': [
        'Technical Leadership', 'Code Reviews', 'Mentorship',
        'System Architecture', 'Team Management', 'Agile Coaching',
        'Architectural Decision Records', 'Tech Radar Creation',
        'Engineering Metrics', 'Developer Productivity', 'Backlog Refinement',
        'Technical Debt Management', 'Incident Management', 'Post-Mortems',
        'AI Engineering Leadership', 'Platform Engineering Leadership',
        'Developer Experience', 'InnerSource Leadership', 'Open Source Strategy',
        'Technical Evangelism', 'Future Tech Adoption', 'Skills Gap Analysis',
        'Technical Career Pathing', 'Engineering Culture Development'
    ],

    'Database Administrator': [
        'SQL', 'NoSQL', 'Database Design', 'Query Optimization',
        'Backup & Recovery', 'Data Modeling', 'ETL Processes',
        'Columnar Databases', 'Vector Databases', 'Time-Series DBs',
        'Database Sharding', 'Change Data Capture', 'Database Federation',
        'Database Security', 'Database Performance Tuning', 'Data Warehousing',
        'AI Database Optimization', 'Blockchain Databases', 'Graph Database Analytics',
        'Database DevOps', 'Database as Code', 'Database Observability',
        'Database Migration', 'Multi-Model Databases', 'NewSQL', 'Database Quantum Computing'
    ],

    'UX Designer': [
        'User Research', 'Wireframing', 'Prototyping', 'UI Design',
        'Usability Testing', 'Figma', 'Adobe XD', 'User Flows',
        'Design Systems', 'Motion Design', 'Voice UI Design',
        'Augmented Reality Interfaces', 'Accessibility Design',
        'UX Writing', 'UX Metrics', 'UX Strategy', 'Service Design',
        'AI UX Design', 'Quantum UX', 'Metaverse Design',
        'Neurodesign', 'Emotional Design', 'Sustainable UX',
        'Inclusive Design', 'Cross-Cultural UX', 'UX for IoT', 'UX for Blockchain'
    ]
}

# Common skills for all roles
# This list is appended to every role's own skill list when skill ratings are
# generated, and is also sampled to pick each project's required common
# skills. 'Agile Methodologies' is additionally listed under
# 'Project Manager' in role_skills, so that role sees it twice.
common_skills = [
    'Git', 'Agile Methodologies', 'Problem Solving', 'Communication',
    'Technical Documentation', 'Debugging', 'Code Review',
    'Pair Programming', 'Mentoring', 'Stakeholder Communication',
    'Time Management', 'Critical Thinking', 'Continuous Learning',
    'Remote Collaboration', 'Digital Literacy', 'Emotional Intelligence',
    'Conflict Resolution', 'Adaptability', 'Creativity',
    'Data Literacy', 'Visual Thinking', 'Systems Thinking', 'Future Thinking'
]

# Grade definitions, listed from the lowest experience band to the highest
grades = ['Junior', 'Mid-level', 'Senior', 'Lead', 'Principal']
# Inclusive (min_years, max_years) experience band for each grade, in the
# same order as `grades`
grade_experience_ranges = dict(zip(
    grades,
    [(0, 3), (4, 7), (8, 12), (13, 15), (16, 25)],
))

# ======================
# DATA GENERATION
# ======================

# Define all_skills by combining all role-specific skills and common skills.
# Duplicates are removed with a set, and the result is then sorted: set
# iteration order for strings changes between interpreter runs (hash
# randomization), which would make the seeded random.sample() calls over
# this list non-reproducible.
all_skills = sorted(
    {skill for skills in role_skills.values() for skill in skills} | set(common_skills)
)

# Generate roles with weighted distribution.
# role_weights[i] is the relative sampling weight of roles[i], so the two
# lists must stay the same length and in the same order.
roles = [*role_experience_ranges]
role_weights = [10, 8, 5, 3, 7, 4, 4, 3, 5, 4, 2, 5, 3]

# Generate associate profiles: sequential ids, fake names and a role drawn
# for each associate according to role_weights
associate_names = [fake.name() for _ in range(num_associates)]
associate_roles = random.choices(roles, weights=role_weights, k=num_associates)
associate_profiles = pd.DataFrame({
    'associate_id': range(1, num_associates + 1),
    'name': associate_names,
    'role': associate_roles,
})

# Assign years of experience based on role
def assign_experience(role):
    """Return a random whole number of years within the role's range.

    The bounds are read from ``role_experience_ranges[role]`` and are both
    inclusive. A role missing from that mapping raises ``KeyError``.
    """
    return random.randint(*role_experience_ranges[role])

associate_profiles['years_of_experience'] = associate_profiles['role'].apply(assign_experience)


def _grade_for_experience(years):
    """Return the grade whose inclusive experience band contains ``years``.

    Args:
        years: Years of experience to classify.

    Returns:
        The matching key of ``grade_experience_ranges``.

    Raises:
        ValueError: If ``years`` lies outside every band. This replaces the
            opaque ``StopIteration`` that a bare ``next()`` would leak.
    """
    for grade, (min_exp, max_exp) in grade_experience_ranges.items():
        if min_exp <= years <= max_exp:
            return grade
    raise ValueError(f"No grade defined for {years} years of experience")


# Assign grades
associate_profiles['grade'] = associate_profiles['years_of_experience'].apply(
    _grade_for_experience
)

# Function to generate skill ratings based on experience and role using L1-L4 levels
def generate_skill_ratings(role, years_exp):
    """Build a ``{skill_name: level_label}`` mapping for one associate.

    Every skill listed for ``role`` in ``role_skills`` (none if the role is
    unknown), plus every entry of ``common_skills``, gets a random level from
    1 to 4. One level is added when ``years_exp`` is three or more, and the
    result is capped at 4. Each remaining entry of ``all_skills`` is then
    added with probability 0.1 at level 1 or 2.

    Args:
        role: Role name used to look up the role-specific skills.
        years_exp: Years of experience of the associate.

    Returns:
        Dict mapping skill name to its ``SKILL_LEVELS`` label.
    """
    # Experience bonus: 0 below three years, otherwise exactly 1.
    bonus = min(years_exp // 3, 1)

    ratings = {}
    for core_skill in role_skills.get(role, []) + common_skills:
        # Random base level plus the bonus, capped at L4-Expert.
        level = min(random.randint(1, 4) + bonus, 4)
        ratings[core_skill] = SKILL_LEVELS[level]

    # Sprinkle in skills from other domains at a lower level.
    for other_skill in all_skills:
        if other_skill in ratings:
            continue
        if random.random() < 0.1:
            ratings[other_skill] = SKILL_LEVELS[random.randint(1, 2)]

    return ratings

# Build one ratings dict per associate, following the profile row order
skill_ratings_list = [
    generate_skill_ratings(role_name, experience)
    for role_name, experience in zip(
        associate_profiles['role'], associate_profiles['years_of_experience']
    )
]

# Turn the dicts into a wide table (one 'skill_<name>' column per skill) and
# attach it column-wise to the associate profiles
skill_ratings_df = pd.DataFrame(skill_ratings_list).add_prefix('skill_')
associate_profiles = pd.concat([associate_profiles, skill_ratings_df], axis=1)

# Generate projects data with required_techstacks
projects = []

# We'll create 100 projects as requested
for project_id in range(1, 101):
    project_name = f'Project {project_id}'
    project_status = random.choice(['Current', 'Upcoming'])

    # Select 1-2 techstacks from each role's skills
    required_techstacks = []
    for skills in role_skills.values():
        required_techstacks.extend(random.sample(skills, random.randint(1, 2)))

    # Remove duplicates, then randomize order. The de-duplicated skills are
    # sorted first because set iteration order for strings differs between
    # interpreter runs; shuffling an unsorted set would not be reproducible
    # even with a fixed seed.
    required_techstacks = sorted(set(required_techstacks))
    random.shuffle(required_techstacks)

    # Select 3-5 common skills
    required_common_skills = random.sample(common_skills, random.randint(3, 5))

    projects.append({
        'project_id': project_id,
        'project_name': project_name,
        'Project_status': project_status,
        'required_techstacks': ', '.join(required_techstacks),
        'required_common_skills': ', '.join(required_common_skills)
    })

projects_df = pd.DataFrame(projects)



# Generate skill inventory: each associate gets 5-15 distinct skills drawn
# from all_skills, each with a random self-rating, last-used year and years
# of experience.
skill_inventory_rows = []
for associate_id in range(1, num_associates + 1):
    # Kept in its own local name so the module-level `num_skills`
    # configuration value is not overwritten by this loop.
    skill_count = random.randint(5, 15)
    for skill in random.sample(all_skills, skill_count):
        skill_inventory_rows.append({
            'associate_id': associate_id,
            'skill_name': skill,
            'self_rating': SKILL_LEVELS[random.randint(1, 4)],
            'last_used': random.randint(2018, 2023),
            'years_of_experience': random.randint(1, 5)
        })
skill_inventory = pd.DataFrame(skill_inventory_rows)

# Generate learning preferences (one row per associate)
domain_options = list(role_skills)
preferred_formats = np.random.choice(
    ['Video', 'Text', 'Interactive', 'Classroom'], num_associates
)
# Upper bound of np.random.randint is exclusive, so hours fall in 1-5
weekly_hours = np.random.randint(1, 6, num_associates)
preferred_domains = [random.choice(domain_options) for _ in range(num_associates)]

learning_preferences = pd.DataFrame({
    'preference_id': range(1, num_associates + 1),
    'associate_id': range(1, num_associates + 1),
    'preferred_format': preferred_formats,
    'weekly_hours': weekly_hours,
    'preferred_domain': preferred_domains,
})

# learning goals with new structure
learning_goals = []

# Every skill known to the script (all role-specific lists plus the common
# skills). It is identical for every associate, so it is built once here
# rather than on every loop iteration.
all_possible_skills = (
    {s for skills in role_skills.values() for s in skills} | set(common_skills)
)

# Index each associate's inventory skills once, instead of re-filtering the
# whole skill_inventory frame for every associate.
skills_by_associate = (
    skill_inventory.groupby('associate_id')['skill_name'].apply(set).to_dict()
)

for associate_id in range(1, num_associates + 1):
    # Skills the associate already has in the skill inventory
    current_skills = skills_by_associate.get(associate_id, set())

    # Skills not currently assigned to the associate. Sorted so the seeded
    # sampling below does not depend on set iteration order, which varies
    # between interpreter runs.
    available_skills = sorted(all_possible_skills - current_skills)

    # If no available skills (unlikely but possible), skip
    if not available_skills:
        continue

    # Select up to 3 skills per category; a skill chosen for one category is
    # removed from the pool before the next category is drawn.
    manager_recommended = random.sample(available_skills, min(3, len(available_skills)))
    remaining_skills = [s for s in available_skills if s not in manager_recommended]

    dept_recommended = random.sample(remaining_skills, min(3, len(remaining_skills)))
    remaining_skills = [s for s in remaining_skills if s not in dept_recommended]

    self_interest = random.sample(remaining_skills, min(3, len(remaining_skills)))

    # Joining an empty list yields '', so empty categories need no special case
    learning_goals.append({
        'associate_id': associate_id,
        'skills_recommended_by_manager': ', '.join(manager_recommended),
        'skills_recommended_by_department': ', '.join(dept_recommended),
        'self_interest_skills': ', '.join(self_interest)
    })

learning_goals_df = pd.DataFrame(learning_goals)

# Generate external content: a catalogue of numbered courses
content_ids = range(1, num_external_content + 1)
providers = np.random.choice(
    ['Coursera', 'Udemy', 'edX', 'Pluralsight'], num_external_content
)
# Upper bound of np.random.randint is exclusive, so durations fall in 1-29
durations = np.random.randint(1, 30, num_external_content)

# Each course targets 1-3 distinct roles; every cell holds a Python list
target_roles = []
for _ in content_ids:
    role_count = random.randint(1, 3)
    target_roles.append(random.sample(roles, role_count))

external_content = pd.DataFrame({
    'content_id': content_ids,
    'title': [f'Course {i}' for i in content_ids],
    'provider': providers,
    'duration_hours': durations,
    'target_roles': target_roles,
})

# ======================
# ASSIGN 1-3 PROJECTS TO EACH ASSOCIATE
# ======================
project_ids = projects_df['project_id'].tolist()

# Give every associate 1-3 projects, stored as a comma-separated string of
# ids in ascending numeric order.
assigned_projects = []
for _ in range(num_associates):
    picked = random.sample(project_ids, k=random.randint(1, 3))
    # The set only matters if project_ids itself contained repeats
    unique_sorted = sorted(set(picked))
    assigned_projects.append(', '.join(str(pid) for pid in unique_sorted))

associate_profiles['assigned_project'] = assigned_projects

# Save to CSV: one file per table, written without the index column
csv_outputs = [
    ('associate_profiles.csv', associate_profiles),
    ('projects.csv', projects_df),
    ('skill_inventory.csv', skill_inventory),
    ('learning_preferences.csv', learning_preferences),
    ('learning_goals.csv', learning_goals_df),
    ('external_content.csv', external_content),
]
for csv_name, table in csv_outputs:
    table.to_csv(csv_name, index=False)

print("Data generation complete with L1-L4 skill ratings for each associate!")

ModuleNotFoundError: No module named 'faker'