In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta
import uuid
import re

In [2]:
# Set seed for reproducibility
random.seed(42)
np.random.seed(42)
fake = Faker()
Faker.seed(42)

In [3]:
# Increase number of records for a richer graph
num_profiles = 1000
num_jobs = 300

In [4]:
# Expanded list of tech companies with size categories
tech_companies = {
    "FAANG & Big Tech": [
        "Google", "Apple", "Meta", "Amazon", "Netflix", "Microsoft", "IBM", "Oracle", 
        "Intel", "NVIDIA", "Samsung", "Twitter", "LinkedIn", "Adobe", "Salesforce", "Cisco",
        "Tesla", "Uber", "Airbnb", "PayPal", "eBay", "Spotify", "Snap Inc.", "ByteDance",
        "Tencent", "Alibaba", "Baidu", "Sony", "Dell", "HP", "Lenovo", "Qualcomm", "AMD"
    ],
    "AI & ML Focused": [
        "OpenAI", "Anthropic", "DeepMind", "Hugging Face", "Scale AI", "Stability AI", 
        "Cohere", "Adept AI", "Inflection AI", "Runway ML", "Jasper AI", "DataRobot", 
        "Weights & Biases", "Databricks", "Anyscale", "H2O.ai", "SambaNova Systems"
    ],
    "Unicorn Startups": [
        "Stripe", "Instacart", "Canva", "Notion", "Figma", "Snowflake", "Databricks", 
        "Plaid", "Airtable", "Ripple", "Robinhood", "Discord", "Clubhouse", "Vercel", 
        "Replit", "Retool", "Miro", "Deel", "Gusto", "GitLab", "Calendly", "Grammarly"
    ],
    "Enterprise & Cloud": [
        "Snowflake", "MongoDB", "Confluent", "Elastic", "Splunk", "Palantir", "Twilio", 
        "Okta", "Cloudflare", "HashiCorp", "Datadog", "ServiceNow", "Workday", "Zendesk", 
        "HubSpot", "Atlassian", "Box", "Dropbox", "Slack", "Zoom", "DocuSign"
    ],
    "Emerging Tech": [
        "Neuralink", "SpaceX", "Blue Origin", "Waymo", "Cruise", "Nuro", "Rivian", 
        "Lucid Motors", "Boston Dynamics", "Anduril", "Samsara", "Impossible Foods", 
        "Climeworks", "Relativity Space", "Comma.ai", "Humane", "Cerebras Systems"
    ]
}

In [5]:
# Flatten company list while preserving category
all_companies = []
company_to_category = {}
for category, companies in tech_companies.items():
    for company in companies:
        all_companies.append(company)
        company_to_category[company] = category

In [6]:
# Tech job titles with seniority levels and departments
tech_job_titles = {
    "Engineering": [
        "Software Engineer", "Senior Software Engineer", "Staff Software Engineer", 
        "Principal Engineer", "Software Developer", "Frontend Developer", "Backend Developer", 
        "Full Stack Developer", "Machine Learning Engineer", "DevOps Engineer", 
        "Site Reliability Engineer", "Cloud Engineer", "Systems Engineer", 
        "Infrastructure Engineer", "Embedded Systems Engineer", "Mobile Developer",
        "iOS Developer", "Android Developer", "Blockchain Developer", "AR/VR Engineer",
        "Quantum Computing Engineer", "Platform Engineer", "Graphics Engineer",
        "Engineering Manager", "Director of Engineering", "VP of Engineering", "CTO"
    ],
    "Data & AI": [
        "Data Scientist", "Senior Data Scientist", "Lead Data Scientist", "Principal Data Scientist",
        "Data Engineer", "Machine Learning Engineer", "AI Researcher", "Research Scientist", 
        "Applied Scientist", "Data Analyst", "Business Intelligence Analyst", "Data Architect", 
        "AI Ethics Researcher", "ML Ops Engineer", "Computer Vision Engineer", "NLP Engineer",
        "Quantitative Analyst", "Decision Scientist", "AI Product Manager"
    ],
    "Product & Design": [
        "Product Manager", "Senior Product Manager", "Group Product Manager", "Director of Product",
        "VP of Product", "Chief Product Officer", "Product Designer", "UX Designer", "UI Designer", 
        "UX/UI Designer", "UX Researcher", "Interaction Designer", "Visual Designer", 
        "User Experience Architect", "Creative Director", "Product Marketing Manager",
        "Technical Product Manager", "Growth Product Manager"
    ],
    "Security & IT": [
        "Security Engineer", "Information Security Analyst", "Cybersecurity Specialist",
        "Security Architect", "Penetration Tester", "Security Operations Engineer",
        "Compliance Specialist", "IT Manager", "Systems Administrator", "Network Engineer",
        "IT Support Specialist", "Database Administrator", "CISO", "IT Director"
    ],
    "Technical Support": [
        "Technical Writer", "Technical Support Engineer", "Solutions Architect", 
        "Technical Account Manager", "Customer Success Engineer", "Developer Advocate", 
        "Developer Relations Engineer", "Technical Evangelist", "Solutions Engineer",
        "Implementation Specialist", "Sales Engineer"
    ]
}

In [7]:
# Flatten job titles while preserving department
all_job_titles = []
job_to_department = {}
for department, titles in tech_job_titles.items():
    for title in titles:
        all_job_titles.append(title)
        job_to_department[title] = department

In [8]:
# Expanded tech skills with categories
tech_skills = {
    "Programming Languages": [
        "Python", "JavaScript", "TypeScript", "Java", "C++", "C#", "Go", "Rust", "Swift", 
        "Kotlin", "PHP", "Ruby", "Scala", "Dart", "R", "MATLAB", "Objective-C",
        "Haskell", "Julia", "Elixir", "Clojure", "Groovy", "Perl", "Lua"
    ],
    "Frontend": [
        "React", "Angular", "Vue.js", "Next.js", "Svelte", "Redux", "HTML5", "CSS3", 
        "SASS/SCSS", "Tailwind CSS", "Bootstrap", "jQuery", "Webpack", "Vite", 
        "Responsive Design", "Web Accessibility", "PWA", "WebGL", "D3.js", "Three.js"
    ],
    "Backend": [
        "Node.js", "Django", "Flask", "Spring Boot", "Express.js", "Ruby on Rails", 
        "Laravel", "ASP.NET", "FastAPI", "NestJS", "GraphQL", "REST API", "SOAP",
        "Microservices", "Serverless", "gRPC", "WebSockets", "OAuth", "JWT"
    ],
    "Database & Storage": [
        "SQL", "PostgreSQL", "MySQL", "MongoDB", "Cassandra", "Redis", "Elasticsearch", 
        "DynamoDB", "Firebase", "Neo4j", "MariaDB", "SQLite", "Oracle DB", "Microsoft SQL Server", 
        "Supabase", "CouchDB", "InfluxDB", "Snowflake", "Redshift", "BigQuery", "Ceph"
    ],
    "DevOps & Cloud": [
        "AWS", "Azure", "Google Cloud", "Docker", "Kubernetes", "Terraform", "CI/CD", 
        "Jenkins", "GitHub Actions", "GitLab CI", "CircleCI", "Ansible", "Chef", "Puppet",
        "Prometheus", "Grafana", "ELK Stack", "Istio", "Helm", "CloudFormation", "Pulumi"
    ],
    "Data Science & AI": [
        "TensorFlow", "PyTorch", "scikit-learn", "Pandas", "NumPy", "Keras", "Hugging Face",
        "JAX", "ONNX", "LangChain", "Spark", "Hadoop", "Airflow", "Luigi", "Databricks",
        "Dask", "Ray", "Transformers", "NLTK", "spaCy", "Computer Vision", "NLP", "MLOps",
        "Reinforcement Learning", "GAN", "Diffusion Models", "LLMs", "RAG"
    ],
    "Mobile & Edge": [
        "iOS Development", "Android Development", "React Native", "Flutter", "Xamarin",
        "SwiftUI", "Kotlin Multiplatform", "Unity", "Unreal Engine", "ARKit", "ARCore",
        "Jetpack Compose", "Core ML", "TensorFlow Lite", "Bluetooth LE", "PWA", "Capacitor"
    ],
    "Version Control & Tools": [
        "Git", "GitHub", "GitLab", "Bitbucket", "Jira", "Confluence", "Notion", "Trello",
        "Asana", "Slack", "Microsoft Teams", "Linear", "Figma", "Sketch", "Postman", "cURL",
        "VS Code", "IntelliJ", "PyCharm", "Jupyter Notebook", "Colab", "Obsidian"
    ],
    "Methodologies & Practices": [
        "Agile", "Scrum", "Kanban", "DevOps", "CI/CD", "TDD", "BDD", "DDD", "XP",
        "Microservices Architecture", "Distributed Systems", "System Design", "Clean Code",
        "Design Patterns", "SOLID Principles", "Functional Programming", "Reactive Programming"
    ],
    "Specialized": [
        "Blockchain", "Web3", "Solidity", "Cryptography", "Cybersecurity", "Penetration Testing",
        "AR/VR", "Game Development", "Embedded Systems", "IoT", "Quantum Computing",
        "Bioinformatics", "Robotics", "High-Performance Computing", "Prompt Engineering",
        "Ethical Hacking", "Big Data", "Speech Recognition"
    ]
}

# Flatten skills while preserving category
all_skills = []
skill_to_category = {}
for category, skills in tech_skills.items():
    for skill in skills:
        all_skills.append(skill)
        skill_to_category[skill] = category

# Educational institutions with tiers
tech_institutions = {
    "Top US": [
        "Stanford University", "Massachusetts Institute of Technology", "Harvard University", 
        "University of California, Berkeley", "Carnegie Mellon University", "California Institute of Technology", 
        "University of Washington", "Georgia Institute of Technology", "University of Illinois at Urbana-Champaign", 
        "Cornell University", "Princeton University", "University of Michigan", "University of Texas at Austin"
    ],
    "Other US": [
        "Columbia University", "University of California, Los Angeles", "University of Wisconsin-Madison", 
        "University of California, San Diego", "Purdue University", "University of Pennsylvania", 
        "University of Southern California", "Yale University", "University of Maryland", "New York University",
        "University of Chicago", "Rice University", "University of Minnesota", "Ohio State University",
        "Arizona State University", "Virginia Tech", "University of Colorado Boulder", "Boston University"
    ],
    "International Elite": [
        "University of Cambridge", "University of Oxford", "ETH Zurich", "Imperial College London",
        "National University of Singapore", "Tsinghua University", "Peking University", "University of Toronto",
        "University of British Columbia", "University of Tokyo", "Technical University of Munich",
        "École Polytechnique Fédérale de Lausanne (EPFL)", "University of Melbourne", "University of Sydney"
    ],
    "Other International": [
        "University of Waterloo", "McGill University", "University of Hong Kong", "Seoul National University",
        "Technion – Israel Institute of Technology", "Indian Institute of Technology Bombay", 
        "Indian Institute of Technology Delhi", "University of Edinburgh", "University of Amsterdam",
        "University of Copenhagen", "KTH Royal Institute of Technology", "Delft University of Technology",
        "KAIST", "Nanyang Technological University", "University of São Paulo", "University of Cape Town"
    ],
    "Online & Bootcamps": [
        "Coursera", "edX", "Udacity", "freeCodeCamp", "Lambda School (Bloom Institute of Technology)", 
        "App Academy", "Hack Reactor", "General Assembly", "Flatiron School", "Le Wagon",
        "Springboard", "Codecademy", "DataCamp", "Pluralsight", "Khan Academy", "Udemy"
    ]
}

In [9]:
# Flatten institutions while preserving category
all_institutions = []
institution_to_category = {}
for category, institutions in tech_institutions.items():
    for institution in institutions:
        all_institutions.append(institution)
        institution_to_category[institution] = category

In [10]:
# Expanded degrees and fields of study
degrees = {
    "Undergraduate": [
        "Bachelor of Science", "Bachelor of Arts", "Bachelor of Engineering", 
        "Bachelor of Applied Science", "Bachelor of Technology", "Bachelor of Computer Science",
        "Associate's Degree", "Bachelor of Fine Arts", "Bachelor of Business Administration"
    ],
    "Graduate": [
        "Master of Science", "Master of Engineering", "Master of Computer Science",
        "Master of Business Administration", "Master of Arts", "Master of Information Technology",
        "Master of Data Science", "Master of Artificial Intelligence", "Master of Cybersecurity"
    ],
    "Doctorate": [
        "Doctor of Philosophy (Ph.D.)", "Doctor of Science", "Doctor of Engineering",
        "Doctor of Information Technology", "Doctor of Business Administration"
    ],
    "Certificate": [
        "Professional Certificate", "Graduate Certificate", "Specialized Certificate",
        "Technical Certificate", "Certificate Program", "Nanodegree", "Professional Diploma"
    ]
}

In [11]:
# Flatten degrees while preserving category
all_degrees = []
degree_to_category = {}
for category, degree_list in degrees.items():
    for degree in degree_list:
        all_degrees.append(degree)
        degree_to_category[degree] = category


In [12]:
# Expanded fields of study
fields_of_study = [
    "Computer Science", "Computer Engineering", "Software Engineering", "Electrical Engineering", 
    "Information Technology", "Data Science", "Artificial Intelligence", "Machine Learning", 
    "Cybersecurity", "Information Systems", "Mathematics", "Statistics", "Physics", 
    "Cognitive Science", "Human-Computer Interaction", "Robotics", "Business Administration",
    "Computational Linguistics", "Digital Media", "Game Development", "Network Engineering",
    "Quantum Computing", "Web Development", "Mobile App Development", "Cloud Computing",
    "Bioinformatics", "Computational Biology", "Business Analytics", "Management Information Systems",
    "Computer Graphics", "Computer Vision", "Natural Language Processing", "Systems Engineering",
    "User Experience Design", "Digital Marketing", "Product Management", "Project Management",
    "Blockchain Technology", "Internet of Things", "Embedded Systems", "Computer Architecture",
    "Telecommunications", "Information Security", "Database Management", "Big Data Analytics",
    "Distributed Systems", "High-Performance Computing", "Parallel Computing", "Quantum Information Science"
]

In [13]:
# Expanded cities and countries with regional classification
tech_cities_countries = {
    "US West Coast": [
        ("San Francisco", "United States"), ("Seattle", "United States"), 
        ("Los Angeles", "United States"), ("San Jose", "United States"), 
        ("Mountain View", "United States"), ("Palo Alto", "United States"),
        ("Menlo Park", "United States"), ("Cupertino", "United States"), 
        ("Portland", "United States"), ("San Diego", "United States"),
        ("Bellevue", "United States"), ("Redmond", "United States"),
        ("Oakland", "United States"), ("Santa Monica", "United States"),
        ("Sunnyvale", "United States"), ("Irvine", "United States")
    ],
    "US Other": [
        ("New York", "United States"), ("Austin", "United States"), 
        ("Boston", "United States"), ("Chicago", "United States"),
        ("Denver", "United States"), ("Atlanta", "United States"),
        ("Dallas", "United States"), ("Washington DC", "United States"),
        ("Miami", "United States"), ("Boulder", "United States"),
        ("Pittsburgh", "United States"), ("Raleigh", "United States"),
        ("Salt Lake City", "United States"), ("Nashville", "United States"),
        ("Minneapolis", "United States"), ("Phoenix", "United States"),
        ("Detroit", "United States"), ("Philadelphia", "United States")
    ],
    "Canada": [
        ("Toronto", "Canada"), ("Vancouver", "Canada"), ("Montreal", "Canada"),
        ("Ottawa", "Canada"), ("Waterloo", "Canada"), ("Calgary", "Canada"),
        ("Edmonton", "Canada"), ("Quebec City", "Canada")
    ],
    "Europe": [
        ("London", "United Kingdom"), ("Berlin", "Germany"), ("Amsterdam", "Netherlands"),
        ("Paris", "France"), ("Dublin", "Ireland"), ("Zurich", "Switzerland"),
        ("Stockholm", "Sweden"), ("Helsinki", "Finland"), ("Oslo", "Norway"),
        ("Copenhagen", "Denmark"), ("Madrid", "Spain"), ("Barcelona", "Spain"),
        ("Lisbon", "Portugal"), ("Munich", "Germany"), ("Brussels", "Belgium"),
        ("Vienna", "Austria"), ("Warsaw", "Poland"), ("Prague", "Czech Republic"),
        ("Milan", "Italy"), ("Rome", "Italy"), ("Edinburgh", "United Kingdom"),
        ("Cambridge", "United Kingdom"), ("Oxford", "United Kingdom")
    ],
    "Asia": [
        ("Bangalore", "India"), ("Hyderabad", "India"), ("Pune", "India"),
        ("Mumbai", "India"), ("Delhi", "India"), ("Tokyo", "Japan"),
        ("Seoul", "South Korea"), ("Singapore", "Singapore"),
        ("Hong Kong", "China"), ("Shanghai", "China"), ("Beijing", "China"),
        ("Shenzhen", "China"), ("Taipei", "Taiwan"), ("Guangzhou", "China"),
        ("Manila", "Philippines"), ("Jakarta", "Indonesia"), ("Kuala Lumpur", "Malaysia"),
        ("Bangkok", "Thailand"), ("Ho Chi Minh City", "Vietnam")
    ],
    "Other Regions": [
        ("Sydney", "Australia"), ("Melbourne", "Australia"), ("Brisbane", "Australia"),
        ("Tel Aviv", "Israel"), ("Dubai", "United Arab Emirates"), ("Abu Dhabi", "United Arab Emirates"),
        ("Cape Town", "South Africa"), ("Johannesburg", "South Africa"), ("Nairobi", "Kenya"),
        ("Lagos", "Nigeria"), ("São Paulo", "Brazil"), ("Rio de Janeiro", "Brazil"),
        ("Buenos Aires", "Argentina"), ("Mexico City", "Mexico"), ("Santiago", "Chile"),
        ("Auckland", "New Zealand"), ("Wellington", "New Zealand")
    ]
}

In [14]:
# Flatten cities and countries while preserving region
all_cities_countries = []
location_to_region = {}
for region, locations in tech_cities_countries.items():
    for location in locations:
        all_cities_countries.append(location)
        location_to_region[location] = region


In [15]:
# Employment types
employment_types = [
    "Full-time", "Part-time", "Contract", "Internship", "Freelance", 
    "Apprenticeship", "Self-employed", "Temporary"
]

In [16]:
# Remote status
remote_statuses = ["Onsite", "Hybrid", "Remote"]

In [17]:
# Experience levels
experience_levels = {
    "Entry": ["Entry-level", "Junior"],
    "Mid": ["Mid-level", "Intermediate"],
    "Senior": ["Senior", "Lead", "Staff"],
    "Leadership": ["Principal", "Director", "Head of", "Executive", "Chief"]
}

In [18]:
# Flatten experience levels
all_experience_levels = []
for category, levels in experience_levels.items():
    all_experience_levels.extend(levels)

In [19]:
# Helper function to determine title level
def get_title_level(title):
    if any(keyword in title for keyword in ["Chief", "CEO", "CTO", "CFO", "COO", "VP", "Head", "Director"]):
        return 4
    elif any(keyword in title for keyword in ["Principal", "Lead", "Senior Staff", "Staff", "Architect"]):
        return 3
    elif any(keyword in title for keyword in ["Senior", "Manager", "Sr."]):
        return 2
    elif not any(keyword in title.lower() for keyword in ["junior", "intern", "associate"]):
        return 1
    else:
        return 0

# Helper function to get title for level
def get_title_for_level(level):
    if level == 4:
        return random.choice(["Chief Technology Officer", "VP of Engineering", "Director of Engineering", 
                             "VP of Product", "Head of Data Science", "Director of Product"])
    elif level == 3:
        return random.choice(["Principal Engineer", "Lead Software Engineer", "Staff Engineer",
                             "Engineering Manager", "Technical Lead", "Product Lead", "Staff Data Scientist"])
    elif level == 2:
        return random.choice(["Senior Software Engineer", "Senior Data Scientist", "Senior Product Manager",
                             "Senior UX Designer", "Senior Frontend Developer", "Senior Backend Developer"])
    elif level == 1:
        return random.choice(["Software Engineer", "Data Scientist", "Product Manager", 
                             "UX Designer", "Frontend Developer", "Backend Developer"])
    else:
        return random.choice(["Junior Software Engineer", "Associate Developer", "Software Engineering Intern",
                             "Junior Data Scientist", "Product Analyst", "UX Design Intern"])

In [20]:
# Generate realistic job accomplishments
def generate_job_accomplishments(title, level, skills):
    accomplishments = []
    
    # Technical accomplishments
    tech_templates = [
        f"Developed {random.choice(['a new', 'an innovative', 'a scalable', 'a high-performance'])} {random.choice(['feature', 'system', 'service', 'application', 'tool'])} using {', '.join(random.sample(skills, min(2, len(skills))))}",
        f"Implemented {random.choice(['optimizations', 'improvements', 'enhancements'])} that {random.choice(['reduced latency by', 'increased throughput by', 'improved efficiency by'])} {random.randint(20, 95)}%",
        f"Built {random.choice(['and maintained', 'and deployed', 'and architected'])} {random.choice(['microservices', 'APIs', 'data pipelines', 'frontend components', 'ML models'])} for {random.choice(['critical', 'customer-facing', 'internal', 'core'])} applications",
        f"Refactored {random.choice(['legacy code', 'monolithic application', 'outdated systems'])} to {random.choice(['improve maintainability', 'enhance performance', 'reduce technical debt', 'enable new features'])}"
    ]
    
    # Leadership accomplishments
    leadership_templates = [
        f"Led a team of {random.randint(3, 15)} {random.choice(['engineers', 'developers', 'professionals', 'specialists'])} to {random.choice(['deliver key features', 'complete critical projects', 'implement new systems', 'launch new products'])}",
        f"Mentored {random.randint(2, 10)} {random.choice(['junior engineers', 'new team members', 'interns', 'colleagues'])} on {random.choice(['best practices', 'technical skills', 'career development', 'system architecture'])}",
        f"Collaborated with {random.choice(['product teams', 'cross-functional teams', 'stakeholders', 'clients'])} to {random.choice(['define requirements', 'prioritize features', 'plan releases', 'deliver solutions'])}",
        f"Drove {random.choice(['technical roadmap', 'architecture decisions', 'adoption of new technologies', 'implementation strategy'])} for {random.choice(['team', 'department', 'organization', 'product line'])}"
    ]
    
    # Project accomplishments
    project_templates = [
        f"Successfully {random.choice(['delivered', 'launched', 'implemented', 'completed'])} {random.choice(['key projects', 'critical initiatives', 'major features', 'system migration'])} {random.choice(['ahead of schedule', 'under budget', 'with high quality', 'with positive customer feedback'])}",
        f"Architected and implemented {random.choice(['a scalable', 'a robust', 'an efficient', 'a maintainable'])} solution for {random.choice(['data processing', 'user authentication', 'content delivery', 'real-time analytics', 'recommendation system'])}",
        f"Reduced {random.choice(['system downtime', 'error rates', 'infrastructure costs', 'page load times'])} by {random.randint(20, 90)}% through {random.choice(['optimization', 'refactoring', 'innovative solutions', 'architectural improvements'])}",
        f"Designed and built {random.choice(['internal tools', 'automation systems', 'testing frameworks', 'monitoring solutions'])} that {random.choice(['improved team productivity', 'enhanced code quality', 'streamlined workflows', 'reduced manual effort'])}"
    ]
    
    # Data/AI accomplishments (for data-related roles)
    data_ai_templates = [
        f"Developed {random.choice(['machine learning models', 'data pipelines', 'recommendation systems', 'predictive algorithms'])} that {random.choice(['increased user engagement', 'improved accuracy', 'reduced processing time', 'enhanced decision-making'])} by {random.randint(15, 85)}%",
        f"Implemented {random.choice(['data infrastructure', 'analytics dashboards', 'ETL processes', 'real-time data processing'])} to {random.choice(['support business decisions', 'track key metrics', 'enable data-driven features', 'improve user experience'])}",
        f"Created {random.choice(['custom algorithms', 'optimization techniques', 'statistical methods', 'data visualization tools'])} for {random.choice(['complex business problems', 'user behavior analysis', 'performance monitoring', 'anomaly detection'])}",
        f"Built and deployed {random.choice(['production ML systems', 'NLP models', 'computer vision solutions', 'recommendation engines'])} using {', '.join(random.sample(skills, min(2, len(skills))))}"
    ]
    
    # Select appropriate templates based on job title
    all_templates = []
    
    # Everyone gets technical accomplishments
    all_templates.extend(tech_templates)
    
    # Leadership roles get leadership accomplishments
    if level >= 2 or any(word in title.lower() for word in ["lead", "manager", "director", "head", "chief", "senior", "principal"]):
        all_templates.extend(leadership_templates)
    
    # Everyone gets project accomplishments
    all_templates.extend(project_templates)
    
    # Data/AI roles get data accomplishments
    if any(word in title.lower() for word in ["data", "machine learning", "ml", "ai", "analytics", "scientist"]):
        all_templates.extend(data_ai_templates)
    
    # Generate 3-5 unique accomplishments
    num_accomplishments = random.randint(3, 5)
    if len(all_templates) <= num_accomplishments:
        return all_templates
    
    return random.sample(all_templates, num_accomplishments)

In [21]:
# Generate realistic LinkedIn profiles
def generate_linkedin_profiles(num_profiles):
    profiles = []
    
    # Create a distribution of experience levels - weighted toward mid-level and senior
    experience_distribution = ["Entry"] * 25 + ["Mid"] * 40 + ["Senior"] * 25 + ["Leadership"] * 10
    
    for i in range(num_profiles):
        profile_id = str(uuid.uuid4())
        
        # Generate demographic info
        first_name = fake.first_name()
        last_name = fake.last_name()
        full_name = f"{first_name} {last_name}"
        
        # Create a unique, realistic LinkedIn URL with occasional numbers
        if random.random() < 0.3:
            url_suffix = str(random.randint(1, 999))
        else:
            url_suffix = ""
            
        # Format URL with various possible patterns
        url_pattern = random.choice([
            f"{first_name.lower()}-{last_name.lower()}{url_suffix}",
            f"{first_name.lower()}.{last_name.lower()}{url_suffix}",
            f"{last_name.lower()}.{first_name.lower()[0]}{url_suffix}",
            f"{first_name.lower()[0]}{last_name.lower()}{url_suffix}"
        ])
        linkedin_url = f"https://linkedin.com/in/{url_pattern}"
        
        # Determine experience level for this profile
        experience_tier = random.choice(experience_distribution)
        
        # Select appropriate title based on experience level
        if experience_tier == "Leadership":
            department = random.choice(list(tech_job_titles.keys()))
            possible_titles = [title for title in tech_job_titles[department] 
                              if any(level in title for level in experience_levels["Leadership"])]
            if not possible_titles:  # Fallback if no leadership titles in that department
                possible_titles = ["Director of " + department, "Head of " + department, 
                                  "VP of " + department, "Chief " + department[0] + "O"]
            title = random.choice(possible_titles)
        else:
            # For non-leadership, pick from appropriate departments
            department = random.choice(list(tech_job_titles.keys()))
            dept_titles = tech_job_titles[department]
            
            # Filter titles by experience level
            level_keywords = experience_levels[experience_tier]
            matching_titles = [t for t in dept_titles if any(level in t for level in level_keywords)]
            
            # If no matches, just pick any title from that department
            title = random.choice(matching_titles if matching_titles else dept_titles)
        
        # Pick a company with appropriate size for experience level
        if experience_tier == "Leadership":
            # Leadership roles more likely at established companies
            company_category = random.choices(
                ["FAANG & Big Tech", "Enterprise & Cloud", "AI & ML Focused", "Unicorn Startups", "Emerging Tech"],
                weights=[45, 25, 15, 10, 5],
                k=1
            )[0]
        elif experience_tier == "Entry":
            # Entry level more distributed
            company_category = random.choices(
                ["FAANG & Big Tech", "Enterprise & Cloud", "AI & ML Focused", "Unicorn Startups", "Emerging Tech"],
                weights=[30, 20, 15, 20, 15],
                k=1
            )[0]
        else:
            # Mid and Senior levels relatively evenly distributed
            company_category = random.choices(
                ["FAANG & Big Tech", "Enterprise & Cloud", "AI & ML Focused", "Unicorn Startups", "Emerging Tech"],
                weights=[30, 25, 15, 20, 10],
                k=1
            )[0]
            
        company = random.choice(tech_companies[company_category])
        
        headline = f"{title} at {company}"
        
        # Location selection with regional weighting
        if company in ["Tencent", "Alibaba", "Baidu"] or random.random() < 0.1:
            region = "Asia"
        elif company in ["Spotify"] or random.random() < 0.15:
            region = "Europe"
        else:
            # Default distribution favors US tech hubs
            region = random.choices(
                ["US West Coast", "US Other", "Canada", "Europe", "Asia", "Other Regions"],
                weights=[40, 25, 5, 15, 10, 5],
                k=1
            )[0]
            
        # Get cities from selected region
        regional_locations = tech_cities_countries[region]
        city, country = random.choice(regional_locations)
        
        # Generate skills based on job title and department
        primary_skill_category = ""
        if "Engineer" in title or "Developer" in title:
            primary_skill_category = "Programming Languages"
            secondary_categories = ["DevOps & Cloud", "Version Control & Tools", "Backend", "Frontend"]
        elif "Data" in title or "Machine Learning" in title or "AI" in title:
            primary_skill_category = "Data Science & AI"
            secondary_categories = ["Programming Languages", "Database & Storage", "Version Control & Tools"]
        elif "Product" in title or "Design" in title:
            primary_skill_category = "Frontend"
            secondary_categories = ["Version Control & Tools", "Methodologies & Practices"]
        elif "Security" in title or "IT" in title:
            primary_skill_category = "DevOps & Cloud"
            secondary_categories = ["Specialized", "Database & Storage", "Programming Languages"]
        else:
            # Default mix
            primary_skill_category = random.choice(list(tech_skills.keys()))
            secondary_categories = random.sample([k for k in tech_skills.keys() if k != primary_skill_category], 3)
        
        # Select skills with weighting toward primary category
        num_skills = random.randint(5, 10)  # More realistic skill count
        primary_skills = random.sample(tech_skills[primary_skill_category], 
                                     min(len(tech_skills[primary_skill_category]), random.randint(2, 4)))
        
        secondary_skills = []
        for category in secondary_categories:
            secondary_skills.extend(random.sample(tech_skills[category], 
                                               min(len(tech_skills[category]), random.randint(1, 2))))
        
        # Combine and limit to num_skills
        profile_skills = primary_skills + secondary_skills
        if len(profile_skills) > num_skills:
            profile_skills = random.sample(profile_skills, num_skills)
        
        # Generate summary based on experience level, skills and title
        experience_years = 1
        if experience_tier == "Entry":
            experience_years = random.randint(0, 3)
        elif experience_tier == "Mid":
            experience_years = random.randint(3, 8)
        elif experience_tier == "Senior":
            experience_years = random.randint(8, 15)
        else:  # Leadership
            experience_years = random.randint(12, 25)
        
        # Key skills to highlight (limited subset)
        key_skills = random.sample(profile_skills, min(3, len(profile_skills)))
        
        # Summary templates by experience level
        if experience_tier == "Entry":
            summary_templates = [
                f"Recent graduate with {experience_years}+ years of experience in {', '.join(key_skills)}. Excited to build innovative solutions in the tech space.",
                f"Passionate {title} with a strong foundation in {', '.join(key_skills)}. Eager to grow and learn in the technology industry.",
                f"Emerging tech professional focused on {random.choice(key_skills)}. Graduated from {random.choice(all_institutions)} with a background in {random.choice(fields_of_study)}.",
                f"Junior {title.split()[-1]} with hands-on experience in {', '.join(key_skills)}. Looking to solve challenging problems and continue developing my skills."
            ]
        elif experience_tier == "Mid":
            summary_templates = [
                f"Tech professional with {experience_years}+ years of experience specializing in {', '.join(key_skills)}. Proven track record of delivering high-quality solutions.",
                f"Experienced {title} with expertise in {', '.join(key_skills)}. Passionate about building scalable, user-centric products that solve real problems.",
                f"Versatile {title} with {experience_years}+ years in the tech industry. Skilled in {', '.join(key_skills)} with a focus on driving innovation.",
                f"{title} with a strong background in {random.choice(key_skills)} and {random.choice(key_skills)}. Committed to engineering excellence and continuous learning."
            ]
        elif experience_tier == "Senior":
            summary_templates = [
                f"Seasoned {title} with {experience_years}+ years of experience leading teams and projects. Expert in {', '.join(key_skills)} with a proven track record of success.",
                f"Senior tech professional specializing in {', '.join(key_skills)}. {experience_years}+ years of experience building complex systems and mentoring junior engineers.",
                f"Experienced {title} with deep expertise in {', '.join(key_skills)}. Passionate about solving challenging technical problems and delivering impactful solutions.",
                f"Technical leader with {experience_years}+ years of industry experience. Strong background in {', '.join(key_skills)} and a focus on driving innovation."
            ]
        else:  # Leadership
            summary_templates = [
                f"Technology leader with {experience_years}+ years of experience. Expert in {', '.join(key_skills)} with a track record of building high-performance teams and delivering transformative products.",
                f"Visionary {title} with {experience_years}+ years of experience leading teams and driving innovation. Deep expertise in {', '.join(key_skills)} and strategic planning.",
                f"Executive technology professional with extensive experience in {', '.join(key_skills)}. {experience_years}+ years of leadership in the tech industry, focused on organizational growth and excellence.",
                f"Accomplished {title} with a proven track record of success. {experience_years}+ years of experience leading teams and initiatives in {company_category} companies."
            ]
            
        summary = random.choice(summary_templates)
        
        profiles.append({
            "profile_id": profile_id,
            "first_name": first_name,
            "last_name": last_name,
            "full_name": full_name,
            "linkedin_url": linkedin_url,
            "headline": headline,
            "city": city,
            "country": country,
            "skills": ", ".join(profile_skills),
            "summary": summary,
            "experience_level": experience_tier  # Adding this for reference
        })
    
    return pd.DataFrame(profiles)

In [22]:
# Generate work experiences with more realistic career progression
def generate_work_experiences(profiles_df):
    experiences = []
    
    for _, profile in profiles_df.iterrows():
        # Determine number of experiences based on experience level
        if profile["experience_level"] == "Entry":
            num_experiences = random.randint(1, 2)
        elif profile["experience_level"] == "Mid":
            num_experiences = random.randint(2, 4)
        elif profile["experience_level"] == "Senior":
            num_experiences = random.randint(3, 5)
        else:  # Leadership
            num_experiences = random.randint(4, 7)
        
        # Start from now and work backwards
        current_date = datetime.now()
        
        # Get current company and title from headline
        headline_parts = profile["headline"].split(" at ")
        current_title = headline_parts[0] if len(headline_parts) > 1 else ""
        current_company = headline_parts[1] if len(headline_parts) > 1 else ""
        
        # Track career progression
        prev_title_level = get_title_level(current_title)
        
        for j in range(num_experiences):
            experience_id = str(uuid.uuid4())
            
            # For the first experience, use the current job from the profile
            if j == 0:
                title = current_title
                company = current_company
                
                # Duration at current job (more realistic based on level)
                if profile["experience_level"] == "Entry":
                    duration_months = random.randint(3, 24)
                elif profile["experience_level"] == "Mid":
                    duration_months = random.randint(12, 36)
                elif profile["experience_level"] == "Senior":
                    duration_months = random.randint(24, 60)
                else:  # Leadership
                    duration_months = random.randint(24, 84)
                
                # 30% chance they're still at their current job
                is_current = random.random() < 0.7
            else:
                # Previous jobs should show career progression
                # As we go back in time, decrease title level
                if prev_title_level > 0 and random.random() < 0.7:
                    prev_title_level -= 1
                
                # Get appropriate title for that level
                title = get_title_for_level(prev_title_level)
                
                # 80% chance of different company for previous roles
                if random.random() < 0.8:
                    company = random.choice(all_companies)
                
                # Duration at previous jobs
                duration_months = random.randint(6, 48)
                is_current = False
            
            end_date = None if is_current else (current_date - timedelta(days=random.randint(0, 30)))
            start_date = end_date - timedelta(days=30*duration_months) if end_date else (current_date - timedelta(days=30*duration_months))
            
            # Format dates
            start_date_str = start_date.strftime("%Y-%m-%d")
            end_date_str = None if is_current else end_date.strftime("%Y-%m-%d")
            
            # Move current_date back for the next job (with a gap of 0-3 months)
            if end_date:
                current_date = end_date - timedelta(days=random.randint(0, 90))
            
            # Location (80% chance it's the same as profile location)
            if random.random() < 0.8:
                city, country = profile["city"], profile["country"]
            else:
                city, country = random.choice(all_cities_countries)
            
            # Employment type based on title
            if "Intern" in title or "Apprentice" in title:
                emp_type = "Internship" if "Intern" in title else "Apprenticeship"
            elif "Contract" in title or "Freelance" in title:
                emp_type = "Contract" if "Contract" in title else "Freelance"
            elif "Part" in title:
                emp_type = "Part-time"
            else:
                emp_type = "Full-time"
            
            # Generate description
            profile_skills = profile["skills"].split(", ")
            skills_used = random.sample(profile_skills, min(random.randint(2, 4), len(profile_skills)))
            
            # Generate realistic accomplishments based on title and level
            accomplishments = generate_job_accomplishments(title, prev_title_level, skills_used)
            
            description = "\n".join([f"• {accomplishment}" for accomplishment in accomplishments])
            
            experiences.append({
                "experience_id": experience_id,
                "profile_id": profile["profile_id"],
                "title": title,
                "company": company,
                "start_date": start_date_str,
                "end_date": end_date_str,
                "city": city,
                "country": country,
                "description": description,
                "employment_type": emp_type
            })
    
    return pd.DataFrame(experiences)

In [23]:
# Generate educations with realistic progression
def generate_educations(profiles_df):
    educations = []
    
    for _, profile in profiles_df.iterrows():
        # Determine number of educations based on experience level
        if profile["experience_level"] == "Entry":
            num_educations = random.randint(1, 2)
        elif profile["experience_level"] == "Mid":
            num_educations = random.randint(1, 2)
        elif profile["experience_level"] == "Senior":
            num_educations = random.randint(1, 3)
        else:  # Leadership
            num_educations = random.randint(1, 3)
        
        # Add possibility of online education/certifications
        has_traditional = random.random() < 0.9  # 90% have traditional education
        has_online = random.random() < 0.3  # 30% have online education
        
        if has_traditional:
            traditional_count = num_educations
        else:
            traditional_count = 0
            
        if has_online:
            online_count = random.randint(1, 2)
        else:
            online_count = 0
            
        total_educations = traditional_count + online_count
        
        # Current year for reference
        current_year = datetime.now().year
        
        # Estimated graduation age based on experience level
        if profile["experience_level"] == "Entry":
            years_since_grad = random.randint(0, 3)
        elif profile["experience_level"] == "Mid":
            years_since_grad = random.randint(3, 8)
        elif profile["experience_level"] == "Senior":
            years_since_grad = random.randint(8, 15)
        else:  # Leadership
            years_since_grad = random.randint(12, 25)
            
        grad_year = current_year - years_since_grad
        
        # Traditional education
        for j in range(traditional_count):
            education_id = str(uuid.uuid4())
            
            # Institution selection based on profile location and randomness
            if j == 0:  # Most recent/highest education
                # Higher chance of elite institution for more senior profiles
                if profile["experience_level"] in ["Senior", "Leadership"] and random.random() < 0.4:
                    institution_category = random.choice(["Top US", "International Elite"])
                elif profile["country"] == "United States" and random.random() < 0.7:
                    institution_category = random.choice(["Top US", "Other US"])
                elif profile["country"] not in ["United States", "Canada"] and random.random() < 0.7:
                    # Non-US profiles more likely to have studied in their region
                    if profile["country"] in ["India", "China", "Japan", "Singapore", "South Korea"]:
                        institution_category = random.choices(
                            ["International Elite", "Other International", "Top US"], 
                            weights=[30, 50, 20], k=1)[0]
                    else:
                        institution_category = random.choices(
                            ["International Elite", "Other International", "Top US"], 
                            weights=[30, 50, 20], k=1)[0]
                else:
                    # Random selection
                    institution_category = random.choice(["Top US", "Other US", "International Elite", "Other International"])
            else:
                # Previous education often in same country/region
                if random.random() < 0.8:  # 80% chance of consistency
                    prev_institution = educations[-1]["institution_name"]
                    prev_category = institution_to_category.get(prev_institution)
                    
                    if "US" in prev_category:
                        institution_category = random.choice(["Top US", "Other US"])
                    elif prev_category == "International Elite":
                        institution_category = random.choice(["International Elite", "Other International"])
                    else:
                        institution_category = "Other International"
                else:
                    institution_category = random.choice(["Top US", "Other US", "International Elite", "Other International"])
            
            institution = random.choice(tech_institutions[institution_category])
            
            # Degree selection based on position in education history and profile level
            if j == 0:  # Most recent/highest education
                if profile["experience_level"] in ["Senior", "Leadership"] and random.random() < 0.3:
                    degree_category = random.choices(["Graduate", "Doctorate"], weights=[70, 30], k=1)[0]
                elif profile["experience_level"] in ["Mid", "Senior"] and random.random() < 0.5:
                    degree_category = "Graduate"
                else:
                    degree_category = "Undergraduate"
            else:
                # Previous education should be lower level
                prev_degree = educations[-1]["degree"]
                if "Doctor" in prev_degree or "Ph.D" in prev_degree:
                    degree_category = "Graduate"
                elif "Master" in prev_degree:
                    degree_category = "Undergraduate"
                else:
                    degree_category = "Undergraduate"  # Could be another Bachelor's or Associate's
            
            degree = random.choice(degrees[degree_category])
            
            # Field of study based on profile skills
            profile_skills = profile["skills"].split(", ")
            tech_focus = any(skill in profile_skills for skill in ["Machine Learning", "AI", "Data Science", "Deep Learning"])
            
            if tech_focus:
                field_weights = {
                    "Computer Science": 20,
                    "Data Science": 25,
                    "Artificial Intelligence": 20,
                    "Machine Learning": 15,
                    "Statistics": 10,
                    "Mathematics": 5,
                    "Computational Biology": 3,
                    "Physics": 2
                }
                weighted_fields = []
                for field, weight in field_weights.items():
                    weighted_fields.extend([field] * weight)
                field = random.choice(weighted_fields)
            else:
                field = random.choice(fields_of_study)
            
            # Calculate years based on degree type and position
            if j == 0:  # Most recent education
                if "Doctor" in degree or "Ph.D" in degree:
                    duration = random.randint(4, 6)
                    end_year = grad_year
                elif "Master" in degree:
                    duration = random.randint(1, 3)
                    end_year = grad_year
                else:  # Bachelor's or Associate's
                    duration = 4 if "Bachelor" in degree else 2
                    end_year = grad_year
            else:
                # Previous education
                prev_end_year = educations[-1]["start_year"] - random.randint(0, 2)  # Gap between educations
                
                if "Bachelor" in degree:
                    duration = 4
                    end_year = prev_end_year
                elif "Associate" in degree:
                    duration = 2
                    end_year = prev_end_year
                else:
                    duration = random.randint(1, 3)
                    end_year = prev_end_year
            
            start_year = end_year - duration
            
            educations.append({
                "education_id": education_id,
                "profile_id": profile["profile_id"],
                "institution_name": institution,
                "degree": degree,
                "field_of_study": field,
                "start_year": start_year,
                "end_year": end_year
            })
            
            # For next iteration
            grad_year = start_year
        
        # Online education/certifications
        for j in range(online_count):
            education_id = str(uuid.uuid4())
            institution = random.choice(tech_institutions["Online & Bootcamps"])
            degree = random.choice(degrees["Certificate"])
            
            # Field focused on tech skills
            profile_skills = profile["skills"].split(", ")
            if profile_skills:
                # Try to match certificate to a skill they have
                skill_field_map = {
                    "Machine Learning": ["Machine Learning", "Deep Learning", "AI Engineering"],
                    "Python": ["Python Programming", "Data Science with Python"],
                    "JavaScript": ["Full Stack Web Development", "JavaScript Development", "Frontend Development"],
                    "Cloud": ["AWS Certification", "Azure Fundamentals", "Google Cloud Platform"],
                    "Data": ["Data Science", "Big Data Analytics", "Data Engineering"],
                    "DevOps": ["DevOps Engineering", "CI/CD Pipelines", "Kubernetes Administration"],
                    "Cybersecurity": ["Ethical Hacking", "Network Security", "Information Security"]
                }
                
                potential_fields = []
                for skill in profile_skills:
                    for key, fields in skill_field_map.items():
                        if key.lower() in skill.lower():
                            potential_fields.extend(fields)
                
                if potential_fields:
                    field = random.choice(potential_fields)
                else:
                    field = random.choice(fields_of_study)
            else:
                field = random.choice(fields_of_study)
            
            # Certificate typically completed in last 5 years
            end_year = current_year - random.randint(0, 5)
            # Certificates typically take 3-12 months
            cert_months = random.randint(3, 12)
            start_year = end_year if cert_months < 12 else end_year - 1
            
            educations.append({
                "education_id": education_id,
                "profile_id": profile["profile_id"],
                "institution_name": institution,
                "degree": degree,
                "field_of_study": field,
                "start_year": start_year,
                "end_year": end_year
            })
    
    return pd.DataFrame(educations)

In [24]:
# Generate LinkedIn jobs
def generate_linkedin_jobs(num_jobs):
    jobs = []
    
    for i in range(num_jobs):
        job_id = str(uuid.uuid4())
        
        # Select company and associated category
        company_category = random.choice(list(tech_companies.keys()))
        company = random.choice(tech_companies[company_category])
        
        # Select department and title
        department = random.choice(list(tech_job_titles.keys()))
        title = random.choice(tech_job_titles[department])
        
        # Location selection - weighted toward company headquarters
        if company in ["Tencent", "Alibaba", "Baidu"]:
            region = "Asia"
        elif company in ["Spotify"]:
            region = "Europe"
        elif company in ["Samsung", "Sony"]:
            region = random.choices(["Asia", "US West Coast"], weights=[70, 30], k=1)[0]
        else:
            # Default distribution favors US tech hubs
            if company_category == "FAANG & Big Tech":
                region = random.choices(
                    ["US West Coast", "US Other", "Europe", "Asia", "Other Regions"],
                    weights=[60, 20, 10, 7, 3],
                    k=1
                )[0]
            else:
                region = random.choices(
                    ["US West Coast", "US Other", "Europe", "Asia", "Other Regions"],
                    weights=[40, 25, 15, 15, 5],
                    k=1
                )[0]
            
        # Get cities from selected region
        regional_locations = tech_cities_countries[region]
        city, country = random.choice(regional_locations)
        
        # Required skills based on job title and department
        num_required_skills = random.randint(5, 10)
        
        if department == "Engineering":
            primary_categories = ["Programming Languages", "Backend", "DevOps & Cloud"]
            secondary_categories = ["Frontend", "Database & Storage", "Version Control & Tools"]
        elif department == "Data & AI":
            primary_categories = ["Data Science & AI", "Programming Languages", "Database & Storage"]
            secondary_categories = ["DevOps & Cloud", "Backend", "Version Control & Tools"]
        elif department == "Product & Design":
            primary_categories = ["Frontend", "Version Control & Tools", "Methodologies & Practices"]
            secondary_categories = ["Programming Languages", "Mobile & Edge", "Specialized"]
        elif department == "Security & IT":
            primary_categories = ["DevOps & Cloud", "Specialized", "Security"]
            secondary_categories = ["Programming Languages", "Database & Storage", "Version Control & Tools"]
        else:  # Technical Support
            primary_categories = ["Version Control & Tools", "Programming Languages", "Methodologies & Practices"]
            secondary_categories = ["Frontend", "Backend", "DevOps & Cloud"]
            
        # Get skills from primary categories (60%)
        primary_skill_count = int(num_required_skills * 0.6)
        primary_skills = []
        for category in primary_categories:
            if category in tech_skills:
                primary_skills.extend(random.sample(
                    tech_skills[category], 
                    min(len(tech_skills[category]), primary_skill_count // len(primary_categories) + 1)
                ))
                
        # Get skills from secondary categories (40%)
        secondary_skill_count = num_required_skills - len(primary_skills)
        secondary_skills = []
        for category in secondary_categories:
            if category in tech_skills and secondary_skill_count > 0:
                category_skills = random.sample(
                    tech_skills[category],
                    min(len(tech_skills[category]), secondary_skill_count // len(secondary_categories) + 1)
                )
                secondary_skills.extend(category_skills)
                secondary_skill_count -= len(category_skills)
                
        required_skills = primary_skills + secondary_skills
        if len(required_skills) > num_required_skills:
            required_skills = random.sample(required_skills, num_required_skills)
        
        # Posting date (within the last 90 days, weighted toward recent)
        days_ago_weights = [3] * 30 + [2] * 30 + [1] * 30  # Higher weight to more recent postings
        days_ago = random.choices(range(1, 91), weights=days_ago_weights, k=1)[0]
        posting_date = (datetime.now() - timedelta(days=days_ago)).strftime("%Y-%m-%d")
        
        # Job type
        if "Intern" in title:
            job_type = "Internship"
        elif "Contract" in title:
            job_type = "Contract"
        elif "Part-time" in title:
            job_type = "Part-time"
        else:
            job_type = "Full-time"
        
        # Remote status - weighted by company size and recent trends
        if company_category in ["FAANG & Big Tech", "Enterprise & Cloud"]:
            remote_status = random.choices(
                ["Onsite", "Hybrid", "Remote"],
                weights=[20, 50, 30],
                k=1
            )[0]
        else:
            remote_status = random.choices(
                ["Onsite", "Hybrid", "Remote"],
                weights=[15, 35, 50],
                k=1
            )[0]
        
        # Experience level based on title
        if any(senior in title for senior in ["Senior", "Staff", "Principal", "Lead", "Head", "Director", "VP", "Chief"]):
            experience_level = random.choice(["Senior", "Lead", "Principal", "Executive"])
        elif any(mid in title for mid in ["II", "III", "2", "3"]):
            experience_level = "Mid-level"
        elif any(junior in title.lower() for junior in ["junior", "associate", "intern"]):
            experience_level = random.choice(["Entry-level", "Junior"])
        else:
            experience_level = random.choice(["Entry-level", "Junior", "Mid-level", "Senior"])
        
        # Description with formatting
        company_desc = generate_company_description(company, company_category)
        role_overview = generate_role_overview(title, department, company)
        responsibilities = generate_job_responsibilities(title, department, required_skills)
        qualifications = generate_job_qualifications(title, required_skills, experience_level)
        benefits = generate_job_benefits(company_category)
        
        description = f"""
{company_desc}

# Role Overview
{role_overview}

# Key Responsibilities
{responsibilities}

# Qualifications
{qualifications}

# Benefits
{benefits}

{company} is an equal opportunity employer. We celebrate diversity and are committed to creating an inclusive environment for all employees.
"""
        
        jobs.append({
            "job_id": job_id,
            "job_title": title,
            "company": company,
            "city": city,
            "country": country,
            "description": description,
            "required_skills": ", ".join(required_skills),
            "posting_date": posting_date,
            "job_type": job_type,
            "remote_status": remote_status,
            "industry": "Tech",
            "experience_level": experience_level
        })
    
    return pd.DataFrame(jobs)

In [25]:
# Generate company description
def generate_company_description(company, company_category):
    if company_category == "FAANG & Big Tech":
        templates = [
            f"{company} is a leading global technology company that's transforming how people connect, communicate, and discover information.",
            f"At {company}, we're on a mission to organize the world's information and make it universally accessible and useful.",
            f"{company} is one of the world's most valuable companies, known for innovation in hardware, software, and services."
        ]
    elif company_category == "AI & ML Focused":
        templates = [
            f"{company} is at the forefront of artificial intelligence research and development, creating cutting-edge solutions that push the boundaries of what's possible.",
            f"Founded by experts in machine learning, {company} is dedicated to creating safe and beneficial AI that solves humanity's most important challenges.",
            f"{company} is developing breakthrough AI technologies that are transforming industries and creating new possibilities for human-AI collaboration."
        ]
    elif company_category == "Unicorn Startups":
        templates = [
            f"{company} is a rapidly growing technology company valued at over $1B, disrupting traditional industries with innovative solutions.",
            f"Founded in {random.randint(2010, 2020)}, {company} has quickly become a leader in the {random.choice(['fintech', 'health tech', 'e-commerce', 'productivity', 'creator economy'])} space.",
            f"{company} is a venture-backed startup that's revolutionizing how people {random.choice(['work', 'shop', 'learn', 'communicate', 'manage finances'])}."
        ]
    elif company_category == "Enterprise & Cloud":
        templates = [
            f"{company} provides enterprise-grade cloud solutions that power thousands of organizations worldwide, from startups to Fortune 500 companies.",
            f"As a leader in enterprise software, {company} helps organizations transform their operations with powerful, scalable technology solutions.",
            f"{company} delivers critical infrastructure and software solutions that enable businesses to thrive in the digital economy."
        ]
    else:  # Emerging Tech
        templates = [
            f"{company} is pioneering the next generation of {random.choice(['space exploration', 'autonomous vehicles', 'brain-computer interfaces', 'sustainable technology', 'robotics'])}.",
            f"Founded with a mission to solve some of humanity's most challenging problems, {company} is developing breakthrough technologies in {random.choice(['renewable energy', 'space transportation', 'biotechnology', 'advanced materials', 'quantum computing'])}.",
            f"{company} is pushing the boundaries of what's possible in {random.choice(['transportation', 'neurotechnology', 'robotics', 'clean energy', 'space exploration'])}."
        ]
    
    return random.choice(templates)

# Generate role overview
def generate_role_overview(title, department, company):
    templates = [
        f"{company} is seeking a talented {title} to join our growing team. In this role, you'll work on challenging problems and contribute to products used by millions of people worldwide.",
        f"We're looking for a {title} to help us build and scale our technology. You'll collaborate with cross-functional teams to deliver exceptional solutions that meet our users' needs.",
        f"As a {title} at {company}, you'll be instrumental in developing innovative solutions that drive our business forward. This is an opportunity to work on impactful projects in a collaborative environment."
    ]
    
    return random.choice(templates)

In [26]:
# Generate job responsibilities
def generate_job_responsibilities(title, department, skills):
    # Base responsibilities by department
    base_resp = {
        "Engineering": [
            f"Design, develop, and maintain {random.choice(['scalable applications', 'robust systems', 'efficient services', 'critical infrastructure'])}",
            f"Collaborate with cross-functional teams to define and implement new features",
            f"Write clean, maintainable, and well-tested code",
            f"Participate in code reviews, architectural discussions, and technical decision-making",
            f"Troubleshoot and debug complex issues across the stack"
        ],
        "Data & AI": [
            f"Build and optimize {random.choice(['machine learning models', 'data pipelines', 'statistical models', 'recommendation systems'])}",
            f"Work with large, complex datasets and derive meaningful insights",
            f"Collaborate with engineering teams to implement and deploy models to production",
            f"Research and implement state-of-the-art algorithms and approaches",
            f"Monitor and improve model performance and data quality"
        ],
        "Product & Design": [
            f"Define product requirements, specifications, and roadmaps based on user needs and business goals",
            f"Work closely with engineering, design, and other stakeholders to deliver successful products",
            f"Analyze user feedback and market trends to inform product decisions",
            f"Lead the product development lifecycle from conception to launch",
            f"Define and track key metrics to measure product success"
        ],
        "Security & IT": [
            f"Design and implement security measures to protect systems, networks, and data",
            f"Conduct security assessments, vulnerability testing, and risk analyses",
            f"Monitor systems for security breaches and respond to incidents",
            f"Develop security policies, procedures, and best practices",
            f"Collaborate with development teams to ensure security is integrated into the development lifecycle"
        ],
        "Technical Support": [
            f"Provide technical guidance and support to customers and internal teams",
            f"Troubleshoot and resolve complex technical issues",
            f"Document solutions and contribute to knowledge base",
            f"Collaborate with product and engineering teams to improve product quality",
            f"Develop tools and processes to enhance support efficiency"
        ]
    }
    
    # Additional responsibilities based on seniority
    senior_resp = [
        f"Mentor junior team members and provide technical leadership",
        f"Contribute to architectural decisions and technical strategy",
        f"Work with stakeholders to define technical requirements and specifications",
        f"Lead projects and coordinate team efforts to meet deadlines and quality standards",
        f"Evaluate and recommend new technologies and approaches"
    ]
    
    # Get base responsibilities for the department
    department_key = next((d for d in base_resp.keys() if d in department), "Engineering")
    base = base_resp.get(department_key, base_resp["Engineering"])
    
    # Add senior responsibilities if applicable
    if any(senior in title for senior in ["Senior", "Staff", "Principal", "Lead", "Head", "Director", "VP", "Chief"]):
        all_resp = base + senior_resp
    else:
        all_resp = base
    
    # Add skill-specific responsibilities
    skill_resp = []
    for skill in random.sample(skills, min(2, len(skills))):
        if "Python" in skill or "Java" in skill or "JavaScript" in skill or "C++" in skill:
            skill_resp.append(f"Develop and maintain applications using {skill}")
        elif "ML" in skill or "AI" in skill or "Learning" in skill:
            skill_resp.append(f"Apply {skill} techniques to solve complex business problems")
        elif "Cloud" in skill or "AWS" in skill or "Azure" in skill or "GCP" in skill:
            skill_resp.append(f"Design and implement solutions using {skill} services")
        elif "Data" in skill or "SQL" in skill or "Database" in skill:
            skill_resp.append(f"Work with {skill} to store, process, and analyze information")
    
    # Combine and select 5-7 responsibilities
    all_resp = all_resp + skill_resp
    selected_resp = random.sample(all_resp, min(random.randint(5, 7), len(all_resp)))
    
    # Format as bullet points
    return "\n".join([f"• {resp}" for resp in selected_resp])

In [27]:
# Generate job qualifications
def generate_job_qualifications(title, skills, experience_level):
    # Base qualifications
    base_qual = [
        f"Bachelor's degree in Computer Science, Engineering, or related field{', or equivalent practical experience' if random.random() < 0.3 else ''}",
        f"Strong problem-solving skills and attention to detail",
        f"Excellent communication and collaboration abilities"
    ]
    
    # Experience requirement based on level
    if experience_level in ["Entry-level", "Junior"]:
        exp_years = random.randint(0, 2)
        exp_qual = f"{exp_years}+ years of relevant experience" if exp_years > 0 else "Previous internship or project experience"
    elif experience_level in ["Mid-level"]:
        exp_years = random.randint(3, 5)
        exp_qual = f"{exp_years}+ years of relevant experience"
    elif experience_level in ["Senior", "Lead"]:
        exp_years = random.randint(5, 8)
        exp_qual = f"{exp_years}+ years of relevant experience"
    else:  # Principal, Executive
        exp_years = random.randint(8, 15)
        exp_qual = f"{exp_years}+ years of relevant experience, including leadership roles"
    
    base_qual.insert(1, exp_qual)
    
    # Skill-specific qualifications
    skill_qual = []
    for skill in skills[:min(4, len(skills))]:
        skill_qual.append(f"Experience with {skill}")
    
    # Add role-specific qualifications
    role_qual = []
    if "Data" in title or "Machine Learning" in title or "AI" in title:
        role_qual.append(f"Experience with data processing, modeling, and analysis")
        role_qual.append(f"Understanding of statistical methods and machine learning algorithms")
    elif "Frontend" in title or "UI" in title or "UX" in title:
        role_qual.append(f"Strong understanding of web technologies and UI/UX principles")
        role_qual.append(f"Experience building responsive, accessible web applications")
    elif "Backend" in title or "Server" in title:
        role_qual.append(f"Experience with API design and development")
        role_qual.append(f"Understanding of database design and optimization")
    elif "Security" in title:
        role_qual.append(f"Understanding of security principles and best practices")
        role_qual.append(f"Experience with security assessment tools and techniques")
    elif "DevOps" in title or "SRE" in title or "Infrastructure" in title:
        role_qual.append(f"Experience with cloud infrastructure and deployment automation")
        role_qual.append(f"Understanding of CI/CD principles and practices")
        
    # Combine all qualifications
    all_qual = base_qual + skill_qual + role_qual
    
    # Select 6-8 qualifications
    selected_qual = random.sample(all_qual, min(random.randint(6, 8), len(all_qual)))
    
    # Format as bullet points
    return "\n".join([f"• {qual}" for qual in selected_qual])


In [28]:
# Generate job benefits
def generate_job_benefits(company_category):
    # Common benefits
    common_benefits = [
        "Competitive salary and equity compensation",
        "Comprehensive health, dental, and vision insurance",
        "Flexible work arrangements",
        "Paid time off and holidays",
        "401(k) matching"
    ]
    
    # Additional benefits by company category
    additional_benefits = {
        "FAANG & Big Tech": [
            "On-site amenities including gourmet meals and fitness centers",
            "Learning and development stipends",
            "Generous parental leave",
            "Wellness programs and gym reimbursements",
            "Transportation benefits or shuttle service"
        ],
        "AI & ML Focused": [
            "Access to cutting-edge AI research and technologies",
            "Conference and publication opportunities",
            "Learning and development stipends",
            "Collaborative work environment with leading AI researchers",
            "Flexible work arrangements"
        ],
        "Unicorn Startups": [
            "Equity in a high-growth company",
            "Startup culture with high impact opportunities",
            "Regular team events and activities",
            "Flexible work arrangements",
            "Modern, collaborative workspace"
        ],
        "Enterprise & Cloud": [
            "Professional development opportunities",
            "Work-life balance focus",
            "Global team collaboration opportunities",
            "Tuition reimbursement",
            "Employee resource groups"
        ],
        "Emerging Tech": [
            "Opportunity to work on cutting-edge technologies",
            "Collaborative, innovative work environment",
            "Flexible work arrangements",
            "Regular team building activities",
            "Professional development opportunities"
        ]
    }
    
    # Select 5 common benefits and 3 category-specific benefits
    selected_common = random.sample(common_benefits, min(4, len(common_benefits)))
    
    category_benefits = additional_benefits.get(company_category, additional_benefits["Unicorn Startups"])
    selected_category = random.sample(category_benefits, min(3, len(category_benefits)))
    
    all_benefits = selected_common + selected_category
    
    # Format as bullet points
    return "\n".join([f"• {benefit}" for benefit in all_benefits])

In [29]:
# Generate LinkedIn connections with realistic network structure
def generate_linkedin_connections(profiles_df):
    connections = []
    profile_ids = profiles_df["profile_id"].tolist()
    
    # Create a dictionary to store profile metadata for network generation
    profile_metadata = {}
    
    for _, profile in profiles_df.iterrows():
        profile_id = profile["profile_id"]
        
        # Extract company and location
        company = profile["headline"].split(" at ")[-1] if " at " in profile["headline"] else ""
        location = (profile["city"], profile["country"])
        
        # Extract skills
        skills = set(profile["skills"].split(", "))
        
        # Store in metadata dictionary
        profile_metadata[profile_id] = {
            "company": company,
            "location": location,
            "skills": skills,
            "experience_level": profile["experience_level"]
        }
    
    # Create connections based on realistic network properties
    for profile_id, metadata in profile_metadata.items():
        # Determine number of connections based on experience level
        if metadata["experience_level"] == "Entry":
            # Entry-level has fewer connections
            num_connections = random.randint(20, 100)
        elif metadata["experience_level"] == "Mid":
            # Mid-level has moderate connections
            num_connections = random.randint(80, 300)
        elif metadata["experience_level"] == "Senior":
            # Senior has many connections
            num_connections = random.randint(200, 500)
        else:  # Leadership
            # Leadership has extensive networks
            num_connections = random.randint(300, 800)
        
        # Limit connections to available profiles
        potential_connections = [pid for pid in profile_ids if pid != profile_id]
        if len(potential_connections) < num_connections:
            num_connections = len(potential_connections)
        
        # Weighted connection selection
        connection_weights = []
        for potential_id in potential_connections:
            potential_metadata = profile_metadata[potential_id]
            weight = 1.0  # Base weight
            
            # Same company connections are more likely
            if metadata["company"] == potential_metadata["company"] and metadata["company"]:
                weight *= 10.0
            
            # Same location connections are more likely
            if metadata["location"] == potential_metadata["location"]:
                weight *= 5.0
            
            # Similar skills increase connection probability
            common_skills = len(metadata["skills"].intersection(potential_metadata["skills"]))
            if common_skills > 0:
                weight *= (1.0 + common_skills / 5.0)
            
            # Experience level similarity (people tend to connect with peers)
            exp_diff = abs(experience_level_value(metadata["experience_level"]) - 
                          experience_level_value(potential_metadata["experience_level"]))
            weight *= (1.0 / (exp_diff + 1.0))
            
            connection_weights.append(weight)
        
        # Normalize weights if needed
        if connection_weights and max(connection_weights) > 0:
            connection_weights = [w / max(connection_weights) * 10 for w in connection_weights]
        
        # Select connections based on weights
        if len(potential_connections) <= num_connections:
            connected_profiles = potential_connections
        else:
            connected_profiles = random.choices(
                potential_connections, 
                weights=connection_weights, 
                k=num_connections
            )
        
        for connected_id in connected_profiles:
            # Only add connection if it doesn't exist already
            existing_connection = False
            for conn in connections:
                if (conn["profile_id_1"] == profile_id and conn["profile_id_2"] == connected_id) or \
                   (conn["profile_id_1"] == connected_id and conn["profile_id_2"] == profile_id):
                    existing_connection = True
                    break
                    
            if not existing_connection:
                connection_id = str(uuid.uuid4())
                
                # Calculate mutual connections (people in same company/location more likely to have mutual connections)
                metadata_1 = profile_metadata[profile_id]
                metadata_2 = profile_metadata[connected_id]
                
                base_mutual = 0
                # Same company increases mutual connections
                if metadata_1["company"] == metadata_2["company"] and metadata_1["company"]:
                    base_mutual += random.randint(10, 30)
                
                # Same location increases mutual connections
                if metadata_1["location"] == metadata_2["location"]:
                    base_mutual += random.randint(5, 15)
                
                # Common skills increase mutual connections
                common_skills = len(metadata_1["skills"].intersection(metadata_2["skills"]))
                base_mutual += common_skills * 2
                
                # Add some randomness
                mutual_count = max(0, int(base_mutual + random.randint(-5, 10)))
                
                connections.append({
                    "connection_id": connection_id,
                    "profile_id_1": profile_id,
                    "profile_id_2": connected_id,
                    "mutual_connections_count": mutual_count
                })
    
    return pd.DataFrame(connections)

In [30]:
# Helper function to convert experience level to numeric value for comparisons
def experience_level_value(level):
    if level == "Entry":
        return 1
    elif level == "Mid":
        return 2
    elif level == "Senior":
        return 3
    else:  # Leadership
        return 4

In [31]:
# Generate all datasets
def generate_all_datasets():
    print("Generating LinkedIn profiles...")
    profiles_df = generate_linkedin_profiles(num_profiles)
    
    print("Generating work experiences...")
    experiences_df = generate_work_experiences(profiles_df)
    
    print("Generating educations...")
    educations_df = generate_educations(profiles_df)
    
    print("Generating LinkedIn jobs...")
    jobs_df = generate_linkedin_jobs(num_jobs)
    
    print("Generating LinkedIn connections...")
    connections_df = generate_linkedin_connections(profiles_df)

    print("Saving...")
    
    # Save to CSV
    profiles_df.to_csv("linkedin_profiles.csv", index=False)
    experiences_df.to_csv("linkedin_work_experiences.csv", index=False)
    educations_df.to_csv("linkedin_educations.csv", index=False)
    jobs_df.to_csv("linkedin_jobs.csv", index=False)
    connections_df.to_csv("linkedin_connections.csv", index=False)
    
    print("All datasets generated and saved as CSV files!")
    
    # Return dataframes for further use if needed
    return {
        "profiles": profiles_df,
        "experiences": experiences_df,
        "educations": educations_df,
        "jobs": jobs_df,
        "connections": connections_df
    }

In [32]:
print("Starting generation of realistic tech industry LinkedIn datasets...")
print(f"Will generate {num_profiles} profiles, {num_jobs} job listings, and their relationships")

datasets = generate_all_datasets()

# Print counts and sample data
print("\nGeneration complete! Dataset statistics:")
print(f"- LinkedIn Profiles: {len(datasets['profiles'])} records")
print(f"- Work Experiences: {len(datasets['experiences'])} records")
print(f"- Educations: {len(datasets['educations'])} records")
print(f"- Job Listings: {len(datasets['jobs'])} records")
print(f"- Connections: {len(datasets['connections'])} records")

print("\nSample LinkedIn Profile:")
print(datasets["profiles"].iloc[0][["full_name", "headline", "summary", "city", "country", "skills"]].to_dict())

print("\nSample Work Experience:")
sample_experience = datasets["experiences"].iloc[0].to_dict()
print(f"Title: {sample_experience['title']} at {sample_experience['company']}")
print(f"Duration: {sample_experience['start_date']} to {sample_experience['end_date'] or 'Present'}")
print(f"Description: {sample_experience['description'][:200]}...")

print("\nSample Education:")
sample_education = datasets["educations"].iloc[0].to_dict()
print(f"{sample_education['degree']} in {sample_education['field_of_study']}")
print(f"Institution: {sample_education['institution_name']}")
print(f"Years: {sample_education['start_year']} - {sample_education['end_year']}")

print("\nSample Job Listing:")
sample_job = datasets["jobs"].iloc[0].to_dict()
print(f"Position: {sample_job['job_title']} at {sample_job['company']}")
print(f"Location: {sample_job['city']}, {sample_job['country']} ({sample_job['remote_status']})")
print(f"Required Skills: {sample_job['required_skills']}")

print("\nDatasets saved as CSV files in the current directory.")
print("Use these files for your GraphRAG application.")

Starting generation of realistic tech industry LinkedIn datasets...
Will generate 1000 profiles, 300 job listings, and their relationships
Generating LinkedIn profiles...
Generating work experiences...
Generating educations...
Generating LinkedIn jobs...
Generating LinkedIn connections...
Saving...
All datasets generated and saved as CSV files!

Generation complete! Dataset statistics:
- LinkedIn Profiles: 1000 records
- Work Experiences: 3138 records
- Educations: 1974 records
- Job Listings: 300 records
- Connections: 160883 records

Sample LinkedIn Profile:
{'full_name': 'Danielle Johnson', 'headline': 'Director of Product at IBM', 'summary': 'Executive technology professional with extensive experience in BDD, Vue.js, HTML5. 23+ years of leadership in the tech industry, focused on organizational growth and excellence.', 'city': 'Santa Monica', 'country': 'United States', 'skills': 'Vue.js, HTML5, VS Code, BDD'}

Sample Work Experience:
Title: Director of Product at IBM
Duration: 202