In [3]:
import pandas as pd
import random
import numpy as np

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Predefined data pools
first_names = ['Aarav', 'Vivaan', 'Aditya', 'Arjun', 'Sai', 'Arnav', 'Ayaan', 'Krishna', 'Ishaan', 'Reyansh',
               'Diya', 'Saanvi', 'Aadhya', 'Ananya', 'Navya', 'Pari', 'Aarohi', 'Anika', 'Sara', 'Myra',
               'Rohan', 'Kabir', 'Vihaan', 'Advait', 'Yash', 'Rudra', 'Atharv', 'Shaurya', 'Kiaan', 'Pranav',
               'Riya', 'Ira', 'Tara', 'Nisha', 'Priya', 'Neha', 'Shreya', 'Kavya', 'Aditi', 'Pooja']

last_names = ['Sharma', 'Verma', 'Patel', 'Kumar', 'Singh', 'Reddy', 'Gupta', 'Joshi', 'Desai', 'Mehta',
              'Nair', 'Rao', 'Iyer', 'Kulkarni', 'Pandey', 'Chopra', 'Agarwal', 'Malhotra', 'Bhatia', 'Kapoor',
              'Shah', 'Bansal', 'Saxena', 'Mishra', 'Sinha', 'Thakur', 'Tiwari', 'Jain', 'Chauhan', 'Khan']

branches = ['Computer Science', 'Information Technology', 'Electronics and Communication', 
            'Mechanical Engineering', 'Civil Engineering', 'Electrical Engineering']

technical_skills = ['Python', 'Java', 'JavaScript', 'C++', 'React', 'Node.js', 'SQL', 'MongoDB', 
                   'Machine Learning', 'Data Analysis', 'Django', 'Flask', 'HTML/CSS', 'Git', 'Docker', 'AWS', 
                   'TensorFlow', 'Pandas', 'NumPy', 'REST APIs', 'Spring Boot', 'Angular', 'Vue.js', 'PostgreSQL',
                   'Redis', 'Kubernetes', 'Jenkins', 'Scikit-learn', 'PyTorch', 'Power BI']

soft_skills = ['Communication', 'Leadership', 'Teamwork', 'Problem Solving', 'Time Management', 
              'Critical Thinking', 'Adaptability', 'Creativity', 'Collaboration', 'Analytical Thinking']

interests = ['Web Development', 'Machine Learning', 'Data Science', 'Mobile Development', 
            'Cloud Computing', 'Cybersecurity', 'Artificial Intelligence', 'Backend Development', 
            'Frontend Development', 'DevOps', 'Database Management', 'UI/UX Design']

cities = ['Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Chennai', 'Kolkata', 'Ahmedabad', 
          'Gurugram', 'Noida']

companies = [
    {'name': 'TechCorp Solutions', 'domain': 'Software Development'},
    {'name': 'DataMinds Analytics', 'domain': 'Data Science'},
    {'name': 'CloudNine Systems', 'domain': 'Cloud Computing'},
    {'name': 'WebCraft Digital', 'domain': 'Web Development'},
    {'name': 'AI Innovations Lab', 'domain': 'Artificial Intelligence'},
    {'name': 'SecureNet Technologies', 'domain': 'Cybersecurity'},
    {'name': 'MobileFirst Apps', 'domain': 'Mobile Development'},
    {'name': 'ByteStream Solutions', 'domain': 'Backend Development'},
    {'name': 'PixelPerfect Studios', 'domain': 'Frontend Development'},
    {'name': 'InfraCode Systems', 'domain': 'DevOps'},
    {'name': 'QuantumLeap Tech', 'domain': 'Machine Learning'},
    {'name': 'SmartData Corp', 'domain': 'Data Analysis'},
    {'name': 'NextGen Software', 'domain': 'Full Stack Development'},
    {'name': 'CodeCrafters Inc', 'domain': 'Software Engineering'},
    {'name': 'TechVista Solutions', 'domain': 'Product Development'},
    {'name': 'Innovate Systems', 'domain': 'Enterprise Solutions'},
    {'name': 'Digital Dynamics', 'domain': 'Digital Transformation'},
    {'name': 'Analytics Pro', 'domain': 'Business Intelligence'},
    {'name': 'CloudScale Tech', 'domain': 'Cloud Infrastructure'},
    {'name': 'CyberShield Corp', 'domain': 'Information Security'}
]

roles = ['Software Developer Intern', 'Data Science Intern', 'ML Engineer Intern', 
         'Web Developer Intern', 'Backend Developer Intern', 'Frontend Developer Intern',
         'Full Stack Developer Intern', 'DevOps Intern', 'Cloud Engineer Intern', 
         'Data Analyst Intern', 'Cybersecurity Intern', 'Mobile App Developer Intern']

def generate_students(num_students=50):
    """Generate synthetic student data"""
    students = []
    
    for i in range(1, num_students + 1):
        first_name = random.choice(first_names)
        last_name = random.choice(last_names)
        branch = random.choice(branches)
        year = random.choice([2, 3, 4])
        cgpa = round(random.uniform(7.0, 10.0), 2)
        
        # Generate skills
        num_tech_skills = random.randint(3, 8)
        tech_skills = random.sample(technical_skills, num_tech_skills)
        
        num_soft_skills = random.randint(3, 5)
        soft_skills_list = random.sample(soft_skills, num_soft_skills)
        
        num_interests = random.randint(2, 4)
        interests_list = random.sample(interests, num_interests)
        
        location_prefs = random.sample(cities, random.randint(2, 4))
        
        # Generate projects
        num_projects = random.randint(1, 4)
        projects = [f"{random.choice(interests)} Project {j+1}" for j in range(num_projects)]
        
        students.append({
            'student_id': f'STU{str(i).zfill(3)}',
            'name': f'{first_name} {last_name}',
            'email': f'{first_name.lower()}.{last_name.lower()}@university.edu',
            'branch': branch,
            'year': year,
            'cgpa': cgpa,
            'technical_skills': ', '.join(tech_skills),
            'soft_skills': ', '.join(soft_skills_list),
            'interests': ', '.join(interests_list),
            'location_preferences': ', '.join(location_prefs),
            'past_projects': '; '.join(projects),
            'previous_experience': 'Yes' if random.random() > 0.7 else 'No'
        })
    
    return pd.DataFrame(students)

def generate_internships(num_internships=20):
    """Generate synthetic internship data"""
    internships = []
    used_companies = []
    
    for i in range(1, num_internships + 1):
        # Select company (avoid duplicates if possible)
        available_companies = [c for c in companies if c['name'] not in used_companies]
        if not available_companies:
            available_companies = companies
        
        company = random.choice(available_companies)
        used_companies.append(company['name'])
        
        role = random.choice(roles)
        location = random.choice(cities)
        positions = random.randint(1, 5)
        stipend = random.randint(10, 50) * 1000
        duration = random.choice([2, 3, 6])
        
        # Generate required and preferred skills
        num_required = random.randint(3, 6)
        required_skills = random.sample(technical_skills, num_required)
        
        remaining_skills = [s for s in technical_skills if s not in required_skills]
        num_preferred = random.randint(2, 4)
        preferred_skills = random.sample(remaining_skills, min(num_preferred, len(remaining_skills)))
        
        min_cgpa = round(random.uniform(6.0, 8.0), 1)
        preferred_branches = random.sample(branches, random.randint(2, 4))
        
        internships.append({
            'internship_id': f'INT{str(i).zfill(3)}',
            'company_name': company['name'],
            'role': role,
            'domain': company['domain'],
            'location': location,
            'positions_available': positions,
            'stipend': stipend,
            'duration_months': duration,
            'required_skills': ', '.join(required_skills),
            'preferred_skills': ', '.join(preferred_skills),
            'min_cgpa': min_cgpa,
            'preferred_branches': ', '.join(preferred_branches),
            'experience_required': 'Preferred' if random.random() > 0.6 else 'Not Required'
        })
    
    return pd.DataFrame(internships)

# Generate datasets
print("Generating student data...")
students_df = generate_students(num_students=1500)

print("Generating internship data...")
internships_df = generate_internships(num_internships=150)

# Save to CSV files
students_df.to_csv('students_data.csv', index=False)
internships_df.to_csv('internships_data.csv', index=False)

print(f"\n✓ Generated {len(students_df)} student records")
print(f"✓ Generated {len(internships_df)} internship records")
print("\nFiles saved:")
print("  - students_data.csv")
print("  - internships_data.csv")

# Display sample data
print("\n" + "="*80)
print("SAMPLE STUDENT DATA (First 5 records)")
print("="*80)
print(students_df.head().to_string())

print("\n" + "="*80)
print("SAMPLE INTERNSHIP DATA (First 5 records)")
print("="*80)
print(internships_df.head().to_string())

# Display summary statistics
print("\n" + "="*80)
print("DATA SUMMARY")
print("="*80)
print("\nStudent Statistics:")
print(f"  - Total Students: {len(students_df)}")
print(f"  - Average CGPA: {students_df['cgpa'].mean():.2f}")
print(f"  - CGPA Range: {students_df['cgpa'].min():.2f} - {students_df['cgpa'].max():.2f}")
print(f"  - Branch Distribution:")
for branch, count in students_df['branch'].value_counts().items():
    print(f"    • {branch}: {count}")

print("\nInternship Statistics:")
print(f"  - Total Internships: {len(internships_df)}")
print(f"  - Total Positions: {internships_df['positions_available'].sum()}")
print(f"  - Average Stipend: ₹{internships_df['stipend'].mean():.0f}")
print(f"  - Stipend Range: ₹{internships_df['stipend'].min()} - ₹{internships_df['stipend'].max()}")
print(f"  - Location Distribution:")
for location, count in internships_df['location'].value_counts().items():
    print(f"    • {location}: {count}")

Generating student data...
Generating internship data...

✓ Generated 1500 student records
✓ Generated 150 internship records

Files saved:
  - students_data.csv
  - internships_data.csv

SAMPLE STUDENT DATA (First 5 records)
  student_id            name                          email                         branch  year  cgpa                                                     technical_skills                                                                     soft_skills                                                interests                   location_preferences                                                                                               past_projects previous_experience
0     STU001  Krishna Sharma  krishna.sharma@university.edu         Electrical Engineering     3  7.73                                    PostgreSQL, C++, Angular, PyTorch              Leadership, Adaptability, Communication, Creativity, Collaboration                 Mobile Development, Frontend D

In [4]:
# Save to CSV files
students_df.to_csv('students_data.csv', index=False, encoding='utf-8-sig')
internships_df.to_csv('internships_data.csv', index=False, encoding='utf-8-sig')

In [5]:
import os 
print(os.getcwd())
print(os.listdir())

c:\Users\apurv\OneDrive\Desktop\SIH
['datasetgeneration.ipynb', 'internships_data.csv', 'students_data.csv']
