# Researching the data

In [19]:
import pandas as pd

In [20]:
# Load the original career dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv('data/synthetic_career_dataset.csv')

In [21]:
# Preview the first rows to inspect the column names and sample values.
df.head()

Unnamed: 0,Interest,Skill_Level,Work_Style,Education,Personality,Problem_Solving,Tech,Communication,Leadership,Creativity,Career
0,Art,Expert,Flexible,Masters,Thinker,Creative,Very High,Excellent,Strong,Medium,Software Engineer
1,Technology,,Flexible,PhD,Thinker,Logical,,Poor,Moderate,Very High,Marketing Manager
2,Art,Intermediate,Team,PhD,Extrovert,Intuitive,,Excellent,Collaborative,Low,Data Scientist
3,Business,Expert,Remote,Diploma,Extrovert,Intuitive,High,Fluent,,High,Data Scientist
4,Technology,Expert,Solo,Bachelors,Thinker,Creative,Very High,Excellent,Visionary,Medium,Data Scientist


In [22]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5000, 11)

In [23]:
# Distinct career labels present in the loaded dataset.
df["Career"].unique()

array(['Software Engineer', 'Marketing Manager', 'Data Scientist',
       'Graphic Designer', 'Doctor'], dtype=object)

In [24]:
# Number of entries in the Career column.
df["Career"].shape

(5000,)

### Feature Engineering: Replacing the Target with Synthetic Career Labels

In [25]:
import pandas as pd
import random

# Build a synthetic frame of ten placeholder categorical features plus a
# randomly assigned career label.
#
# Every draw comes from a dedicated, seeded generator so that
# Restart Kernel -> Run All reproduces the same frame (and therefore the same
# saved dataset) instead of a fresh random one on each execution.
SEED = 42
rng = random.Random(SEED)  # local RNG: leaves the global `random` state untouched

# Define columns and categories
columns = [f"Feature_{i}" for i in range(1, 11)]
categories = {
    f"Feature_{i}": [f"Category_{i}_{j}" for j in range(1, 6)] for i in range(1, 11)
}

# Define a large set of career options (dependent feature)
career_options = [
    "Data Scientist", "ML Engineer", "AI Researcher", "Data Analyst", "Software Engineer",
    "Web Developer", "Blockchain Developer", "Cybersecurity Analyst", "DevOps Engineer",
    "Product Manager", "Cloud Architect", "UI/UX Designer", "Business Analyst", "NLP Engineer",
    "Computer Vision Engineer", "Database Administrator", "System Architect", "Embedded Systems Engineer",
    "IoT Specialist", "Game Developer", "Mobile App Developer", "Full Stack Developer", "Technical Writer",
    "QA Engineer", "Site Reliability Engineer", "AI Ethics Researcher", "Robotics Engineer", "Data Engineer",
    "Deep Learning Engineer", "Bioinformatics Scientist", "Network Engineer", "Solutions Architect",
    "Augmented Reality Developer", "Virtual Reality Developer", "Hardware Engineer", "Automation Engineer",
    "Research Scientist", "Speech Recognition Specialist", "Algorithm Engineer", "Financial Analyst"
]

# Generate synthetic data
data = []
num_samples = 5000  # Larger dataset

for _ in range(num_samples):
    # Each feature value and the career label are drawn independently and
    # uniformly, so the label carries no information about the features.
    row = {col: rng.choice(categories[col]) for col in columns}
    row["Career"] = rng.choice(career_options)
    data.append(row)

# Create DataFrame (built once from the list of records)
df_large = pd.DataFrame(data)
df_large.head()

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,Career
0,Category_1_1,Category_2_2,Category_3_1,Category_4_1,Category_5_1,Category_6_1,Category_7_4,Category_8_1,Category_9_4,Category_10_4,Business Analyst
1,Category_1_1,Category_2_1,Category_3_1,Category_4_1,Category_5_3,Category_6_1,Category_7_4,Category_8_4,Category_9_4,Category_10_2,Site Reliability Engineer
2,Category_1_2,Category_2_4,Category_3_4,Category_4_2,Category_5_5,Category_6_1,Category_7_5,Category_8_5,Category_9_4,Category_10_1,Financial Analyst
3,Category_1_2,Category_2_4,Category_3_4,Category_4_5,Category_5_2,Category_6_4,Category_7_3,Category_8_3,Category_9_3,Category_10_2,Speech Recognition Specialist
4,Category_1_4,Category_2_5,Category_3_1,Category_4_1,Category_5_4,Category_6_3,Category_7_5,Category_8_4,Category_9_2,Category_10_5,Game Developer


In [26]:
# Distinct career labels that were actually drawn for the synthetic frame.
df_large["Career"].unique()

array(['Business Analyst', 'Site Reliability Engineer',
       'Financial Analyst', 'Speech Recognition Specialist',
       'Game Developer', 'Hardware Engineer', 'Web Developer',
       'Mobile App Developer', 'Product Manager', 'DevOps Engineer',
       'Technical Writer', 'Embedded Systems Engineer',
       'Software Engineer', 'Full Stack Developer', 'IoT Specialist',
       'Algorithm Engineer', 'Bioinformatics Scientist', 'UI/UX Designer',
       'Blockchain Developer', 'Cybersecurity Analyst',
       'Augmented Reality Developer', 'NLP Engineer', 'Data Analyst',
       'Solutions Architect', 'Computer Vision Engineer',
       'Data Scientist', 'AI Researcher', 'Automation Engineer',
       'QA Engineer', 'Database Administrator', 'Cloud Architect',
       'Research Scientist', 'Deep Learning Engineer',
       'AI Ethics Researcher', 'Data Engineer', 'Robotics Engineer',
       'Virtual Reality Developer', 'Network Engineer', 'ML Engineer',
       'System Architect'], dtype=object)

In [27]:
# Number of synthetic career labels generated.
df_large["Career"].shape

(5000,)

In [28]:
# Remove the original target column so it can be replaced with the synthetic
# labels. Reassigning (rather than `inplace=True`) and tolerating an already
# missing column keeps this cell safe to re-run: the in-place version raises
# KeyError when executed twice in a row.
df = df.drop(columns=['Career'], errors='ignore')

In [29]:
# Preview the frame after dropping the Career column.
df.head()

Unnamed: 0,Interest,Skill_Level,Work_Style,Education,Personality,Problem_Solving,Tech,Communication,Leadership,Creativity
0,Art,Expert,Flexible,Masters,Thinker,Creative,Very High,Excellent,Strong,Medium
1,Technology,,Flexible,PhD,Thinker,Logical,,Poor,Moderate,Very High
2,Art,Intermediate,Team,PhD,Extrovert,Intuitive,,Excellent,Collaborative,Low
3,Business,Expert,Remote,Diploma,Extrovert,Intuitive,High,Fluent,,High
4,Technology,Expert,Solo,Bachelors,Thinker,Creative,Very High,Excellent,Visionary,Medium


In [30]:
# Attach the synthetic career labels from df_large as the new target column.
# NOTE(review): those labels are drawn at random, independently of every
# feature column, so the resulting dataset has no feature -> label signal.
#
# Assigning a Series aligns on index; if the two indexes ever differed, the
# unmatched rows would silently receive NaN labels. Fail loudly instead.
assert df.index.equals(df_large.index), "df and df_large must share the same index"
df['Career'] = df_large['Career']

In [31]:
# Preview the frame with the replaced Career column.
df.head()

Unnamed: 0,Interest,Skill_Level,Work_Style,Education,Personality,Problem_Solving,Tech,Communication,Leadership,Creativity,Career
0,Art,Expert,Flexible,Masters,Thinker,Creative,Very High,Excellent,Strong,Medium,Business Analyst
1,Technology,,Flexible,PhD,Thinker,Logical,,Poor,Moderate,Very High,Site Reliability Engineer
2,Art,Intermediate,Team,PhD,Extrovert,Intuitive,,Excellent,Collaborative,Low,Financial Analyst
3,Business,Expert,Remote,Diploma,Extrovert,Intuitive,High,Fluent,,High,Speech Recognition Specialist
4,Technology,Expert,Solo,Bachelors,Thinker,Creative,Very High,Excellent,Visionary,Medium,Game Developer


In [32]:
# Distinct career labels now stored in the frame.
df["Career"].unique()

array(['Business Analyst', 'Site Reliability Engineer',
       'Financial Analyst', 'Speech Recognition Specialist',
       'Game Developer', 'Hardware Engineer', 'Web Developer',
       'Mobile App Developer', 'Product Manager', 'DevOps Engineer',
       'Technical Writer', 'Embedded Systems Engineer',
       'Software Engineer', 'Full Stack Developer', 'IoT Specialist',
       'Algorithm Engineer', 'Bioinformatics Scientist', 'UI/UX Designer',
       'Blockchain Developer', 'Cybersecurity Analyst',
       'Augmented Reality Developer', 'NLP Engineer', 'Data Analyst',
       'Solutions Architect', 'Computer Vision Engineer',
       'Data Scientist', 'AI Researcher', 'Automation Engineer',
       'QA Engineer', 'Database Administrator', 'Cloud Architect',
       'Research Scientist', 'Deep Learning Engineer',
       'AI Ethics Researcher', 'Data Engineer', 'Robotics Engineer',
       'Virtual Reality Developer', 'Network Engineer', 'ML Engineer',
       'System Architect'], dtype=object)

In [33]:
# Dimensions of the frame after the Career column was replaced.
df.shape

(5000, 11)

In [34]:
# Save the relabelled dataset; index=False keeps the row index out of the CSV.
df.to_csv('data/career_dataset.csv', index=False)

In [2]:
import pandas as pd