# Researching the data

In [1]:
import pandas as pd

In [18]:
# Read the source synthetic career dataset into the working frame `df`.
df = pd.read_csv(filepath_or_buffer='data/synthetic_career_dataset.csv')

In [3]:
df.head()

Unnamed: 0,Interest,Skill_Level,Work_Style,Education,Personality,Problem_Solving,Tech,Communication,Leadership,Creativity,Career
0,Art,Expert,Flexible,Masters,Thinker,Creative,Very High,Excellent,Strong,Medium,Software Engineer
1,Technology,,Flexible,PhD,Thinker,Logical,,Poor,Moderate,Very High,Marketing Manager
2,Art,Intermediate,Team,PhD,Extrovert,Intuitive,,Excellent,Collaborative,Low,Data Scientist
3,Business,Expert,Remote,Diploma,Extrovert,Intuitive,High,Fluent,,High,Data Scientist
4,Technology,Expert,Solo,Bachelors,Thinker,Creative,Very High,Excellent,Visionary,Medium,Data Scientist


In [4]:
df.shape

(5000, 11)

In [5]:
df.Career.unique()

array(['Software Engineer', 'Marketing Manager', 'Data Scientist',
       'Graphic Designer', 'Doctor'], dtype=object)

In [6]:
df.Career.shape

(5000,)

### Feature Engineering: replacing the target with synthetic career labels

In [8]:
import pandas as pd
import random

# Dedicated, seeded generator: the draws below are reproducible on every
# Restart & Run All and do not depend on (or disturb) the global `random` state.
LABEL_SEED = 42
label_rng = random.Random(LABEL_SEED)

# Define columns and categories:
# ten placeholder feature columns, each with five placeholder category values.
columns = [f"Feature_{i}" for i in range(1, 11)]
categories = {
    f"Feature_{i}": [f"Category_{i}_{j}" for j in range(1, 6)] for i in range(1, 11)
}

# Define a large set of career options (dependent feature)
career_options = [
    "Data Scientist", "ML Engineer", "AI Researcher", "Data Analyst", "Software Engineer",
    "Web Developer", "Blockchain Developer", "Cybersecurity Analyst", "DevOps Engineer",
    "Product Manager", "Cloud Architect", "UI/UX Designer", "Business Analyst", "NLP Engineer",
    "Computer Vision Engineer", "Database Administrator", "System Architect", "Embedded Systems Engineer",
    "IoT Specialist", "Game Developer", "Mobile App Developer", "Full Stack Developer", "Technical Writer",
    "QA Engineer", "Site Reliability Engineer", "AI Ethics Researcher", "Robotics Engineer", "Data Engineer",
    "Deep Learning Engineer", "Bioinformatics Scientist", "Network Engineer", "Solutions Architect",
    "Augmented Reality Developer", "Virtual Reality Developer", "Hardware Engineer", "Automation Engineer",
    "Research Scientist", "Speech Recognition Specialist", "Algorithm Engineer", "Financial Analyst"
]

# Generate synthetic data
data = []
num_samples = 5000  # Larger dataset

# NOTE: every value, including Career, is drawn uniformly and independently of
# the others, so Career has no statistical relationship to the feature columns.
for _ in range(num_samples):
    row = {col: label_rng.choice(categories[col]) for col in columns}
    row["Career"] = label_rng.choice(career_options)
    data.append(row)

# Create DataFrame (one row per sample; built once from the list of records)
df_large = pd.DataFrame(data)
df_large.head()

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,Career
0,Category_1_5,Category_2_5,Category_3_2,Category_4_1,Category_5_4,Category_6_5,Category_7_2,Category_8_4,Category_9_2,Category_10_1,Data Scientist
1,Category_1_4,Category_2_2,Category_3_3,Category_4_4,Category_5_4,Category_6_4,Category_7_5,Category_8_2,Category_9_5,Category_10_1,IoT Specialist
2,Category_1_1,Category_2_1,Category_3_3,Category_4_5,Category_5_3,Category_6_4,Category_7_1,Category_8_1,Category_9_2,Category_10_2,Product Manager
3,Category_1_2,Category_2_1,Category_3_2,Category_4_4,Category_5_3,Category_6_5,Category_7_2,Category_8_2,Category_9_4,Category_10_2,AI Researcher
4,Category_1_4,Category_2_2,Category_3_4,Category_4_3,Category_5_5,Category_6_1,Category_7_2,Category_8_1,Category_9_3,Category_10_1,Data Analyst


In [9]:
df_large.Career.unique()

array(['Data Scientist', 'IoT Specialist', 'Product Manager',
       'AI Researcher', 'Data Analyst', 'DevOps Engineer',
       'Computer Vision Engineer', 'Site Reliability Engineer',
       'Technical Writer', 'Data Engineer', 'Full Stack Developer',
       'Solutions Architect', 'Game Developer', 'Database Administrator',
       'Augmented Reality Developer', 'NLP Engineer',
       'Speech Recognition Specialist', 'ML Engineer', 'Network Engineer',
       'Business Analyst', 'Financial Analyst', 'Web Developer',
       'Deep Learning Engineer', 'Software Engineer',
       'Cybersecurity Analyst', 'Embedded Systems Engineer',
       'Research Scientist', 'Hardware Engineer', 'QA Engineer',
       'AI Ethics Researcher', 'Automation Engineer', 'Robotics Engineer',
       'Virtual Reality Developer', 'Cloud Architect',
       'Mobile App Developer', 'UI/UX Designer', 'System Architect',
       'Blockchain Developer', 'Bioinformatics Scientist',
       'Algorithm Engineer'], dtype=object)

In [10]:
df_large.Career.shape

(5000,)

In [11]:
# Remove the original Career column so it can be replaced by the synthetic labels.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' keeps
# this cell re-runnable: executing it a second time is a no-op, not a KeyError.
df = df.drop(columns=['Career'], errors='ignore')

In [12]:
df.head()

Unnamed: 0,Interest,Skill_Level,Work_Style,Education,Personality,Problem_Solving,Tech,Communication,Leadership,Creativity
0,Art,Expert,Flexible,Masters,Thinker,Creative,Very High,Excellent,Strong,Medium
1,Technology,,Flexible,PhD,Thinker,Logical,,Poor,Moderate,Very High
2,Art,Intermediate,Team,PhD,Extrovert,Intuitive,,Excellent,Collaborative,Low
3,Business,Expert,Remote,Diploma,Extrovert,Intuitive,High,Fluent,,High
4,Technology,Expert,Solo,Bachelors,Thinker,Creative,Very High,Excellent,Visionary,Medium


In [13]:
# Attach the synthetic Career labels to the original feature frame.
# Series assignment aligns on index labels, so a mismatch between the two
# frames would silently yield NaN careers; fail loudly instead.
assert df.index.equals(df_large.index), "df and df_large must share the same index"
df['Career'] = df_large['Career']

In [14]:
df.head()

Unnamed: 0,Interest,Skill_Level,Work_Style,Education,Personality,Problem_Solving,Tech,Communication,Leadership,Creativity,Career
0,Art,Expert,Flexible,Masters,Thinker,Creative,Very High,Excellent,Strong,Medium,Data Scientist
1,Technology,,Flexible,PhD,Thinker,Logical,,Poor,Moderate,Very High,IoT Specialist
2,Art,Intermediate,Team,PhD,Extrovert,Intuitive,,Excellent,Collaborative,Low,Product Manager
3,Business,Expert,Remote,Diploma,Extrovert,Intuitive,High,Fluent,,High,AI Researcher
4,Technology,Expert,Solo,Bachelors,Thinker,Creative,Very High,Excellent,Visionary,Medium,Data Analyst


In [15]:
df.Career.unique()

array(['Data Scientist', 'IoT Specialist', 'Product Manager',
       'AI Researcher', 'Data Analyst', 'DevOps Engineer',
       'Computer Vision Engineer', 'Site Reliability Engineer',
       'Technical Writer', 'Data Engineer', 'Full Stack Developer',
       'Solutions Architect', 'Game Developer', 'Database Administrator',
       'Augmented Reality Developer', 'NLP Engineer',
       'Speech Recognition Specialist', 'ML Engineer', 'Network Engineer',
       'Business Analyst', 'Financial Analyst', 'Web Developer',
       'Deep Learning Engineer', 'Software Engineer',
       'Cybersecurity Analyst', 'Embedded Systems Engineer',
       'Research Scientist', 'Hardware Engineer', 'QA Engineer',
       'AI Ethics Researcher', 'Automation Engineer', 'Robotics Engineer',
       'Virtual Reality Developer', 'Cloud Architect',
       'Mobile App Developer', 'UI/UX Designer', 'System Architect',
       'Blockchain Developer', 'Bioinformatics Scientist',
       'Algorithm Engineer'], dtype=object)

In [16]:
df.shape

(5000, 11)

In [17]:
# Persist the relabelled dataset. index=False keeps the default RangeIndex out
# of the file; otherwise it is written as an unnamed first column and comes
# back as a spurious 'Unnamed: 0' column when the CSV is read again.
df.to_csv('data/career_dataset.csv', index=False)