Load the Data

In [None]:
import pandas as pd

# Read the raw dataset from a CSV file; the path is relative, so it is
# resolved against the kernel's current working directory.
data = pd.read_csv("MOCK_DATA.csv")

Remove Non-Predictive Columns

In [None]:
# Discard the per-student identifier columns before modelling.
# NOTE: this rebinds `data`, so re-running the cell on an already-trimmed
# frame raises KeyError because the columns no longer exist.
data = data.drop(columns=['student_id', 'name'])

Handle Categorical Features

In [None]:
# Binary interest flags: 1 where the answer is exactly "Yes", 0 for every
# other value (including missing entries, which therefore stop being visible
# as nulls after this step).
# NOTE: not re-runnable. On a second run the columns already hold 1/0,
# nothing equals "Yes", and every flag silently becomes 0.
data['interest_stem'] = data['interest_stem'].eq("Yes").astype('int64')
data['interest_arts'] = data['interest_arts'].eq("Yes").astype('int64')


# Numeric gender code: Male -> 0, Female -> 1.
# Any value that is not a key of the mapping becomes NaN.
data['gender'] = data['gender'].map(dict(Male=0, Female=1))


# One-hot encode socio-economic status; drop_first=True leaves out the
# indicator column of the first category.
data = pd.get_dummies(
    data,
    columns=['socioeconomic_status'],
    drop_first=True,
)


Defining the Target Variable

In [None]:
# Add a mock target variable based on existing features.
# This is only an illustrative rule; customize it with real domain knowledge.
def assign_career(row, math_threshold=0.8):
    """Derive a mock career label for a single student record.

    The rules are evaluated in order, so a record that satisfies both the
    STEM rule and the arts rule is labelled "Engineering".

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must provide 'interest_stem' and 'interest_arts' encoded as 1/0.
        'math_grade' is only read when 'interest_stem' equals 1.
    math_threshold : float, default 0.8
        'math_grade' must be strictly greater than this value for the STEM
        rule to fire.
        NOTE(review): the default presumably assumes grades on a 0-1 scale;
        confirm against the dataset.

    Returns
    -------
    str
        "Engineering", "Arts" or "Business".
    """
    if row['interest_stem'] == 1 and row['math_grade'] > math_threshold:
        return "Engineering"
    if row['interest_arts'] == 1:
        return "Arts"
    # Fallback label when neither interest rule applies.
    return "Business"

# Label every record: assign_career is called once per row (axis=1) and the
# results are stored in a new 'career_path' column on `data`.
data['career_path'] = data.apply(assign_career, axis=1)


Splitting the data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# The derived label is the target (y); every remaining column is a feature (X).
# NOTE(review): 'career_path' is computed from interest_stem, interest_arts
# and math_grade, all of which remain in X, so the label can be fully
# recovered from the features. Acceptable for a demo only.
y = data['career_path']
X = data.drop(columns=['career_path'])

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


Check for Missing Values

In [None]:
# Report the number of null entries per column.
# NOTE(review): this runs after the encoding step, which maps missing
# interest answers to 0, so nulls in those columns can no longer show up here.
print(data.isna().sum())
