In [None]:
import numpy as np
import pandas as pd
import os

# Generate a synthetic student dataset (25,000 records).
# The global NumPy seed is fixed so that re-running the cell reproduces the same data.
np.random.seed(42)
n = 25_000

# Student IDs: "SID" followed by a 1-based counter zero-padded to six digits.
ids = [f"SID{i:06d}" for i in range(1, n + 1)]

# The key demographic variables are race, ethnicity, age, gender, and student type.
# The probabilities would normally be determined from the original student
# population data. The values below are hypothetical and only serve to generate
# the synthetic dataset.
# Some variables include an "Unknown" category. This is intentional: it simulates
# real-world data, where some students choose not to disclose certain information.

# Race labels and probabilities
races = [
    "American Indian or Alaska Native",
    "Asian",
    "Black or African American",
    "Native Hawaiian or Other Pacific Islander",
    "White",
    "Two or more races",
    "Unknown",
]
race_probs = [0.03, 0.10, 0.19, 0.03, 0.55, 0.07, 0.03]
race = np.random.choice(races, size=n, p=race_probs)

# Ethnicity labels and probabilities
# NOTE: "Unknown" currently has probability 0.00, so it never appears in the output.
ethnicities = ["Hispanic or Latino", "Not Hispanic or Latino", "Unknown"]
eth_probs = [0.25, 0.75, 0.00]
ethnicity = np.random.choice(ethnicities, size=n, p=eth_probs)

# Gender labels and probabilities
# "Unknown" simulates non-disclosure by students who select 'Prefer not to say'
# during data collection.
genders = ["Female", "Male", "Unknown"]
gender_probs = [0.57, 0.42, 0.01]
gender = np.random.choice(genders, size=n, p=gender_probs)

# Student type labels and probabilities
student_types = [
    "Freshman",
    "Sophomore",
    "Junior",
    "Senior",
    "Continuing Education",
]
student_types_probs = [0.29, 0.20, 0.18, 0.23, 0.10]
student_type = np.random.choice(student_types, size=n, p=student_types_probs)

# Integer code (position in `student_types`) of every row. It is used below to
# look up per-type distribution parameters for all rows at once instead of
# looping over the rows in Python.
type_codes = pd.Categorical(student_type, categories=student_types).codes

# Age
# Age is conditional on the student type (five-level classification).
# Each entry is (mean, standard deviation, minimum age, maximum age).
age_params = {
    "Freshman": (18, 1.0, 17, 18),
    "Sophomore": (19, 1.2, 18, 20),
    "Junior": (20, 1.5, 19, 21),
    "Senior": (21, 1.0, 20, 24),
    "Continuing Education": (25, 6.0, 18, 65),
}
age_table = np.array([age_params[label] for label in student_types], dtype=float)
age_loc, age_scale, age_min, age_max = age_table[type_codes].T
# One normal draw per row, clamped to the range allowed for the row's student
# type, then truncated to whole years.
age = np.clip(
    np.random.normal(loc=age_loc, scale=age_scale), age_min, age_max
).astype(int)

# Overall GPA
# This variable is added to show the complexity of having more variables in the dataset.
# GPA is kept between 0.0 and 4.0, with a different distribution per student type.
# Each entry is (mean, standard deviation). Student types that are not listed
# (Continuing Education) have no GPA in this synthetic dataset.
gpa_params = {
    "Freshman": (3.00, 0.60),
    "Sophomore": (3.05, 0.50),
    "Junior": (3.15, 0.40),
    "Senior": (3.30, 0.20),
}
# Placeholder stored for students without a GPA.
# NOTE: 0.0 is indistinguishable from a real GPA of 0.0 and will pull down any
# mean computed over all rows; filter on StudentType before aggregating.
NO_GPA_VALUE = 0.0

has_gpa = np.isin(student_type, list(gpa_params))
gpa_table = np.array(
    [gpa_params.get(label, (np.nan, np.nan)) for label in student_types]
)
gpa_loc, gpa_scale = gpa_table[type_codes[has_gpa]].T
gpa = np.full(n, NO_GPA_VALUE, dtype=float)
# Random numbers are drawn only for the rows that actually have a GPA.
gpa[has_gpa] = np.random.normal(loc=gpa_loc, scale=gpa_scale)
gpa = np.round(np.clip(gpa, 0.0, 4.0), 2)

# Assemble DataFrame
df = pd.DataFrame({
    "StudentID": ids,
    "Gender": gender,
    "Age": age,
    "Race": race,
    "Ethnicity": ethnicity,
    "StudentType": student_type,
    "OverallGPA": gpa,
})

# Output folder for the dataset; it is created if it does not exist yet.
# The CSV export itself is currently commented out - uncomment the to_csv line
# to write the file. The last expression previews the first ten rows.
save_dir = os.path.expanduser("~/Data Files")
os.makedirs(name=save_dir, exist_ok=True)
# df.to_csv(os.path.join(save_dir, "synthetic_students.csv"), index=False)
df.head(n=10)