# Create the Synthetic Data

In [None]:
# Libraries 
import pandas as pd
import random
from datetime import datetime, timedelta

## Set the variables (this can change as needed)

In [None]:
# Define constants
NUM_QUIZZES = 15
NUM_STUDENTS_PER_QUIZ = 20
QUESTIONS_PER_QUIZ = 25
NUM_SUBJECTS = 7
TOTAL_STUDENTS = NUM_QUIZZES * NUM_STUDENTS_PER_QUIZ
NUM_UNIVERSITIES = 5

## Some of the rules surrounding the breakdowns

In [None]:
# Possible subjects and their corresponding sections
subjects = ["Math", "Science", "History", "English", "Economics", "Physics", "Biology"]
sections = {subject: [f"{subject} Section {i+1}" for i in range(10)] for subject in subjects}

# Possible majors
majors = ["Math", "Science", "History", "English", "Economics", "Engineering", "Biology"]

# Possible universities
universities = ["University A", "University B", "University C", "University D", "University E"]

# Generate unique students
students = [str(10000 + i) for i in range(TOTAL_STUDENTS)]  # Unique IDs for all students

# Generate quiz dates
start_date = datetime(2024, 1, 1)
quiz_dates = [start_date + timedelta(days=7*i) for i in range(NUM_QUIZZES)]

## Build the dataset

In [1]:
# Generate dataset
records = []
for quiz_id in range(1, NUM_QUIZZES + 1):
    quiz_students = random.sample(students, NUM_STUDENTS_PER_QUIZ)  # Select unique students per quiz
    for idx, student_id in enumerate(quiz_students):
        grade = random.randint(1, 4)  # Freshman to Senior
        major = random.choice(majors)
        university = universities[idx % NUM_UNIVERSITIES]  # Evenly distribute students across universities
        for question_num in range(1, QUESTIONS_PER_QUIZ + 1):
            subject = random.choice(subjects)
            section = random.choice(sections[subject])
            correct = random.choice([True, False])  # Random correctness
            date = quiz_dates[quiz_id - 1].strftime('%Y-%m-%d')
            
            records.append([
                student_id, 
                quiz_id, 
                f"{quiz_id}-{question_num}", 
                date, 
                subject, 
                section, 
                major, 
                grade, 
                university,
                correct
            ])

# Create DataFrame
df = pd.DataFrame(records, columns=[
    "Student ID", "Quiz ID", "Question ID", "Date", "Subject", 
    "Section of said Subject", "Major", "Grade of Student", "University", "Correct"
])

df

Unnamed: 0,Student ID,Quiz ID,Question ID,Date,Subject,Section of said Subject,Major,Grade of Student,University,Correct
0,10108,1,1-1,2024-01-01,History,History Section 10,History,1,University A,False
1,10108,1,1-2,2024-01-01,Biology,Biology Section 9,History,1,University A,True
2,10108,1,1-3,2024-01-01,Biology,Biology Section 5,History,1,University A,False
3,10108,1,1-4,2024-01-01,Biology,Biology Section 4,History,1,University A,False
4,10108,1,1-5,2024-01-01,English,English Section 2,History,1,University A,False
...,...,...,...,...,...,...,...,...,...,...
7495,10241,15,15-21,2024-04-08,Biology,Biology Section 7,English,3,University E,True
7496,10241,15,15-22,2024-04-08,History,History Section 10,English,3,University E,False
7497,10241,15,15-23,2024-04-08,History,History Section 8,English,3,University E,False
7498,10241,15,15-24,2024-04-08,Economics,Economics Section 5,English,3,University E,True


## Output the dataset

In [2]:
df.to_csv('student_data.csv', index=False)