# Student Data Generator - Code for Colab


In [2]:
## Step 1: Install Packages
!pip install -q gradio transformers torch huggingface_hub pandas openai

In [3]:
# Install/Update bitsandbytes for quantization (latest version)
!pip install -U bitsandbytes accelerate

In [4]:
# imports

import gradio as gr
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from google.colab import userdata
from huggingface_hub import login
from openai import OpenAI
import os

In [5]:
# Constants

# Hugging Face model id that generate_student_data loads (tokenizer and model)
# when the OpenAI path is not used.
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"

In [6]:
# Sign in to HuggingFace Hub

# The token is read from the Colab secret named HF_TOKEN instead of being hardcoded.
hf_token = userdata.get('HF_TOKEN')
# NOTE(review): add_to_git_credential=True presumably also stores the token in the
# git credential helper - confirm against the huggingface_hub docs.
login(hf_token, add_to_git_credential=True)



## Option 2: Use OpenAI for Text Generation

In [7]:
# Sign in to OpenAI using Secrets in Colab

# The API key is read from the Colab secret named OPENAI_API_KEY.
openai_api_key = userdata.get('OPENAI_API_KEY')
# Note: the name `openai` is bound to a client *instance* here. Later cells test
# `openai` for truthiness and call openai.chat.completions.create on it.
openai = OpenAI(api_key=openai_api_key)

# Chat model name passed to openai.chat.completions.create in generate_student_data
MODEL_NAME = "gpt-4o-mini"

print("✅ OpenAI setup complete")

In [8]:
# Generate Student Data Function

# Holds the Llama tokenizer/model after the first load so repeated calls
# (e.g. every Gradio request) do not reload the weights.
_LLAMA_CACHE = {}


def _load_llama():
    """Load the 4-bit quantized Llama tokenizer and model once, then reuse them."""
    if "model" not in _LLAMA_CACHE:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4"
        )

        tokenizer = AutoTokenizer.from_pretrained(LLAMA)
        tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            LLAMA,
            device_map="auto",
            quantization_config=quantization_config
        )

        _LLAMA_CACHE["tokenizer"] = tokenizer
        _LLAMA_CACHE["model"] = model

    return _LLAMA_CACHE["tokenizer"], _LLAMA_CACHE["model"]


def _extract_student_records(text):
    """Return the first non-empty JSON array of objects embedded in ``text``, or None."""
    decoder = json.JSONDecoder()
    pos = text.find('[')
    while pos != -1:
        try:
            # raw_decode parses one JSON value and ignores whatever follows it,
            # so trailing prose or a closing markdown fence does not matter.
            candidate, _ = decoder.raw_decode(text[pos:])
        except ValueError:
            candidate = None
        if (
            isinstance(candidate, list)
            and candidate
            and all(isinstance(item, dict) for item in candidate)
        ):
            return candidate
        # This '[' was prose or a nested/invalid array; try the next one.
        pos = text.find('[', pos + 1)
    return None


def generate_student_data(description, num_students=10, use_openai=True):
    """
    Generate synthetic student data based on description.

    Args:
        description: Free-text description of the students to generate; it is
            inserted verbatim into the prompt.
        num_students: Number of records to request (must be >= 1).
        use_openai: When True and the module-level ``openai`` client exists,
            use the OpenAI chat API; otherwise use the local Llama model.

    Returns:
        A pandas DataFrame with one row per JSON object returned by the model.
        On any failure a single-column DataFrame named "Error" is returned
        instead; this function does not raise.
    """
    try:
        num_students = int(num_students)
    except (TypeError, ValueError):
        return pd.DataFrame({"Error": ["num_students must be a positive integer"]})
    if num_students < 1:
        return pd.DataFrame({"Error": ["num_students must be a positive integer"]})

    prompt = f"""Generate a dataset of {num_students} synthetic student records.

Description: {description}

For each student, provide:
- Full Name
- Email (matching the name, format: firstname.lastname@university.edu)
- Student ID (format: STU followed by 6 digits)
- Age (between 18-25)
- Major/Program
- GPA (between 2.0-4.0, with 2 decimal places)
- Year (Freshman, Sophomore, Junior, Senior)
- Enrollment Date (YYYY-MM-DD format between 2020-2024)

Return ONLY a JSON array, no other text. Format:
[
  {{
    "name": "John Doe",
    "email": "john.doe@university.edu",
    "student_id": "STU123456",
    "age": 20,
    "major": "Computer Science",
    "gpa": 3.75,
    "year": "Sophomore",
    "enrollment_date": "2022-09-01"
  }}
]
"""

    # The OpenAI setup cell is optional. Look the client up instead of referencing
    # the bare name, so a skipped cell falls back to Llama rather than a NameError.
    client = globals().get("openai") if use_openai else None

    try:
        if client:
            # Use OpenAI (faster)
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "You are a data generator. Return only valid JSON array, no explanations or markdown."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7
            )
            result = response.choices[0].message.content

        else:
            # Use Hugging Face (LLAMA model)
            tokenizer, model = _load_llama()

            messages = [
                {"role": "system", "content": "You are a data generator. Return only valid JSON."},
                {"role": "user", "content": prompt}
            ]

            # return_dict=True also yields the attention mask, which generate()
            # needs because the pad token is set to the EOS token.
            inputs = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                return_tensors="pt",
                return_dict=True
            ).to(model.device)

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    # Rough per-record token budget so large requests are not cut
                    # off mid-array; generation still stops at EOS.
                    max_new_tokens=max(2000, 120 * num_students),
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id
                )

            # Decode only the newly generated tokens, not the echoed prompt.
            prompt_length = inputs["input_ids"].shape[-1]
            result = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # Extract JSON from response (content may be None, e.g. on a refusal)
        records = _extract_student_records(result or "")
        if records is None:
            return pd.DataFrame({"Error": ["Could not parse JSON from response"]})
        return pd.DataFrame(records)

    except Exception as e:
        return pd.DataFrame({"Error": [f"Error generating data: {str(e)}"]})


In [9]:
# 4-bit NF4 quantization settings with double quantization and bfloat16 compute.
# NOTE(review): no visible cell reads `quant_config`; generate_student_data builds
# its own BitsAndBytesConfig with the same values. Confirm whether this cell is
# still needed.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

In [11]:
# Gradio Interface for Student Data Generator

def gradio_generate(description, num_students, use_openai_choice):
    """
    Wrapper function for Gradio.

    Args:
        description: Text from the description textbox.
        num_students: Value of the number input; falsy values (None, 0) fall
            back to 10, anything else is converted with int().
        use_openai_choice: State of the "Use OpenAI" checkbox.

    Returns:
        The DataFrame produced by generate_student_data, or a single-column
        "Error" DataFrame if anything raises.
    """
    try:
        num = int(num_students) if num_students else 10
        # `openai` only exists if the optional OpenAI setup cell was run. Look it
        # up instead of referencing the bare name, which would raise NameError.
        openai_ready = bool(globals().get("openai"))
        use_ai = bool(use_openai_choice) and openai_ready
        return generate_student_data(description, num_students=num, use_openai=use_ai)
    except Exception as e:
        return pd.DataFrame({"Error": [str(e)]})

# Create Gradio interface
# Three inputs (description, count, backend choice) are passed positionally to
# gradio_generate; its DataFrame result is shown in an editable table.
demo = gr.Interface(
    fn=gradio_generate,
    inputs=[
        gr.Textbox(
            label="Describe the student data you want",
            placeholder="e.g., Computer Science students with high GPAs, Engineering majors from 2023, Business students with internships",
            lines=3
        ),
        gr.Number(
            label="Number of students",
            value=10,
            minimum=1,
            maximum=50
        ),
        gr.Checkbox(
            label="Use OpenAI (faster) - uncheck to use Hugging Face LLAMA model",
            # Default to OpenAI only when the optional setup cell created the client.
            # A bare `openai` reference would raise NameError if that cell was skipped.
            value=bool(globals().get("openai"))
        )
    ],
    outputs=gr.Dataframe(
        label="Generated Student Data",
        interactive=True
    ),
    title="🎓 Synthetic Student Data Generator",
    description="Generate realistic student data for testing and development. Describe what kind of students you want!",
    examples=[
        ["Computer Science students with high GPAs", 10, True],
        ["Business majors from 2022", 15, True],
        ["Engineering students with internships", 5, True],
        ["Freshman students enrolled in 2024", 20, True]
    ]
)

# Launch the interface
# NOTE(review): share=True presumably requests a public Gradio share link. Anyone
# with that link could trigger OpenAI calls billed to the configured key - confirm
# this exposure is intended.
demo.launch(share=True)