# ALGA-Ed Notebook
Includes dataset loading, preprocessing, synthetic data generation, RL training, DALL·E image generation, Whisper transcription, and visualizations.

In [None]:
!pip install openai transformers torch pandas numpy matplotlib seaborn scikit-learn

In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import openai
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from IPython.display import Image, display
import os


In [None]:
# Set your OpenAI API key.
# Prefer the OPENAI_API_KEY environment variable so the real secret never has
# to be pasted into (and saved with) the notebook; the placeholder string is
# kept as the fallback so the cell still runs unchanged when it is unset.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'YOUR_OPENAI_API_KEY')

## Load and Preprocess EdNet Dataset

In [None]:

def preprocess_ednet(file_path='ednet_data/KT1/problem_log.csv'):
    """Load an EdNet problem log and derive per-row engagement features.

    Args:
        file_path: Path to the CSV log. It must provide the columns
            ``elapsed_time``, ``correct`` and ``user_id``.

    Returns:
        A DataFrame with the columns ``elapsed_time`` (clipped to
        [0, 60000] and min-max scaled), ``accuracy`` (mean of ``correct``
        per user), ``error_rate`` (``1 - accuracy``) and ``correct``.
        When ``file_path`` does not exist a message is printed and an empty
        DataFrame is returned instead.
    """
    if not os.path.exists(file_path):
        print("EdNet file not found.")
        return pd.DataFrame()

    # Rows lacking either the timing or the outcome are unusable.
    log = pd.read_csv(file_path).dropna(subset=['elapsed_time', 'correct'])

    # Cap the timing before scaling so extreme values do not dominate the range.
    log['elapsed_time'] = log['elapsed_time'].clip(0, 60000)
    log[['elapsed_time']] = MinMaxScaler().fit_transform(log[['elapsed_time']])

    # Per-user mean correctness, joined back onto every row of that user.
    per_user = (
        log.groupby('user_id')['correct']
        .mean()
        .reset_index()
        .rename(columns={'correct': 'accuracy'})
    )
    log = log.merge(per_user, on='user_id')
    log['error_rate'] = 1 - log['accuracy']

    return log[['elapsed_time', 'accuracy', 'error_rate', 'correct']]

# ednet_data = preprocess_ednet()


## Load and Preprocess ASSISTments Dataset

In [None]:

def preprocess_assistments(file_path='assistments_2009.csv'):
    """Load an ASSISTments log and derive per-row engagement features.

    Args:
        file_path: Path to the CSV log. It must provide the columns
            ``hint_count``, ``time``, ``correct`` and ``student_id``.

    Returns:
        A DataFrame with the columns ``hint_count`` and ``time`` (clipped,
        then min-max scaled), ``accuracy`` (mean of ``correct`` per
        student), ``error_rate`` (``1 - accuracy``) and ``correct``.
        When ``file_path`` does not exist a message is printed and an empty
        DataFrame is returned instead.
    """
    if not os.path.exists(file_path):
        print("ASSISTments file not found.")
        return pd.DataFrame()

    scaled_cols = ['hint_count', 'time']

    # Rows lacking the hint count or the outcome are dropped; a missing
    # `time` is instead filled with 100 before clipping.
    records = pd.read_csv(file_path).dropna(subset=['hint_count', 'correct'])
    records['hint_count'] = records['hint_count'].clip(0, 10)
    records['time'] = records['time'].fillna(100).clip(0, 10000)
    records[scaled_cols] = MinMaxScaler().fit_transform(records[scaled_cols])

    # Per-student mean correctness, joined back onto every row of that student.
    per_student = (
        records.groupby('student_id')['correct']
        .mean()
        .reset_index()
        .rename(columns={'correct': 'accuracy'})
    )
    records = records.merge(per_student, on='student_id')
    records['error_rate'] = 1 - records['accuracy']

    return records[['hint_count', 'time', 'accuracy', 'error_rate', 'correct']]

# assist_data = preprocess_assistments()


## Synthetic Dataset for Disabled Students

In [None]:

def generate_synthetic_disabled_data(n_samples=1000):
    """Generate a reproducible synthetic table of engagement profiles.

    The global NumPy RNG is re-seeded with 42 on every call, so the same
    ``n_samples`` always yields the same rows.

    Args:
        n_samples: Number of profiles (rows) to draw.

    Returns:
        A DataFrame with the columns ``elapsed_time``, ``error_rate``,
        ``hint_usage`` (each drawn uniformly from a label-specific range)
        and ``disability`` (one of 'ADHD', 'dyslexia', 'visual_impairment',
        sampled with probabilities 0.4 / 0.4 / 0.2).
    """
    # (low, high) sampling bounds per feature for each label. The feature
    # order fixes the order of RNG draws, so it must not be rearranged.
    bounds = {
        'ADHD': {
            'elapsed_time': (0.6, 1.0),
            'error_rate': (0.3, 0.6),
            'hint_usage': (0.2, 0.5),
        },
        'dyslexia': {
            'elapsed_time': (0.7, 1.0),
            'error_rate': (0.4, 0.7),
            'hint_usage': (0.4, 0.7),
        },
        'visual_impairment': {
            'elapsed_time': (0.8, 1.0),
            'error_rate': (0.2, 0.4),
            'hint_usage': (0.6, 0.9),
        },
    }
    labels = list(bounds)

    np.random.seed(42)
    rows = []
    for _ in range(n_samples):
        label = str(np.random.choice(labels, p=[0.4, 0.4, 0.2]))
        row = {
            feature: np.random.uniform(low, high)
            for feature, (low, high) in bounds[label].items()
        }
        row['disability'] = label
        rows.append(row)
    return pd.DataFrame(rows)

# Build the synthetic cohort with the default sample count (1000) and preview
# its first rows as the cell output.
synthetic_df = generate_synthetic_disabled_data()
synthetic_df.head()


## Visualizations of Engagement by Disability

In [None]:

# One box plot per engagement metric, grouped by disability label, laid out on
# a 2x2 grid (the fourth slot is left empty).
plt.figure(figsize=(16, 12))

# (column to plot, seaborn palette, panel title) for each grid slot, in order.
panels = [
    ('elapsed_time', 'Set2', 'Elapsed Time by Disability'),
    ('error_rate', 'Set3', 'Error Rate by Disability'),
    ('hint_usage', 'Set1', 'Hint Usage by Disability'),
]
for slot, (metric, colors, heading) in enumerate(panels, start=1):
    plt.subplot(2, 2, slot)
    sns.boxplot(x='disability', y=metric, data=synthetic_df, palette=colors)
    plt.title(heading)

plt.tight_layout()
# The overall title is placed slightly above the axes area (y > 1).
plt.suptitle('Engagement Profiles by Disability', fontsize=16, y=1.03)
plt.show()


## GPT2 Text Generation

In [None]:

class TextGenerator:
    """Small wrapper that completes a prompt with the pretrained "gpt2" model."""

    def __init__(self):
        """Load the pretrained "gpt2" tokenizer and language-model head."""
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        self.model = GPT2LMHeadModel.from_pretrained("gpt2")

    def generate(self, prompt, max_length=100):
        """Return the decoded model continuation of ``prompt``.

        Args:
            prompt: Text to condition the generation on.
            max_length: Upper bound on the total token length (prompt plus
                continuation) passed to the model's ``generate``.

        Returns:
            The generated sequence decoded to a string, with special tokens
            removed.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt")
        outputs = self.model.generate(
            inputs['input_ids'],
            # Forward the tokenizer's mask explicitly instead of leaving
            # `generate` to infer it from the input ids.
            attention_mask=inputs['attention_mask'],
            # GPT-2 defines no pad token; reusing EOS is the usual choice and
            # avoids the "pad_token_id was not set" fallback at run time.
            pad_token_id=self.tokenizer.eos_token_id,
            max_length=max_length,
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

# Instantiate the generator (this loads the pretrained "gpt2" tokenizer and
# model) and print one sample completion.
tg = TextGenerator()
print(tg.generate("Explain gravity to a 10-year-old"))


## PyTorch RL Agent and Training

In [None]:

class FeedbackAgent(nn.Module):
    """Two-layer fully connected network mapping a state vector to action scores.

    The network is ``Linear(state_dim, 64) -> ReLU -> Linear(64, action_dim)``
    and is stored as the ``fc`` submodule.
    """

    def __init__(self, state_dim, action_dim):
        """Create the layers.

        Args:
            state_dim: Size of the input state vector.
            action_dim: Number of output values.
        """
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the network and return the raw outputs."""
        scores = self.fc(x)
        return scores

# Build a network with 3 inputs and 2 outputs, an Adam optimizer over its
# parameters, and a mean-squared-error loss.
agent = FeedbackAgent(3, 2)
optimizer = optim.Adam(agent.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Hand-written (state, target) pairs: three state values mapped to two target
# values each.
# NOTE(review): the meaning of the individual state/target components is not
# stated in this cell — confirm with the author.
data = [
    ([0.6, 0.2, 0.8], [0.7, 0.3]),
    ([0.9, 0.1, 0.9], [0.95, 0.05]),
    ([0.4, 0.3, 0.5], [0.5, 0.5]),
    ([0.2, 0.5, 0.3], [0.3, 0.7]),
]

# Fit the network for 100 passes over `data`, taking one optimizer step per
# example (no batching).
# NOTE(review): as written this is regression onto the fixed targets with MSE;
# no reward signal or environment interaction appears in this cell.
for epoch in range(100):
    total_loss = 0  # sum of the per-example losses within this epoch
    for state_vals, target_vals in data:
        state = torch.tensor(state_vals, dtype=torch.float32)
        target = torch.tensor(target_vals, dtype=torch.float32)
        output = agent(state)
        loss = criterion(output, target)
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report progress on every 20th epoch (0, 20, 40, 60, 80).
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# Evaluate the trained network on one state that is not in `data`.
print("Test output:", agent(torch.tensor([0.8, 0.2, 0.9])).detach().numpy())


## DALL·E Image Generation

In [None]:

# Request one 512x512 image for the prompt and show it inline.
# `openai.Image.create` only exists in openai<1.0; the 1.x library (which an
# unpinned `pip install openai` pulls in) removed it in favour of
# `openai.images.generate`, so pick the call that matches the installed version.
_image_request = dict(
  prompt="high contrast diagram of the solar system for visually impaired students",
  n=1,
  size="512x512"
)
if hasattr(openai, "OpenAI"):
    # openai>=1.0: module-level client that uses `openai.api_key`; the
    # response is an object with attribute access.
    response = openai.images.generate(**_image_request)
    image_url = response.data[0].url
else:
    # openai<1.0: legacy resource class returning a dict-like response.
    response = openai.Image.create(**_image_request)
    image_url = response['data'][0]['url']
display(Image(url=image_url))


## Whisper Audio Transcription

In [None]:

# Upload audio files through the Colab file picker and transcribe each one
# with the "whisper-1" model. This cell only runs inside Google Colab.
from google.colab import files
uploaded = files.upload()

for fname in uploaded:
    with open(fname, "rb") as audio_file:
        # `openai.Audio.transcribe` only exists in openai<1.0; the 1.x library
        # replaced it with `openai.audio.transcriptions.create`, whose result
        # exposes the text as an attribute instead of a dict key.
        if hasattr(openai, "OpenAI"):
            transcript = openai.audio.transcriptions.create(model="whisper-1", file=audio_file)
            transcript_text = transcript.text
        else:
            transcript = openai.Audio.transcribe("whisper-1", audio_file)
            transcript_text = transcript["text"]
        print(f"Transcript for {fname}:", transcript_text)


## Test Cases for ALGA-Ed System

In [None]:

import pytest
import pandas as pd
import numpy as np

# Dummy functions (to be replaced by actual implementation functions)
def preprocess_data(df):
    """Return a copy of ``df`` with every missing value replaced by 0."""
    filled = df.fillna(value=0)
    return filled
def normalize_features(df):
    """Min-max scale ``df`` to the [0, 1] range, column by column.

    A column whose values are all equal has a zero range; dividing by it
    would yield NaN for every row, so such a column is mapped to 0 instead.

    Args:
        df: Numeric DataFrame (a Series is scaled as a single column).

    Returns:
        An object of the same shape as ``df`` holding the scaled values.
    """
    lowest = df.min()
    span = df.max() - lowest
    if isinstance(span, pd.Series):
        # Replace zero ranges with 1 so (x - min) / span evaluates to 0.
        span = span.where(span != 0, 1)
    elif span == 0:
        span = 1
    return (df - lowest) / span
def simulate_ednet_data(n):
    """Return ``n`` random rows with ``response_time``, ``correct`` and ``attempts``.

    ``response_time`` is uniform on [0, 1), ``correct`` is 0 or 1 and
    ``attempts`` is 1, 2 or 3, all drawn from the global NumPy RNG.
    """
    # The draw order is kept so a seeded RNG produces the same frame.
    response_time = np.random.rand(n)
    correct = np.random.randint(0, 2, n)
    attempts = np.random.randint(1, 4, n)
    return pd.DataFrame({
        'response_time': response_time,
        'correct': correct,
        'attempts': attempts,
    })
def extract_features(df):
    """Return only the ``response_time``, ``correct`` and ``attempts`` columns of ``df``."""
    wanted = ['response_time', 'correct', 'attempts']
    return df[wanted]
def generate_synthetic_data(num_samples):
    """Return ``num_samples`` rows with a random ``disability_type`` of 'ADHD' or 'dyslexia'."""
    labels = ['ADHD', 'dyslexia']
    drawn = np.random.choice(labels, num_samples)
    return pd.DataFrame({'disability_type': drawn})
def compute_reward(old, new):
    """Return the improvement ``new - old``, floored at 0 when there is none."""
    gain = new - old
    return gain if gain > 0 else 0
def rl_policy(state):
    """Return, as a plain ``int``, the index of the largest entry of ``state``."""
    best_index = np.argmax(state)
    return int(best_index)
def update_profile(profile, feedback):
    """Merge ``feedback`` into ``profile`` in place and return the same ``profile`` object."""
    profile.update(feedback)
    return profile
def simplify_text(text):
    """Return "Simple: " followed by the part of ``text`` before its first period."""
    first_sentence, _, _ = text.partition('.')
    return f"Simple: {first_sentence}"
def generate_educational_visual(prompt):
    """Placeholder image generator: ignores ``prompt`` and returns a fixed marker string."""
    placeholder_url = "Generated_Image_URL"
    return placeholder_url

# Test cases
def test_missing_values_handling():
    """preprocess_data must leave no missing cell behind."""
    frame = pd.DataFrame({"score": [85, np.nan, 70]})
    cleaned = preprocess_data(frame)
    assert cleaned.notnull().values.all()

def test_normalization_range():
    """Every normalized value must lie inside [0, 1]."""
    frame = pd.DataFrame({"engagement_time": [10, 20, 30]})
    scaled = normalize_features(frame)["engagement_time"]
    assert scaled.between(0, 1).all()

def test_feature_shape():
    """Feature extraction must keep one row per simulated record."""
    n_rows = 10
    features = extract_features(simulate_ednet_data(n_rows))
    assert features.shape[0] == n_rows

def test_feature_columns_exist():
    """The extracted features must contain the three expected columns."""
    required = {"response_time", "correct", "attempts"}
    present = set(extract_features(simulate_ednet_data(5)).columns)
    assert required <= present

def test_synthetic_behavior_distribution():
    """A 100-row synthetic sample must contain more than one disability type."""
    sample = generate_synthetic_data(num_samples=100)
    distinct_types = sample['disability_type'].nunique()
    assert distinct_types > 1

def test_synthetic_data_types():
    """The synthetic table must have the requested size and string labels.

    The previous check (number of numeric columns >= 0) could never fail, so
    it is replaced by assertions that actually constrain the output.
    """
    data = generate_synthetic_data(20)
    assert len(data) == 20
    assert all(isinstance(label, str) for label in data['disability_type'])

def test_reward_positive_for_improvement():
    """An increase in score must produce a strictly positive reward."""
    reward = compute_reward(50, 70)
    assert reward > 0

def test_rl_policy_returns_action():
    """The policy must return its action as a plain int."""
    action = rl_policy(np.array([0.2, 0.5, 0.7]))
    assert isinstance(action, int)

def test_feedback_updates_profile():
    """Keys supplied as feedback must show up in the returned profile."""
    updated = update_profile({"level": "intermediate"}, {"mistakes": 3})
    assert "mistakes" in updated

def test_text_simplification():
    """The simplified sentence must be shorter than ten words."""
    words = simplify_text("Photosynthesis is a biochemical process.").split()
    assert len(words) < 10

def test_image_generation():
    """Image generation must return something for a prompt."""
    visual = generate_educational_visual("basic algebra")
    assert visual is not None

def test_pipeline_flow():
    """Run simulate -> preprocess -> extract and sanity-check the accuracy.

    Accuracy is the mean of the ``correct`` column, so it is checked against
    both bounds; the previous lower-bound-only check let any value above 1
    pass.
    """
    raw = simulate_ednet_data(20)
    pre = preprocess_data(raw)
    feat = extract_features(pre)
    result = {"accuracy": feat['correct'].mean()}
    assert 0 <= result["accuracy"] <= 1

print("All test cases defined. Run using pytest or call individually.")
