In [5]:
import ijson
import pandas as pd
from collections import defaultdict

def load_filtered_events(data_path, event_names=('problem_passed', 'problem_run', 'slide_steps_complete')):
    """
    Loads and filters events from a JSON file using a streaming parser.

    The file is read incrementally with ijson, so only the matching events
    are ever held in memory at the same time.

    Parameters:
    - data_path (str): Path to the JSON file (expects a top-level array of JSON objects)
    - event_names (list | tuple | set): Event names to include
      (default: 'problem_passed', 'problem_run', 'slide_steps_complete')

    Returns:
    - pd.DataFrame: One row per matching event. When nothing matches, the
      DataFrame is empty and has no columns.
    """
    filtered_entries = []

    # ijson is a byte-oriented parser: a text-mode handle is deprecated there
    # and would decode the file with the platform default encoding first.
    with open(data_path, 'rb') as f:
        for obj in ijson.items(f, 'item'):
            # Array items that are not JSON objects cannot carry an event name.
            if not isinstance(obj, dict):
                continue
            if 'event_name' in obj and obj['event_name'] in event_names:
                filtered_entries.append(obj)

    return pd.DataFrame(filtered_entries)

In [6]:
# Stream the raw event log; load_filtered_events applies its default event-name filter.
df = load_filtered_events(data_path='data/events.json')

In [7]:
import pandas as pd
import json

def label_slide_types(df, problems_json_path):
    """
    Adds a 'slide_type' column to the DataFrame, indicating whether the (challenge, problem, slide)
    of each row corresponds to a problem_slide or an event_slide.

    A row is a 'problem_slide' when its (challenge, problem, slide) triple equals a
    (course_slug, coursemodule_slug, slide_no) triple from the metadata file; every
    other row is an 'event_slide'.

    Parameters:
    - df: DataFrame containing at least 'challenge', 'problem', 'slide'.
      It is modified in place: 'slide' is converted to nullable Int64 and
      'slide_type' is added.
    - problems_json_path: Path to the JSON file with problem slide metadata
      (records with 'course_slug', 'coursemodule_slug' and 'slide_no')

    Returns:
    - The input DataFrame with an added 'slide_type' column
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(problems_json_path, "r", encoding="utf-8") as f:
        problems = json.load(f)

    problems_df = pd.DataFrame(problems)

    # Build a set of (course_slug, coursemodule_slug, slide_no) for O(1) lookups.
    # NOTE(review): matching assumes slide_no is stored as a number, not a string - confirm.
    problem_slide_keys = set(
        zip(
            problems_df["course_slug"],
            problems_df["coursemodule_slug"],
            problems_df["slide_no"],
        )
    )

    # Slide tokens may be arbitrary text (e.g. "intro" or "3?tab=a"). Coerce
    # anything that is not a whole number to <NA> instead of letting a single
    # bad token abort the conversion for the entire column.
    slide_numbers = pd.to_numeric(df["slide"], errors="coerce")
    if not pd.api.types.is_integer_dtype(slide_numbers):
        is_whole = slide_numbers.mod(1).eq(0).fillna(False).astype(bool)
        slide_numbers = slide_numbers.where(is_whole)
    df["slide"] = slide_numbers.astype("Int64")

    # One pass over the three key columns; avoids building a Series per row
    # as DataFrame.apply(axis=1) does.
    row_keys = zip(df["challenge"], df["problem"], df["slide"])
    df["slide_type"] = [
        "problem_slide" if key in problem_slide_keys else "event_slide"
        for key in row_keys
    ]
    return df


In [8]:
import json
import re

# Function to extract challenge, problem and slide from an event payload
def extract_challenge_problem(event_data_str):
    """
    Pulls the path segments that follow 'learn/' out of the payload's "url" field.

    Parameters:
    - event_data_str (str | bytes | dict): JSON-encoded event payload, or an
      already decoded payload dict. Anything else (None, NaN, ...) is treated
      as a payload without a URL.

    Returns:
    - tuple: (challenge, problem, slide) - the first, second and third path
      segment after 'learn/'. Segments that are not present are None. All three
      are None when the payload cannot be decoded, is not a JSON object, has no
      string "url", or the URL contains no 'learn/<segment>' part.
      A query string ('?...') or fragment ('#...') following a segment is not
      included in that segment.
    """
    missing = (None, None, None)

    if isinstance(event_data_str, dict):
        event_data = event_data_str
    else:
        try:
            event_data = json.loads(event_data_str)
        except (TypeError, ValueError):
            # TypeError: None / NaN / other non-text input.
            # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
            return missing

    if not isinstance(event_data, dict):
        return missing

    url = event_data.get("url")
    if not isinstance(url, str):
        return missing

    # A single scan captures up to three consecutive segments after 'learn/'.
    # '?' and '#' end a segment so that ".../3?tab=a" yields the slide "3".
    match = re.search(r'learn/([^/?#]+)(?:/([^/?#]+)(?:/([^/?#]+))?)?', url)
    if match is None:
        return missing

    return match.group(1), match.group(2), match.group(3)


 

In [9]:
# Run the extractor over every event payload and fan the resulting
# (challenge, problem, slide) triples out into three new columns
df['challenge'], df['problem'], df['slide'] = zip(*map(extract_challenge_problem, df['event_data']))

In [10]:
# Tag each row as 'problem_slide' or 'event_slide' using the metadata in data/problems.json
df = label_slide_types(df=df, problems_json_path="data/problems.json")

In [11]:
# Persist the labelled events; the DataFrame index is written as the first column
df.to_csv(path_or_buf="data/processed_data.csv")