# Course Engagement Data Exploration

This notebook demonstrates how to connect to the DuckDB database and explore the course engagement data.

In [1]:
import pandas as pd
import duckdb
from pathlib import Path

# Database connection - DuckDB file in project root
DB_FILE = Path('../mock_data.duckdb')
conn = duckdb.connect(str(DB_FILE))

print(f"Connected to DuckDB at {DB_FILE}")
print("Database ready for queries!")

Connected to DuckDB at ../mock_data.duckdb
Database ready for queries!


## Query Methods

You can query DuckDB using the connection object:
- `conn.execute(query).df()` - Returns a pandas DataFrame
- `conn.execute(query).fetchall()` - Returns raw results
- `conn.sql(query)` - Alternative syntax

### 1. Explore Raw Users

In [2]:
conn.execute("SELECT * FROM raw.users LIMIT 10;").df()

Unnamed: 0,id,fullName,email,signupDate,state,isGovEmployee,updatedAt,deleted
0,1,Jane Doe,jane@example.com,2023-01-02,active,True,2023-03-01 12:00:00,
1,1,Jane Doe,jane@example.com,2023-01-02,active,True,2023-02-15 10:00:00,
2,2,John Smith,john@example.com,2023-01-10,suspended,False,2023-02-01 09:00:00,
3,3,Alice Wong,alice@example.com,2023-01-05,active,True,2023-02-20 08:30:00,
4,4,Bob Lee,bob@example.com,2023-01-05,deleted,False,2023-01-15 10:00:00,True


### 2. Explore Courses

In [3]:
enrolments_df = conn.execute("""
    SELECT 
        e.*,
        u.fullname,
        c.title as course_title
    FROM raw.enrolments e
    JOIN raw.users u ON e.user_id = u.id
    JOIN raw.courses c ON e.course_id = c.course_id
""").df()

enrolments_df

Unnamed: 0,enrolment_id,user_id,course_id,enrolled_at,status,fullName,course_title
0,5001,1,101,2023-03-01 09:00:00,active,Jane Doe,Leadership Basics
1,5002,1,102,2023-03-05 10:00:00,completed,Jane Doe,Advanced Policy
2,5003,2,101,2023-03-02 11:00:00,active,John Smith,Leadership Basics
3,5004,3,103,2023-03-07 12:00:00,active,Alice Wong,Wellbeing at Work
4,5001,1,101,2023-03-01 09:00:00,active,Jane Doe,Leadership Basics
5,5002,1,102,2023-03-05 10:00:00,completed,Jane Doe,Advanced Policy


### 3. Explore Events

In [4]:
conn.execute("""
    SELECT 
        event_type,
        COUNT(*) as count
    FROM raw.events
    GROUP BY event_type
    ORDER BY count DESC;
""").df()

Unnamed: 0,event_type,count
0,video_start,2
1,other,1
2,quiz_start,1
3,video_complete,1
4,quiz_submit,1


### 4. Data Quality Checks

In [5]:
# Check for duplicate users
duplicate_check = conn.execute("""
    SELECT 
        id,
        COUNT(*) as record_count
    FROM raw.users
    WHERE deleted IS NULL OR deleted = FALSE
    GROUP BY id
    HAVING COUNT(*) > 1;
""").df()

print(f"Users with duplicate records: {len(duplicate_check)}")
duplicate_check

Users with duplicate records: 1


Unnamed: 0,id,record_count
0,1,2


In [6]:
# Check for orphaned enrolments (no matching user or course)
orphaned_check = conn.execute("""
    SELECT 
        'Missing User' as issue_type,
        COUNT(*) as count
    FROM raw.enrolments e
    LEFT JOIN raw.users u ON e.user_id = u.id
    WHERE u.id IS NULL
    
    UNION ALL
    
    SELECT 
        'Missing Course' as issue_type,
        COUNT(*) as count
    FROM raw.enrolments e
    LEFT JOIN raw.courses c ON e.course_id = c.course_id
    WHERE c.course_id IS NULL;
""").df()

orphaned_check

Unnamed: 0,issue_type,count
0,Missing User,0
1,Missing Course,0


## Next Steps

Use this notebook to:
1. Explore the raw data and understand its structure
2. Test SQL transformations before implementing them in dbt
3. Validate your dbt models after they run
4. Perform ad-hoc analysis

## Messy Query

In [7]:
# Check for orphaned enrolments (no matching user or course)
messy_query = conn.execute("""
WITH users_cleaned AS (
    SELECT
        u.id AS userId,
        u.fullName,
        list_element(string_split(u.fullName, ' '), 1) AS firstName,
        list_element(string_split(u.fullName, ' '), 2) AS lastName,
        u.email AS EmailAddress,
        u.signupDate,
        u.state AS user_state,
        u.signupDate,  -- duplicate column
        COALESCE(u.isGovEmployee, FALSE) AS isGovEmployee,  -- inconsistent boolean casting
        ROW_NUMBER() OVER (PARTITION BY u.id ORDER BY u.updatedAt DESC) AS rn
    FROM
        raw.users AS u
    WHERE
        u.deleted IS NULL OR u.deleted = FALSE
),

deduped_users AS (
    SELECT *
    FROM users_cleaned
    WHERE rn = 1
),

course_stuff AS (
    SELECT
        c.course_id,
        c.title,
        c.category_name,
        CASE WHEN c.category_name = '' THEN NULL ELSE c.category_name END AS cat_clean,  -- pointless
        c.course_created_at,
        c.course_created_at AS created,
        c.course_created_at,  -- duplicate column
        c.level,
        c.level AS courseLevel,
        c.publisher,
        c.publisher AS pub
    FROM raw.courses AS c
),

enrols AS (
    -- duplicate logic repeated later in events section
    SELECT
        e.enrolment_id,
        e.user_id AS uID,
        e.course_id,
        e.enrolled_at,
        e.status,
        e.status AS enrol_status,
        CAST(e.enrolled_at AS DATE) AS enrol_date,
        COALESCE(e.enrolled_at, current_timestamp) AS enrol_ts
    FROM raw.enrolments e
),

events AS (
    SELECT
        ev.id,
        ev.user_id,
        ev.course_id,
        ev.event_type,
        ev.event_timestamp,
        CAST(ev.event_timestamp AS DATE) AS event_date,
        CASE
            WHEN event_type IN ('video_start', 'video_complete') THEN 'video'
            WHEN event_type IN ('quiz_start', 'quiz_submit') THEN 'quiz'
            ELSE 'other'
        END AS event_group,
        ev.session_id,
        ev.metadata,
        list_element(string_split(ev.metadata, ':'), 2) AS meta_value  -- nonsense parsing
    FROM raw.events ev
),

-- Unused CTE
pointless_cte AS (
    -- DuckDB doesn't support EXCEPT syntax, so we list columns manually
    SELECT
        id, user_id, course_id, event_type, event_timestamp,
        event_date, event_group, session_id, metadata
    FROM events
),

combined AS (
    SELECT
        u.userId,
        u.fullName,
        u.EmailAddress,
        u.isGovEmployee,
        u.user_state,
        e.course_id,
        c.title,
        c.category_name,
        e.enrolment_id,
        e.enrolled_at,
        e.status,
        ev.event_type,
        ev.event_group,
        ev.event_timestamp,
        ev.session_id,
        ev.meta_value,

        -- redundant CASE logic
        CASE
            WHEN ev.event_type = 'video_start' THEN 1
            WHEN ev.event_type = 'video_complete' THEN 1
            WHEN ev.event_type = 'quiz_start' THEN 1
            WHEN ev.event_type = 'quiz_submit' THEN 1
            ELSE 0
        END AS engagement_flag,

        -- weird metric duplication
        CASE
            WHEN ev.event_type IN ('quiz_submit') THEN 1 ELSE 0 END AS completed_quiz,
        CASE
            WHEN ev.event_type IN ('video_complete') THEN 1 ELSE 0 END AS completed_video
    FROM deduped_users u
    LEFT JOIN enrols e ON e.uID = u.userId
    LEFT JOIN course_stuff c ON c.course_id = e.course_id
    LEFT JOIN events ev ON ev.user_id = u.userId AND ev.course_id = e.course_id
),

aggregated AS (
    SELECT
        course_id,
        title,
        COUNT(DISTINCT userId) AS learners,
        COUNT(DISTINCT CASE WHEN engagement_flag = 1 THEN userId END) AS active_learners,
        SUM(completed_quiz) AS total_quizzes_completed,
        SUM(completed_video) AS total_videos_completed,
        COUNT(*) AS total_events,   -- incorrect grain: row explosion from joins
        approx_count_distinct(session_id) AS approx_sessions,
        MIN(event_timestamp) AS first_activity,
        MAX(event_timestamp) AS last_activity
    FROM combined
    GROUP BY 1,2
)

SELECT *
FROM aggregated
ORDER BY learners DESC;

""").df()

messy_query

Unnamed: 0,course_id,title,learners,active_learners,total_quizzes_completed,total_videos_completed,total_events,approx_sessions,first_activity,last_activity
0,101,Leadership Basics,2,2,0.0,1.0,3,2,2023-03-01 09:05:00,2023-03-02 11:05:00
1,102,Advanced Policy,1,1,1.0,0.0,2,1,2023-03-06 10:30:00,2023-03-06 10:35:00
2,103,Wellbeing at Work,1,0,0.0,0.0,1,1,2023-03-07 12:30:00,2023-03-07 12:30:00
