In [51]:
import pandas as pd
import numpy as np

In [52]:
df = pd.read_csv('data/processed_data.csv')

In [53]:
df = df[df["challenge"]!="challenge-advanced-2018"]

In [54]:
# How many people make it to the second last and last problem in each module (both required)
def get_progression_summary(df):
    challenges = pd.unique(df["challenge"])
    results = []
    for challenge in challenges:
        df_chal = df[df["challenge"] == challenge]
        modules = pd.unique(df_chal["problem"])
    
        for module in modules:
            df_module = df_chal[(df_chal["problem"] == module) & (df_chal["slide_type"] == "problem_slide")]
            total_users = len(df_module["user_id"].unique())
            slides = sorted(df_module["slide"].unique())

            # Ensure there are at least two slides for second-last and last
            if len(slides) >= 2:
                last_slide = slides[-1]
                second_last_slide = slides[-2]

                # Users who ran the last slide
                ran_last_slide_users = set(
                    df_module[
                        (df_module["slide"] == last_slide) & 
                        (df_module["event_name"] == "problem_run")
                    ]["user_id"].unique()
                )

                # Users who ran the second-last slide
                ran_second_last_slide_users = set(
                    df_module[
                        (df_module["slide"] == second_last_slide) & 
                        (df_module["event_name"] == "problem_run")
                    ]["user_id"].unique()
                )

                # Only users who did both
                ran_both_users = ran_second_last_slide_users & ran_last_slide_users

                total_ran_second_last_slide = len(ran_second_last_slide_users)
                total_ran_last_slide = len(ran_last_slide_users)
                total_ran_both = len(ran_both_users)
            
                results.append({
                    "challenge": challenge,
                    "module": module,
                    "unique_users": total_users,
                    "ran_second_last_slide": total_ran_second_last_slide,
                    "ran_last_slide": total_ran_last_slide,
                    "ran_both": total_ran_both,
                    "pct_ran_second_last_slide": round((total_ran_second_last_slide / total_users) * 100, 2),
                    "pct_ran_last_slide": round((total_ran_both / total_ran_second_last_slide) * 100, 2)  # Based only on those who did 2nd last
                })

    # Convert to DataFrame for pretty output
    prog_summary = pd.DataFrame(results)
    return prog_summary


In [55]:
prog_summary = get_progression_summary(df)

In [56]:
def get_avg_progression_by_challenge(prog_summary):
    """
    For a given dataframe of user events with slide_type and event_name,
    computes the average progression stats per challenge based on problem slides.
    """

    # Then group by challenge and average the progression percentages
    challenge_averages = (
        prog_summary
        .groupby("challenge")[["pct_ran_second_last_slide", "pct_ran_last_slide"]]
        .mean()
        .round(2)
        .reset_index()
    )

    return challenge_averages


In [57]:
challenge_averages = get_avg_progression_by_challenge(prog_summary)

In [58]:
challenge_averages

Unnamed: 0,challenge,pct_ran_second_last_slide,pct_ran_last_slide
0,challenge-beginners-2018,88.26,94.63
1,challenge-beginners-blockly-2018,78.42,89.99
2,challenge-intermediate-2018,95.61,91.94
3,challenge-newbies-2018,83.33,91.87


In [59]:
# What is the time gap between the first visit of the final problem and the last visit of the preceding problem
def get_time_gap_summary(df):
    # df = df[(df["slide_type"] == "problem_slide") & ((df["event_name"] == "problem_run") | (df["event_name"] == "slide_steps_complete"))]
    df = df[((df["event_name"] == "problem_run") | (df["event_name"] == "slide_steps_complete"))]
    challenges = pd.unique(df["challenge"])
    results = []

    for challenge in challenges:
        df_chal = df[df["challenge"]==challenge]
        modules = pd.unique(df_chal["problem"])
        for module in modules:
            time_gaps = []
            df_module = df_chal[(df_chal["problem"] == module)]
            df_module = df_module.sort_values(by="created_at")
            slides = sorted(df_module["slide"].unique())
            
            if len(slides) >= 2:
                last_slide = slides[-1]  # Last slide (always problem)
                second_last_slide = slides[1]  # First slide (event slide), check type

                df_second_last = df_module[df_module["slide"] == second_last_slide]
                df_last = df_module[df_module["slide"] == last_slide]

                # Get last time user visited second-last slide
                last_time_second_last = df_second_last.groupby("user_id")["created_at"].max()
                # Get first time user visited last slide
                first_time_last = df_last.groupby("user_id")["created_at"].min()

                # Users who visited both
                common_users = last_time_second_last.index.intersection(first_time_last.index)

                for user in common_users:
                    t1 = last_time_second_last[user]
                    t2 = first_time_last[user]
                    gap = t2 - t1
                    if gap >= 0:
                        time_gaps.append(gap)

                results.append({
                    "challenge": challenge,
                    "module": module,
                    "mean_time_gap_hours": round(np.mean(time_gaps)/3600, 1),
                    "std_time_gap_hours": round(np.std(time_gaps)/3600, 1)
                })

    # Convert to DataFrame for pretty output
    time_gap_summary = pd.DataFrame(results)
    return time_gap_summary



In [60]:
time_gap_summary = get_time_gap_summary(df)
time_gap_summary

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=

Unnamed: 0,challenge,module,mean_time_gap_hours,std_time_gap_hours
0,challenge-newbies-2018,w1p2,215.4,174.8
1,challenge-newbies-2018,w1p1,35.3,100.2
2,challenge-newbies-2018,w2p1,59.8,87.9
3,challenge-newbies-2018,w2p2,199.0,162.0
4,challenge-newbies-2018,w3p1n,76.7,97.5
5,challenge-newbies-2018,w3p2,117.8,125.5
6,challenge-newbies-2018,w4p1,45.2,54.9
7,challenge-newbies-2018,w4p2,87.1,66.8
8,challenge-newbies-2018,w5p1,27.6,41.9
9,challenge-newbies-2018,w5p2,32.9,22.2


In [61]:
def combine_at_challenge_level(df):
    # Group by challenge and calculate combined stats for each challenge
    challenge_results = []
    
    for challenge, challenge_df in df.groupby("challenge"):
        # Unweighted mean for each challenge
        mean = np.average(challenge_df["mean_time_gap_hours"])
        
        # Weighted variance for each challenge
        variance = np.average(
            challenge_df["std_time_gap_hours"]**2
        )  # This is the weighted variance from std
        
        # Compute combined standard deviation
        std = np.sqrt(variance)
        
        challenge_results.append({
            "challenge": challenge,
            "mean_time_gap_hours": round(mean, 2),
            "std_time_gap_hours": round(std, 2)
        })
    
    # Convert to DataFrame for nice output
    return pd.DataFrame(challenge_results)

In [62]:
# Apply the function
challenge_summary = combine_at_challenge_level(time_gap_summary)
print(challenge_summary)


                          challenge  mean_time_gap_hours  std_time_gap_hours
0          challenge-beginners-2018                  NaN                 NaN
1  challenge-beginners-blockly-2018                  NaN                 NaN
2       challenge-intermediate-2018                  NaN                 NaN
3            challenge-newbies-2018                89.68              104.66
