In [31]:
import pandas as pd
import numpy as np

In [45]:
df = pd.read_csv('data/processed_data.csv')

In [46]:
df["created_at"][0]

np.int64(1532680322)

In [33]:
challenges = pd.unique(df["challenge"])
print(challenges)

['challenge-newbies-2018' 'challenge-beginners-2018'
 'challenge-intermediate-2018' 'challenge-advanced-2018'
 'challenge-beginners-blockly-2018']


In [49]:
# Get list of challenges
challenges = pd.unique(df["challenge"])

# Set up a results list to collect summary rows
results = []

for challenge in challenges:
    # Filter all rows for this challenge
    df_chal = df[df["challenge"] == challenge]

    # All unique users in this challenge
    all_users = set(df_chal["user_id"].unique())

    # Users who ran problem w5p1 in this challenge
    ran_w5p1_users = set(
        df_chal[
            (df_chal["event_name"] == "problem_run") & 
            (df_chal["problem"] == "w5p1")
        ]["user_id"].unique()
    )

    # Users who ran problem w5p2 in this challenge
    ran_w5p2_users = set(
        df_chal[
            (df_chal["event_name"] == "problem_run") & 
            (df_chal["problem"] == "w5p2")
        ]["user_id"].unique()
    )

    # Append results
    results.append({
        "challenge": challenge,
        "unique_users": len(all_users),
        "ran_w5p1": len(ran_w5p1_users),
        "ran_w5p2": len(ran_w5p2_users)
    })

# Convert to DataFrame for pretty output
summary_df = pd.DataFrame(results)

summary_df["pct_ran_w5p1"] = (summary_df["ran_w5p1"] / summary_df["unique_users"]) * 100
summary_df["pct_ran_w5p2_of_w5p1"] = (summary_df["ran_w5p2"] / summary_df["ran_w5p1"]) * 100

# Round for readability
summary_df = summary_df.round({
    "pct_ran_w5p1": 2,
    "pct_ran_w5p2_of_w5p1": 2
})

# Print the summary
print(summary_df)


                          challenge  unique_users  ran_w5p1  ran_w5p2  \
0            challenge-newbies-2018          3976       542       450   
1          challenge-beginners-2018          8085      2946      2484   
2       challenge-intermediate-2018          4868      1771      1592   
3           challenge-advanced-2018          1326         0         0   
4  challenge-beginners-blockly-2018          2521       290       194   

   pct_ran_w5p1  pct_ran_w5p2_of_w5p1  
0         13.63                 83.03  
1         36.44                 84.32  
2         36.38                 89.89  
3          0.00                   NaN  
4         11.50                 66.90  


In [43]:
print(df["event_data"][0])

{"problem_status":null,"url":"https:\/\/groklearning.com\/learn\/challenge-newbies-2018\/w1p2\/5\/","problem_id":5018}


In [47]:
import pandas as pd

# Ensure 'created_at' is in datetime format
# df["created_at"] = pd.to_datetime(df["created_at"])

# Filter to relevant problem_run events for w5p1 and w5p2
df_runs = df[
    (df["event_name"] == "problem_run") &
    (df["problem"].isin(["w5p1", "w5p2"]))
]

print(df_runs.head())

# Sort and get first run time for each user and problem
first_runs = (
    df_runs.sort_values("created_at")
           .groupby(["user_id", "problem"])["created_at"]
           .first()
           .unstack()
)

# Drop users who didn't run both problems
first_runs = first_runs.dropna(subset=["w5p1", "w5p2"])

# Compute time difference in seconds
first_runs["diff_seconds"] = (first_runs["w5p2"] - first_runs["w5p1"])

# Get average and standard deviation
average_diff = first_runs["diff_seconds"].mean()
std_diff = first_runs["diff_seconds"].std()

# Print results
print(f"Average time difference (seconds): {average_diff/3600:.2f}")
print(f"Standard deviation (seconds): {std_diff/3600:.2f}")


         event_name  created_at                           user_id  \
277242  problem_run  1533034393  7f11d5562c25a3b883771942dc12b484   
277312  problem_run  1533034472  7f11d5562c25a3b883771942dc12b484   
546694  problem_run  1533274731  2b38c7bc12fe9895058b6003d6cdf54d   
546707  problem_run  1533274748  2b38c7bc12fe9895058b6003d6cdf54d   
546724  problem_run  1533274765  2b38c7bc12fe9895058b6003d6cdf54d   

                                               event_data condition  \
277242  {"problem_status":null,"url":"https:\/\/grokle...  everyone   
277312  {"problem_status":null,"url":"https:\/\/grokle...  everyone   
546694  {"problem_status":null,"url":"https:\/\/grokle...  everyone   
546707  {"problem_status":null,"url":"https:\/\/grokle...  everyone   
546724  {"problem_status":null,"url":"https:\/\/grokle...  everyone   

                          challenge problem  
277242     challenge-beginners-2018    w5p2  
277312     challenge-beginners-2018    w5p2  
546694  challenge-in