# Week 1 Seminar

A/B experiment analysis

In [8]:
import glob
from collections import namedtuple

import pandas as pd
import numpy as np
import scipy.stats as ss

import matplotlib.pyplot as plt

pd.set_option("display.precision", 3)

%matplotlib inline

In [9]:
experiment = "MVP"

In [10]:
data = pd.concat([
    pd.read_json(data_path, lines=True)
    for data_path
    in glob.glob("5000_mvp_3/*/data.json")
])

data["treatment"] = data["experiments"].map(lambda experiments: experiments[experiment])

In [11]:
data

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,treatment
0,next,2025-05-18 23:45:19.526,3255,40471,1.00,2.212e-03,41042.0,{'MVP': 'C'},C
1,next,2025-05-18 23:45:19.545,3255,40466,0.64,1.757e-03,40464.0,{'MVP': 'C'},C
2,next,2025-05-18 23:45:19.558,3255,40466,0.00,1.741e-03,40464.0,{'MVP': 'C'},C
3,next,2025-05-18 23:45:19.572,3255,40469,0.26,1.534e-03,40468.0,{'MVP': 'C'},C
4,last,2025-05-18 23:45:19.583,3255,40464,0.00,3.529e-04,,{'MVP': 'C'},C
...,...,...,...,...,...,...,...,...,...
22662,next,2025-05-18 23:53:52.405,5334,44575,0.51,5.494e-03,34604.0,{'MVP': 'T1'},T1
22663,next,2025-05-18 23:53:52.423,5334,40644,0.91,4.491e-03,32800.0,{'MVP': 'T1'},T1
22664,next,2025-05-18 23:53:52.438,9969,3393,1.00,2.431e-03,3393.0,{'MVP': 'C'},C
22665,next,2025-05-18 23:53:52.450,9969,3393,0.00,1.529e-03,3393.0,{'MVP': 'C'},C


In [12]:
data.groupby("treatment").count()

Unnamed: 0_level_0,message,timestamp,user,track,time,latency,recommendation,experiments
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C,20106,20106,20106,20106,20106,20106,17581,20106
T1,25229,25229,25229,25229,25229,25229,22754,25229


## Visualize sessions

In [13]:
Session = namedtuple("Session", ["timestamp", "tracks", "time", "latency"])

def sessionize(user_data):
    sessions = []
    session = None
    for _, row in user_data.sort_values("timestamp").iterrows():
        if session is None:
            session = Session(row["timestamp"], 0, 0, 0)
        
        session = session._replace(
            tracks = session.tracks + 1, 
            time = session.time + row["time"],
            latency = session.latency + row["latency"] * 1000,
        )
        
        if row["message"] == "last":
            sessions.append(session._asdict())
            session = None
    return sessions

In [14]:
sessions = (
    data
    .groupby(["user", "treatment"])
    .apply(sessionize)
    .explode()
    .apply(pd.Series)
)

  .apply(sessionize)


In [15]:
figure, ax = plt.subplots(figsize=(15, 5))
for treatment, treatment_sessions in sessions.groupby("treatment"):
    reindexed_treatment_sessions = treatment_sessions.sort_values("timestamp").reset_index()
    ax.plot(reindexed_treatment_sessions.index, reindexed_treatment_sessions["tracks"], label=treatment)
ax.legend()
pass

In [16]:
figure, ax = plt.subplots(figsize=(15, 5))
for treatment, treatment_sessions in sessions.groupby("treatment"):
    reindexed_treatment_sessions = treatment_sessions.sort_values("timestamp").reset_index()
    ax.plot(reindexed_treatment_sessions.index, reindexed_treatment_sessions["time"], label=treatment)
ax.legend()
pass

## Analyze the experiment

In [17]:
user_level_data = (
  sessions
    .reset_index()
    .groupby(["user", "treatment"])
    .agg(
        {
            "timestamp": "count",
            "tracks": "sum",
            "time": "sum",
            "latency": "sum"
        }
    )
)

user_level_data["sessions"] = user_level_data["timestamp"]
user_level_data["mean_request_latency"] = user_level_data["latency"] / user_level_data["tracks"]
user_level_data["mean_tracks_per_session"] = user_level_data["tracks"] / user_level_data["sessions"]
user_level_data["mean_time_per_session"] = user_level_data["time"] / user_level_data["sessions"]

metrics = [
    "time", 
    "sessions", 
    "mean_request_latency", 
    "mean_tracks_per_session", 
    "mean_time_per_session"
]

user_level_metrics = user_level_data[metrics].copy().reset_index()

In [18]:
treatment_level_metrics = (
    user_level_metrics
    .groupby("treatment")[metrics]
    .agg(["count", "mean", "var"])
)

In [19]:
def dof(n_0, n_1, s2_0, s2_1):
    numerator = (s2_0 / n_0 + s2_1 / n_1) * (s2_0 / n_0 + s2_1 / n_1)
    denominator = s2_0 * s2_0 / n_0 / n_0 / (n_0 - 1) + s2_1 * s2_1 / n_1 / n_1 / (n_1 - 1)
    return numerator / denominator


def ci(n_0, n_1, s2_0, s2_1, alpha=0.05):
    return ss.t.ppf(1 - alpha/2, dof(n_0, n_1, s2_0, s2_1)) * np.sqrt(s2_0 / n_0 + s2_1 / n_1)


effects = []

control = [data for treatment, data in treatment_level_metrics.iterrows() if treatment == "C"][0]

for treatment, row in treatment_level_metrics.iterrows():
    if treatment == "C":
        continue
    
    for metric in metrics:
        control_mean = control[metric]["mean"]
        treatment_mean = row[metric]["mean"]
        
        effect = treatment_mean - control_mean
        conf_int = ci(
            control[metric]["count"],
            row[metric]["count"],
            control[metric]["var"],
            row[metric]["var"],
        )
        effects.append({
            "treatment": treatment,
            "metric": metric,
            "control_mean": control_mean,
            "treatment_mean": treatment_mean,
            "effect": effect / control_mean * 100,
            "lower": (effect - conf_int) / control_mean * 100,
            "upper": (effect + conf_int) / control_mean * 100,
            "significant": (effect + conf_int) * (effect - conf_int) > 0
        })

In [20]:
def color(value):
    return 'color:red;' if value < 0 else 'color:green;'

def background(value):
    return 'color:white;background-color:green' if value else 'color:white;background-color:red'
        

(
    pd.DataFrame(effects)[[
        "treatment", 
        "metric",
        "effect", 
        "upper", 
        "lower", 
        "control_mean", 
        "treatment_mean",
        "significant"
    ]]
    .sort_values(["metric", "treatment"], ascending=False)
    .style
    .applymap(color, subset=["effect", "upper", "lower"])
    .applymap(background, subset=["significant"])
)

  .applymap(color, subset=["effect", "upper", "lower"])
  .applymap(background, subset=["significant"])


Unnamed: 0,treatment,metric,effect,upper,lower,control_mean,treatment_mean,significant
0,T1,time,76.150008,84.19059,68.109426,3.792219,6.679995,True
1,T1,sessions,0.718529,3.46447,-2.027412,1.276542,1.285714,False
3,T1,mean_tracks_per_session,27.626686,30.376759,24.876614,7.975649,10.179056,True
4,T1,mean_time_per_session,73.89222,80.765236,67.019203,2.97971,5.181483,True
2,T1,mean_request_latency,121.62597,123.169441,120.082499,1.593449,3.531496,True
