In [1]:
import pandas as pd
import os

# path to cleaned files
base = r"C:\Users\lenovo\Desktop\CV\vanguard_data\cleaned_data"

# load cleaned datasets
client = pd.read_csv(os.path.join(base, "client_clean.csv"))
experiment = pd.read_csv(os.path.join(base, "experiment_clean.csv"))
web = pd.read_csv(os.path.join(base, "web_clean.csv"))

client.head(), experiment.head(), web.head()


(   client_id  clnt_tenure_yr  clnt_tenure_mnth  clnt_age gendr  num_accts  \
 0     836976               6                73      60.5     U          2   
 1    2304905               7                94      58.0     U          2   
 2    1439522               5                64      32.0     U          2   
 3    1562045              16               198      49.0     M          2   
 4    5126305              12               145      33.0     F          2   
 
          bal  calls_6_mnth  logons_6_mnth  
 0   45105.30             6              9  
 1  110860.30             6              9  
 2   52467.79             6              9  
 3   67454.65             3              6  
 4  103671.75             0              3  ,
    client_id variation
 0    9988021      test
 1    8320017      test
 2    4033851   control
 3    1982004      test
 4    9294070   control,
    client_id            visitor_id                      visit_id  \
 0    9988021  580560515_7732621733  78125505

In [2]:
# merge client + experiment
df = client.merge(experiment, on="client_id", how="inner")

# merge with web logs
df = df.merge(web, on="client_id", how="left")

df.head()


Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,variation,visitor_id,visit_id,process_step,date_time
0,836976,6,73,60.5,U,2,45105.3,6,9,test,427070339_1413275162,228976764_46825473280_96584,,2017-04-02 11:51:13
1,836976,6,73,60.5,U,2,45105.3,6,9,test,427070339_1413275162,228976764_46825473280_96584,,2017-04-02 11:47:50
2,836976,6,73,60.5,U,2,45105.3,6,9,test,427070339_1413275162,228976764_46825473280_96584,,2017-04-02 11:46:45
3,836976,6,73,60.5,U,2,45105.3,6,9,test,427070339_1413275162,228976764_46825473280_96584,,2017-04-02 11:23:08
4,836976,6,73,60.5,U,2,45105.3,6,9,test,427070339_1413275162,228976764_46825473280_96584,,2017-04-02 11:22:24


In [3]:
# find the final process step
final_step = df["process_step"].max()
final_step


np.float64(nan)

In [4]:
df["completed"] = (df["process_step"] == final_step).astype(int)


In [5]:
df.groupby("variation")["completed"].mean()


variation
control    0.0
test       0.0
Name: completed, dtype: float64

In [12]:
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest

# 1) Make sure process_step is numeric
df["process_step"] = pd.to_numeric(df["process_step"], errors="coerce")

# 2) Define the final step (max existing step)
final_step = df["process_step"].max()
print("Final process step detected:", final_step)

# 3) Mark each row as completed (session-level)
df["completed_row"] = (df["process_step"] == final_step).astype(int)

# 4) Aggregate to CLIENT level:
#    a client is completed if ANY of their rows reached the final step
client_level = (
    df.groupby(["client_id", "variation"], as_index=False)
      ["completed_row"].max()
      .rename(columns={"completed_row": "completed"})
)

print("\nClient-level table (head):")
display(client_level.head())

print("\nCompletion by variation (client-level):")
print(client_level.groupby("variation")["completed"].mean())


Final process step detected: nan

Client-level table (head):


Unnamed: 0,client_id,variation,completed
0,555,test,0
1,647,test,0
2,934,test,0
3,1028,control,0
4,1104,control,0



Completion by variation (client-level):
variation
control    0.0
test       0.0
Name: completed, dtype: float64


In [13]:
web.columns


Index(['client_id', 'visitor_id', 'visit_id', 'process_step', 'date_time'], dtype='object')

In [16]:
import os
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest

# === 1. Load experiment_clean from cleaned_data ===
base_clean = r"C:\Users\lenovo\Desktop\CV\vanguard_data\cleaned_data"
exp = pd.read_csv(os.path.join(base_clean, "experiment_clean.csv"))
print("Experiment shape:", exp.shape)

# === 2. Load RAW web data parts from original folder ===
base_raw = r"C:\Users\lenovo\Desktop\CV\vanguard_data"

web1 = pd.read_csv(os.path.join(base_raw, "df_final_web_data_pt_1.csv.txt"))
web2 = pd.read_csv(os.path.join(base_raw, "df_final_web_data_pt_2.csv.txt"))

web = pd.concat([web1, web2], ignore_index=True)
print("Raw web combined shape:", web.shape)
print("Raw web columns:", web.columns.tolist())


Experiment shape: (50500, 2)
Raw web combined shape: (755405, 5)
Raw web columns: ['client_id', 'visitor_id', 'visit_id', 'process_step', 'date_time']


In [17]:
# Keep the original process_step for reference
web["process_step_raw"] = web["process_step"]

# Extract the first number from the process_step string, e.g. "step_1" -> 1
web["process_step_num"] = (
    web["process_step_raw"]
    .astype(str)               # convert to string
    .str.extract(r"(\d+)")[0]  # extract first digits
)

# Convert extracted string to numeric
web["process_step_num"] = pd.to_numeric(web["process_step_num"], errors="coerce")

# Drop rows where we still don't have a valid step
web_valid = web.dropna(subset=["process_step_num"]).copy()
web_valid["process_step_num"] = web_valid["process_step_num"].astype(int)

print("Web_valid shape:", web_valid.shape)
print("Unique numeric steps (first few):", sorted(web_valid["process_step_num"].unique())[:10])


Web_valid shape: (408497, 7)
Unique numeric steps (first few): [np.int64(1), np.int64(2), np.int64(3)]


In [18]:
# Make sure client_id is numeric in both
exp["client_id"] = pd.to_numeric(exp["client_id"], errors="coerce")
web_valid["client_id"] = pd.to_numeric(web_valid["client_id"], errors="coerce")

exp = exp.dropna(subset=["client_id"]).copy()
web_valid = web_valid.dropna(subset=["client_id"]).copy()

exp["client_id"] = exp["client_id"].astype("Int64")
web_valid["client_id"] = web_valid["client_id"].astype("Int64")

# Standardize variation column
if "Variation" in exp.columns and "variation" not in exp.columns:
    exp = exp.rename(columns={"Variation": "variation"})

exp["variation"] = (
    exp["variation"]
    .astype(str)
    .str.strip()
    .str.lower()
)

# Keep only control/test rows
exp = exp[exp["variation"].isin(["control", "test"])].copy()

print("Variation distribution after cleaning:")
print(exp["variation"].value_counts(dropna=False))

# Merge web_valid + experiment on client_id
ab = web_valid.merge(
    exp[["client_id", "variation"]].drop_duplicates(),
    on="client_id",
    how="inner"
)

print("\nAB merged shape:", ab.shape)
if ab.empty:
    print("AB is empty after merge. Still no overlap between experiment clients and web logs.")
else:
    print("Variation distribution in merged AB:")
    print(ab["variation"].value_counts(dropna=False))


Variation distribution after cleaning:
variation
test       26968
control    23532
Name: count, dtype: int64

AB merged shape: (174000, 8)
Variation distribution in merged AB:
variation
test       95651
control    78349
Name: count, dtype: int64


In [19]:
from statsmodels.stats.proportion import proportions_ztest

# 1 — Determine the final step users can reach
final_step = ab["process_step_num"].max()
print("Final numeric process step:", final_step)

# 2 — For each client, record the maximum step they reached
client_level = (
    ab.groupby(["client_id", "variation"], as_index=False)["process_step_num"]
      .max()
      .rename(columns={"process_step_num": "max_step"})
)

# 3 — Mark completion (1 = completed the funnel)
client_level["completed"] = (client_level["max_step"] >= final_step).astype(int)

print("\nClient-level head:")
display(client_level.head())

# 4 — Compute completion rate per variant
print("\nCompletion rate by variation:")
completion_rates = client_level.groupby("variation")["completed"].mean()
print(completion_rates)

# 5 — Prepare summary stats for z-test
summary = client_level.groupby("variation")["completed"].agg(["sum", "count"])
print("\nSummary used for z-test:")
print(summary)

# 6 — Only run test if we have both control and test
if summary.shape[0] != 2:
    print("\nError: need both control and test groups for z-test.")
else:
    completed = summary["sum"].values
    totals = summary["count"].values

    print("\nCompleted array:", completed)
    print("Totals array:", totals)

    # One-sided test: Test > Control
    z_stat, p_value = proportions_ztest(
        count=completed,
        nobs=totals,
        alternative="larger"
    )

    print("\nZ-score:", z_stat)
    print("p-value:", p_value)


Final numeric process step: 3

Client-level head:


Unnamed: 0,client_id,variation,max_step,completed
0,555,test,3,1
1,647,test,3,1
2,1028,control,3,1
3,1186,control,2,0
4,1195,control,3,1



Completion rate by variation:
variation
control    0.862390
test       0.859018
Name: completed, dtype: float64

Summary used for z-test:
             sum  count
variation              
control    17422  20202
test       20881  24308

Completed array: [17422 20881]
Totals array: [20202 24308]

Z-score: 1.02250098722172
p-value: 0.15327192282029733
