In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

url = '../../data/clean/combined_data_test.txt'
df = pd.read_csv(url)

In [2]:
df.head(10)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation
0,9999832,145538019_54444341400,472154369_16714624241_585315,step_1,2017-05-16 16:46:11,23.0,281.0,49.0,F,2.0,431887.61,1.0,4.0,Test
1,9999832,145538019_54444341400,472154369_16714624241_585315,start,2017-05-16 16:46:03,23.0,281.0,49.0,F,2.0,431887.61,1.0,4.0,Test
2,9999729,604429154_69247391147,99583652_41711450505_426179,step_1,2017-04-05 13:41:04,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0,Test
3,9999729,604429154_69247391147,99583652_41711450505_426179,start,2017-04-05 13:40:49,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0,Test
4,9999729,834634258_21862004160,870243567_56915814033_814203,step_3,2017-05-08 16:09:19,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0,Test
5,9999729,834634258_21862004160,870243567_56915814033_814203,step_2,2017-05-08 16:08:40,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0,Test
6,9999729,834634258_21862004160,870243567_56915814033_814203,step_1,2017-05-08 16:08:30,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0,Test
7,9999729,834634258_21862004160,870243567_56915814033_814203,start,2017-05-08 16:08:25,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0,Test
8,9999729,834634258_21862004160,870243567_56915814033_814203,confirm,2017-05-08 16:09:40,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0,Test
9,9999729,843385170_36953471821,493310979_9209676464_421146,step_2,2017-04-20 14:27:36,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0,Test


In [3]:
# Error rate
# Ensure date_time is in datetime format
df['date_time'] = pd.to_datetime(df['date_time'])

# Sort by client_id, visit_id, and date_time to ensure chronological order
df = df.sort_values(by=['client_id', 'visit_id', 'date_time'])

# Define the process steps in the correct order
process_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']

# Create a new column that shows the next step
df['next_step'] = df.groupby(['client_id', 'visit_id'])['process_step'].shift(-1)

# Identify step-back errors
df['is_step_back'] = df.apply(
    lambda row: process_order.index(row['next_step']) < process_order.index(row['process_step']) 
    if pd.notnull(row['next_step']) else False, axis=1
)

# Calculate total steps and total errors
total_steps = len(df)  # Total number of rows (steps)
total_step_back_errors = df['is_step_back'].sum()  # Sum of step-back errors

# Calculate the overall error rate
total_error_rate = total_step_back_errors / total_steps * 100

print(f"Total number of steps: {total_steps}")
print(f"Total number of step-back errors: {total_step_back_errors}")
print(f"Total error rate: {total_error_rate:.6f}%")



Total number of steps: 176641
Total number of step-back errors: 16248
Total error rate: 9.198317%


In [11]:
# Completion rate
# Total number of visitors (unique visit_id)
total_visitors = df['visit_id'].nunique()

# Number of visitors who reached the 'confirm' step
visitors_with_confirm = df[df['process_step'] == 'confirm']['visit_id'].nunique()

# Calculate the completion rate
completion_rate = (visitors_with_confirm / total_visitors) * 100

print(f"Total number of visitors: {total_visitors}")
print(f"Number of visitors who reached the 'confirm' step: {visitors_with_confirm}")
print(f"Completion rate: {completion_rate:.2f}%")


58.52324766984538

Total number of visitors: 37122
Number of visitors who reached the 'confirm' step: 21725
Completion rate: 58.52%


In [79]:
test_average_age = df['clnt_age'].mean()
print ("The average age of the control group is", test_average_age)

The average age of the control group is 48.72221624651129


In [83]:
test_average_history = df['clnt_tenure_yr'].mean()
print ("The average history of the control group is", test_average_history, "years")

The average history of the control group is 12.182024558284883 years


In [11]:
# Average time spent on each step
# Convert 'date_time' to datetime
df['date_time'] = pd.to_datetime(df['date_time'])

# Sort the data for proper time difference calculation
df = df.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'])

# Calculate time spent in seconds for each step
df['time_spent'] = df.groupby(['client_id', 'visitor_id', 'visit_id'])['date_time'].diff().dt.total_seconds()

# Drop NaN values (first step has no previous step for time difference calculation)
df = df.dropna(subset=['time_spent'])
# Group by 'process_step' and calculate the mean time spent
mean_time_per_step = df.groupby('process_step')['time_spent'].mean().reset_index()

# Rename columns for clarity
mean_time_per_step.columns = ['process_step', 'mean_time_spent']
mean_time_per_step


Unnamed: 0,process_step,mean_time_spent
0,confirm,126.915365
1,start,154.803267
2,step_1,52.378948
3,step_2,48.291934
4,step_3,97.528864
