In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

url = '../../data/clean/combined_data_control.txt'
df = pd.read_csv(url)

In [6]:
df.shape

(140482, 14)

In [12]:
# Error rate
# Ensure date_time is in datetime format
df['date_time'] = pd.to_datetime(df['date_time'])

# Sort by client_id, visit_id, and date_time to ensure chronological order
df = df.sort_values(by=['client_id', 'visit_id', 'date_time'])

# Define the process steps in the correct order
process_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']

# Create a new column that shows the next step
df['next_step'] = df.groupby(['client_id', 'visit_id'])['process_step'].shift(-1)

# Identify step-back errors
df['is_step_back'] = df.apply(
    lambda row: process_order.index(row['next_step']) < process_order.index(row['process_step']) 
    if pd.notnull(row['next_step']) else False, axis=1
)

# Calculate total steps and total errors
total_steps = len(df)  # Total number of rows (steps)
total_step_back_errors = df['is_step_back'].sum()  # Sum of step-back errors

# Calculate the overall error rate
total_error_rate = total_step_back_errors / total_steps * 100

print(f"Total number of steps: {total_steps}")
print(f"Total number of step-back errors: {total_step_back_errors}")
print(f"Total error rate: {total_error_rate:.6f}%")


Total number of steps: 140482
Total number of step-back errors: 9576
Total error rate: 6.816532%


In [10]:
# Colmpletion rate
# Total number of visitors (unique visit_id)
total_visitors = df['visit_id'].nunique()

# Number of visitors who reached the 'confirm' step
visitors_with_confirm = df[df['process_step'] == 'confirm']['visit_id'].nunique()

# Calculate the completion rate
completion_rate = (visitors_with_confirm / total_visitors) * 100

print(f"Total number of visitors: {total_visitors}")
print(f"Number of visitors who reached the 'confirm' step: {visitors_with_confirm}")
print(f"Completion rate: {completion_rate:.2f}%")


Total number of visitors: 32181
Number of visitors who reached the 'confirm' step: 16039
Completion rate: 49.84%


In [18]:
control_average_age = df['clnt_age'].mean()
print ("The average age of the control group is", control_average_age)

The average age of the control group is 48.284477726683846


In [20]:
test_average_history = df['clnt_tenure_yr'].mean()
print ("The average history of the control group is", test_average_history, "years")

The average history of the control group is 12.16779373869962 years


In [7]:
# Average time spent on each step
# Convert 'date_time' to datetime
df['date_time'] = pd.to_datetime(df['date_time'])

# Sort the data for proper time difference calculation
df = df.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'])

# Calculate time spent in seconds for each step
df['time_spent'] = df.groupby(['client_id', 'visitor_id', 'visit_id'])['date_time'].diff().dt.total_seconds()

# Drop NaN values (first step has no previous step for time difference calculation)
df = df.dropna(subset=['time_spent'])
# Group by 'process_step' and calculate the mean time spent
mean_time_per_step = df.groupby('process_step')['time_spent'].mean().reset_index()

# Rename columns for clarity
mean_time_per_step.columns = ['process_step', 'mean_time_spent']
mean_time_per_step


Unnamed: 0,process_step,mean_time_spent
0,confirm,129.598746
1,start,196.201373
2,step_1,57.414272
3,step_2,38.888074
4,step_3,93.57867
