In [None]:
import pandas as pd
# Load the combined, cleaned dataset.
# `date_time` is parsed into real datetimes while reading, so that any later
# ordering on this column is chronological instead of relying on the
# lexicographic order of the raw timestamp strings.
df = pd.read_csv(
    '../../data/raw/combined_cleaned_data.csv',
    parse_dates=['date_time'],
)
df

In [None]:
# Drop the visitor/visit identifier columns.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['visitor_id', 'visit_id'], errors='ignore')

# Encode the process steps as integers so their order can be compared
# numerically.
replacement_dict_steps = {
    'start' : 0,
    'step_1' : 1,
    'step_2' : 2,
    'step_3' : 3,
    'confirm' : 4
}

# Only encode while the column still holds the textual labels. Mapping an
# already-encoded column would turn every value into NaN, because the integer
# codes are not keys of the dictionary. Labels missing from the dictionary
# are mapped to NaN.
if not pd.api.types.is_numeric_dtype(df['process_step']):
    df['process_step'] = df['process_step'].map(replacement_dict_steps)

In [None]:
# Order the rows per client, and within each client by timestamp, so that
# consecutive rows of one client follow each other.
df = df.sort_values(['client_id', 'date_time'])

# Preview the first 50 ordered rows.
df.head(50)

In [None]:
# Flag every row whose step number is lower than the previous row of the same
# client; such a backward move is treated as an error. The first row of each
# client has no predecessor (diff is NaN) and is therefore never flagged.
# NOTE(review): rows are compared in their current order within each client,
# presumably already sorted by date_time; a client starting the process again
# later would also register as a backward step. Confirm this is intended.
step_delta = df.groupby('client_id')['process_step'].diff()
df['backward_step'] = step_delta < 0

# Keep only the flagged rows; both per-client figures are derived from them.
error_rows = df[df['backward_step']]

# How many distinct clients went backwards at least once.
clients_with_errors = error_rows['client_id'].nunique()

# How many backward steps each of those clients made.
errors_per_client = error_rows.groupby('client_id').size()

# Overall number of backward steps (True counts as 1).
total_errors = df['backward_step'].sum()

# Report the three figures.
print(f'Number of clients with at least one error: {clients_with_errors}')
print('Errors per client:')
print(errors_per_client)
print(f'Total number of errors across all clients: {total_errors}')

In [None]:
# Build a two-column table (client_id, number_of_errors) from the per-client
# error counts.
results_df = (
    errors_per_client
    .rename_axis('client_id')
    .reset_index(name='number_of_errors')
)

# Write the table to a CSV file in the current working directory, without the
# row index.
results_df.to_csv('errors_per_client.csv', index=False)

# Confirmation message.
print('El archivo "errors_per_client.csv" se ha guardado correctamente.')