In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st

In [3]:
df_merged = pd.read_csv("../../data/clean/merged_final_demo_final_experiment_clients_df.csv")
df_merged

Unnamed: 0,client_id,client_tenure_years,client_tenure_months,client_age,gender,num_accounts,balance,calls_last_6_months,logons_last_6_months,variation
0,836976,6,73,60,Unspecified,2,45105.30,6,9,Test
1,2304905,7,94,58,Unspecified,2,110860.30,6,9,Control
2,1439522,5,64,32,Unspecified,2,52467.79,6,9,Test
3,1562045,16,198,49,Male,2,67454.65,3,6,Test
4,5126305,12,145,33,Female,2,103671.75,0,3,Control
...,...,...,...,...,...,...,...,...,...,...
70589,7993686,4,56,38,Unspecified,3,1411062.68,5,5,Unknown
70590,8981690,12,148,31,Male,2,101867.07,6,6,Unknown
70591,333913,16,198,61,Female,2,40745.00,3,3,Unknown
70592,1573142,21,255,68,Male,3,475114.69,4,4,Unknown


In [5]:
df_combined = pd.read_csv("../../data/raw/combined_cleaned_data.csv")
df_combined

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04
...,...,...,...,...,...
744636,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:46:10
744637,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:45:29
744638,9668240,388766751_9038881013,922267647_3096648104_968866,step_1,2017-05-24 18:44:51
744639,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:44:34


Hypothesis

Null Hypothesis (H0​): The average client tenure of those engaging with the new process is equal to the average client tenure of those engaging with the old process

(μnew_process = μold_process)

Alternative Hypothesis (H1​): The average client tenure of those engaging with the new process is not equal to the average client tenure of those engaging with the old process.

(μnew_proces s≠ μold_process)

In [7]:
df_merged.groupby('variation')['client_tenure_years'].mean()

variation
Control    12.087860
Test       11.982901
Unknown    12.106232
Name: client_tenure_years, dtype: float64

In [9]:
df_merged.groupby('variation')['client_tenure_years'].std(ddof=1)

variation
Control    6.877626
Test       6.844947
Unknown    6.900585
Name: client_tenure_years, dtype: float64

$$H0: mean_test = mean_control$$
$$H1: mean_test != mean_control$$

$$H0: mean_test - mean_control = 0$$
$$H1: mean_test - mean_control \ne 0$$

In [11]:
# Definir promedios
mean_control = df_merged.groupby('variation')['client_tenure_years'].mean()['Control']
mean_test = df_merged.groupby('variation')['client_tenure_years'].mean()['Test']

# Definir desviaciones estándar
std_control = df_merged.groupby('variation')['client_tenure_years'].std(ddof=1)['Control']
std_test = df_merged.groupby('variation')['client_tenure_years'].std(ddof=1)['Test']

# Definir tamaños de muestra
n_control = df_merged[df_merged['variation'] == 'Control'].shape[0]
n_test = df_merged[df_merged['variation'] == 'Test'].shape[0]

# Imprimir resultados para verificar
print("Mean (Control):", mean_control)
print("Mean (Test):", mean_test)
print("Std (Control):", std_control)
print("Std (Test):", std_test)
print("N (Control):", n_control)
print("N (Test):", n_test)

Mean (Control): 12.087860239734761
Mean (Test): 11.98290122769927
Std (Control): 6.87762591840498
Std (Test): 6.844947446400878
N (Control): 23526
N (Test): 26961


In [15]:
diff = (mean_test - mean_control) 

sp2 = (n_test - 1) * (std_test** 2) + (n_control - 1) * (std_control** 2) / (n_test + n_control - 2 ) 

d = np.sqrt( (sp2 / n_test) + (sp2 / n_control) )

stat = diff / d

alpha = 0.05

# Con estaditico
lower_critical = st.t.ppf(0.05/2, n_test + n_control - 2)
upper_critical = st.t.ppf(1-(0.05/2), n_test + n_control - 2)

if ( stat > lower_critical and stat < upper_critical):
    print("We accept H0")
else:
    print("We reject H0")

# Con p_value#
#left_statistic_area = st.t.cdf(stat, n_test + n_control - 2)
#right_statistic_area = 1 - st.t.cdf(stat, n_test + n_control - 2)

We accept H0


Hipothesis conclusion:
- Hypotesis (H0)  sustained
- Hypothesis (H1) not sustained

In [5]:
# Data for the bar chart
groups = ['Control', 'Test', 'Unknown']
means = [12.087860, 11.982901, 12.106232]
std_devs = [6.877626, 6.844947, 6.900585]

# Create the bar chart
plt.figure(figsize=(8, 6))
plt.bar(groups, means, yerr=std_devs, capsize=5, color=['blue', 'orange', 'green'], alpha=0.7)

# Add labels and title
plt.xlabel('Groups', fontsize=12)
plt.ylabel('Average Client Tenure (Years)', fontsize=12)
plt.title('Average Client Tenure by Group with Error Bars', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# Show the chart
plt.tight_layout()
plt.show()

NameError: name 'plt' is not defined