In [69]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
import scipy.stats as st
from statsmodels.formula.api import ols

In [71]:
# Control-group visits: one row per visit with its start/confirm timestamps
# and the elapsed process_duration in minutes. The CSV still carries its
# saved index as an "Unnamed: 0" column.
url = '../data/clean/control_group_filtered.csv'
df_control = pd.read_csv(filepath_or_buffer=url)

In [73]:
# Test-group visits, same layout as the control-group file: one row per visit
# with its start/confirm timestamps and the elapsed process_duration in minutes.
url = '../data/clean/test_group_filtered.csv'
df_test = pd.read_csv(filepath_or_buffer=url)

In [75]:
# Preview the control-group frame; pandas truncates the display to the first and last rows.
df_control

Unnamed: 0,visit_id,start_time,confirm_time,process_duration
0,10006594_66157970412_679648,2017-04-13 11:50:18,2017-04-13 11:56:12,5.900000
1,10007589_47780784567_391490,2017-05-18 07:51:32,2017-05-18 08:03:33,12.016667
2,100254180_47139859079_984581,2017-04-05 21:42:02,2017-04-05 21:47:43,5.683333
3,100481857_71511233596_788753,2017-04-25 18:15:48,2017-04-25 18:28:44,12.933333
4,100733473_61604582110_215085,2017-04-17 17:12:44,2017-04-17 17:18:19,5.583333
...,...,...,...,...
4972,999027606_14420282929_137267,2017-04-19 08:43:49,2017-04-19 08:49:14,5.416667
4973,999060107_70703440582_987099,2017-04-05 17:42:09,2017-04-05 17:48:16,6.116667
4974,999358338_38217364343_327998,2017-04-12 17:36:51,2017-04-12 17:42:18,5.450000
4975,999528902_49133507319_516085,2017-03-29 17:59:40,2017-03-29 18:08:13,8.550000


In [77]:
# Preview the test-group frame; pandas truncates the display to the first and last rows.
df_test

Unnamed: 0,visit_id,start_time,confirm_time,process_duration
0,100217156_67053490690_383412,2017-04-12 11:51:12,2017-04-12 12:08:09,16.950000
1,100258507_71262593004_214494,2017-03-29 12:43:41,2017-03-29 12:49:32,5.850000
2,100412222_23957663994_946900,2017-04-05 11:30:51,2017-04-05 11:38:37,7.766667
3,100751264_35071231525_844320,2017-03-29 11:36:35,2017-03-29 11:42:47,6.200000
4,100758381_89232150997_752407,2017-06-08 22:15:34,2017-06-08 22:21:20,5.766667
...,...,...,...,...
4764,999538669_22796867619_900789,2017-04-10 20:48:34,2017-04-10 20:54:09,5.583333
4765,999665451_25261449194_83173,2017-05-12 19:27:32,2017-05-12 19:37:24,9.866667
4766,999817126_86162597254_111325,2017-03-29 11:17:53,2017-03-29 11:23:46,5.883333
4767,999891710_95999857132_598498,2017-04-12 15:03:52,2017-04-12 15:16:21,12.483333


In [79]:
# Keep only the duration column of the control group. Despite the "df" in its
# name this is a Series; it is the control sample for the tests further down.
filtered_df_control = df_control["process_duration"]
filtered_df_control

0        5.900000
1       12.016667
2        5.683333
3       12.933333
4        5.583333
          ...    
4972     5.416667
4973     6.116667
4974     5.450000
4975     8.550000
4976    17.500000
Name: process_duration, Length: 4977, dtype: float64

In [81]:
# Keep only the duration column of the test group. Despite the "df" in its
# name this is a Series; it is the test sample for the tests further down.
filtered_df_test = df_test["process_duration"]
filtered_df_test

0       16.950000
1        5.850000
2        7.766667
3        6.200000
4        5.766667
          ...    
4764     5.583333
4765     9.866667
4766     5.883333
4767    12.483333
4768    12.133333
Name: process_duration, Length: 4769, dtype: float64

# Hypothesis
# H0: mu_process_duration control = mu_process_duration test
# H1: mu_process_duration control != mu_process_duration test

In [84]:
# Welch's two-sample t-test on process durations (control first, test second,
# so a negative statistic means the control mean is the smaller one).
# equal_var=False avoids assuming both groups share the same variance.
t_stat, p_value = st.ttest_ind(
    filtered_df_control,
    filtered_df_test,
    equal_var=False,
)

# Report the raw statistics.
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

# Decide at the conventional 5% significance level.
alpha = 0.05
verdict = (
    "Reject the null hypothesis H0: There is a significant difference."
    if p_value < alpha
    else "Fail to reject the null hypothesis H0: No significant difference."
)
print(verdict)

T-statistic: -6.215889078049386
P-value: 5.319407192817486e-10
Reject the null hypothesis H0: There is a significant difference.


The results of your t-test are as follows:

- **T-statistic: -6.22**
- **P-value: 5.32e-10 (which is 0.000000000532)**

### Interpretation:

1. **T-statistic**:
   - The negative value of the t-statistic (-6.22) indicates that the mean of `df_control['process_duration']` is smaller than the mean of `df_test['process_duration']`. 
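   - The magnitude of the t-statistic (6.22) means the observed difference in means lies about 6.2 standard errors away from zero. With roughly 5,000 visits in each group the standard error is small, so this is strong evidence that the means differ, not evidence that the difference itself is large (the group means are about 8.66 and 9.09 minutes, a gap of roughly 0.43 minutes).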

2. **P-value**:
   - The p-value is extremely small (`5.32e-10`), which is much smaller than typical significance levels (e.g., 0.05, 0.01, or 0.001).
   - A small p-value indicates that the observed difference between the two means is highly unlikely to have occurred by random chance if the null hypothesis were true.

### Conclusion:

- **Reject the null hypothesis (H₀)**: Since the p-value is much smaller than the common significance level (such as 0.05), you reject the null hypothesis that the mean process durations of the control and test groups are equal.
  
- **Alternative hypothesis (H₁)**: There is a statistically significant difference between the process durations of the control group and the test group.

In simple terms, based on this test, you can confidently say that the process durations for the control and test groups are **significantly different**. Given the negative t-statistic, it suggests that the **control group** has a shorter process duration compared to the test group, but the main takeaway is the significant difference between the two means.

In [87]:
# Order the test-group visits from shortest to longest process duration
# (sort_values sorts ascending by default).
# To export the ordered frame, call
# df_test_sorted.to_csv('sorted_test_data.csv', index=False).
df_test_sorted = df_test.sort_values("process_duration")

In [89]:
# Order the control-group visits from shortest to longest process duration
# (sort_values sorts ascending by default).
# To export the ordered frame, call
# df_control_sorted.to_csv('sorted_control_data.csv', index=False).
df_control_sorted = df_control.sort_values("process_duration")

In [91]:
# Mean process duration (minutes) of the test group.
df_test_sorted["process_duration"].mean()

9.090305444887118

In [93]:
# Mean process duration (minutes) of the control group.
df_control_sorted["process_duration"].mean()

8.6580001339495

In [101]:
# One-way ANOVA on the same two samples. With only two groups this is the
# equal-variance counterpart of the t-test above, so it is expected to agree.
# Note: this rebinds p_value, replacing the value stored by the t-test cell.
f_statistic, p_value = st.f_oneway(filtered_df_control, filtered_df_test)

print(
    f"F-statistic: {f_statistic}",
    f"P-value: {p_value}",
    sep="\n",
)

F-statistic: 38.86310892972295
P-value: 4.734350178634918e-10


# Conclusions:
## Based on this ANOVA test, we can confidently state that the process duration in the control group is statistically different from the process duration in the test group.
## Since we rejected the null hypothesis (H₀), we accept the alternative hypothesis (H₁), which states that the means are not equal.

## The results of the ANOVA test indicate a significant difference in the process durations between the control and test groups (F(1, 9744) = 38.86, p < 0.0001; with 2 groups and 4,977 + 4,769 = 9,746 visits, the degrees of freedom are 1 and 9,744). Therefore, we reject the null hypothesis and conclude that the mean process duration for the control group differs from that of the test group.