In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

url_control = '/Users/amirrezakamkar/Desktop/Works_in_progress/Ironhack/5_5th_week/Project/2nd_project/data/clean/combined_data_control.txt'
df_control = pd.read_csv(url_control)
url_test = '/Users/amirrezakamkar/Desktop/Works_in_progress/Ironhack/5_5th_week/Project/2nd_project/data/clean/combined_data_test.txt'
df_test = pd.read_csv(url_test)

In [4]:
# Concatenate the DataFrames
merged_df = pd.concat([df_test, df_control], ignore_index=True)

In [5]:
merged_df.tail()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation
317118,9998346,292425655_16607136645,189177304_69869411700_783154,step_2,2017-03-29 15:27:40,50.0,602.0,61.5,F,2.0,149881.38,6.0,9.0,Control
317119,9998346,292425655_16607136645,189177304_69869411700_783154,step_2,2017-03-29 15:26:47,50.0,602.0,61.5,F,2.0,149881.38,6.0,9.0,Control
317120,9998346,292425655_16607136645,189177304_69869411700_783154,step_1,2017-03-29 15:26:02,50.0,602.0,61.5,F,2.0,149881.38,6.0,9.0,Control
317121,9998346,292425655_16607136645,189177304_69869411700_783154,start,2017-03-29 15:25:00,50.0,602.0,61.5,F,2.0,149881.38,6.0,9.0,Control
317122,9998921,960034051_85687824805,378424178_59565976155_8051,start,2017-04-18 11:26:31,15.0,189.0,38.0,F,2.0,30644.1,4.0,7.0,Control


In [6]:
merged_df.isnull().sum()
df = merged_df

In [7]:
# test group completion rate (see test group notebook):
# Total number of visitors: 37122
# Number of visitors who reached the 'confirm' step: 21725
# Completion rate: 58.52%

# control group completion rate (see control group notebook): 
# Total number of visitors: 32181
# Number of visitors who reached the 'confirm' step: 16039
# Completion rate: 49.84%

#Null Hypothesis (H0): There is no difference in completion rates between the two groups.
#Alternative Hypothesis (H1): There is a difference in completion rates between the two groups.

# alpha = 0.05

from statsmodels.stats.proportion import proportions_ztest

# n1= test group
# n2= control group

# Define the data
n1 = 37122
n2 = 32181
x1 = 21725
x2 = 16039

# Perform the two-proportion z-test
count = np.array([x1, x2])
nobs = np.array([n1, n2])

z_stat, p_value = proportions_ztest(count, nobs, alternative='two-sided')

# Output the results
print(f"Z-Statistic: {z_stat:.4f}, P-Value: {p_value}")


Z-Statistic: 22.8935, P-Value: 5.392475122751855e-116


In [8]:
# The P-value is a much smaller than the alpha, so we reject the null hypothesis and conclude that the difference
# in completion rate is statistically significant.

In [9]:
# Average time spent on each step for test and control group.

# Ensure date_time is in datetime format
df['date_time'] = pd.to_datetime(df['date_time'])

# Sort the data for proper time difference calculation
df = df.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'])

# Calculate time spent in seconds for each step
df['time_spent'] = df.groupby(['client_id', 'visitor_id', 'visit_id'])['date_time'].diff().dt.total_seconds()

# Drop NaN values (first step has no previous step for time difference calculation)
df = df.dropna(subset=['time_spent'])

# Separate the data for test and control groups
df_test = df[df['Variation'] == 'Test']
df_control = df[df['Variation'] == 'Control']

# Compute mean time spent for each step in the test group
mean_time_test = df_test.groupby('process_step')['time_spent'].mean().reset_index()
mean_time_test.rename(columns={'time_spent': 'mean_time_spent_test'}, inplace=True)

# Compute mean time spent for each step in the control group
mean_time_control = df_control.groupby('process_step')['time_spent'].mean().reset_index()
mean_time_control.rename(columns={'time_spent': 'mean_time_spent_control'}, inplace=True)

# Merge the two DataFrames for comparison
mean_time_comparison = pd.merge(mean_time_test, mean_time_control, on='process_step', how='outer')

# Display the resulting DataFrame
mean_time_comparison


Unnamed: 0,process_step,mean_time_spent_test,mean_time_spent_control
0,confirm,129.369646,129.959073
1,start,151.698666,174.742134
2,step_1,37.761981,43.186201
3,step_2,48.221799,38.844172
4,step_3,97.610939,94.170056


In [10]:
# Hypothesis test for the time spent on each step in control and test group.

from scipy import stats

# Null hypothesis (H0): The mean time spent by the test group is greater than or equal to the mean time spent by the control group.
# Alternative hypothesis (H1): The mean time spent by the test group is less than the mean time spent by the control group.
# alpha = 0.05

# Step 1: Convert 'date_time' to datetime
df['date_time'] = pd.to_datetime(df['date_time'])

# Step 2: Sort the data for proper time difference calculation
df = df.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'])

# Step 3: Filter for first 'start' and last 'confirm' times
df_start = (
    df[df['process_step'] == 'start']
    .groupby(['client_id', 'visitor_id', 'visit_id'])
    .first()
    .reset_index()
)

df_confirm = (
    df[df['process_step'] == 'confirm']
    .groupby(['client_id', 'visitor_id', 'visit_id'])
    .last()
    .reset_index()
)

# Combine 'start' and 'confirm' for time difference calculation
df_steps = pd.merge(df_start, df_confirm, on=['client_id', 'visitor_id', 'visit_id', 'Variation'], suffixes=('_start', '_confirm'))

# Calculate time spent between 'start' and 'confirm'
df_steps['time_spent_start_to_confirm'] = (df_steps['date_time_confirm'] - df_steps['date_time_start']).dt.total_seconds()

# Step 4: Retain intermediate steps and calculate their time spent
df['time_spent'] = df.groupby(['client_id', 'visitor_id', 'visit_id'])['date_time'].diff().dt.total_seconds()

# Drop NaN values (first step has no previous step for time difference calculation)
df_intermediate = df.dropna(subset=['time_spent'])

# Step 5: Separate data for Test and Control groups for each step
df_test = df_intermediate[df_intermediate['Variation'] == 'Test']
df_control = df_intermediate[df_intermediate['Variation'] == 'Control']

# Initialize a list to store T-test results
t_test_results = []

# Perform T-test for each process step
for step in df['process_step'].unique():
    # Filter time_spent for current step
    test_times = df_test[df_test['process_step'] == step]['time_spent']
    control_times = df_control[df_control['process_step'] == step]['time_spent']
    
    # Perform the Two Sample T-test
    t_stat, p_value = stats.ttest_ind(test_times, control_times, equal_var=False)
    
    # Adjust p-value for one-sided test
    if t_stat < 0:  # If the test statistic is negative, the test group has lower time_spent
        p_value_one_sided = p_value / 2
    else:
        p_value_one_sided = 1 - (p_value / 2)
    
    # Append results to the list
    t_test_results.append({
        'process_step': step,
        't_statistic': t_stat,
        'p_value': round(p_value_one_sided, 3),
        'hypothesis': 'Test group has lower time_spent' if p_value_one_sided < 0.05 else 'No significant difference'
    })

# Step 6: Add the start-to-confirm comparison
test_times_confirm = df_steps[df_steps['Variation'] == 'Test']['time_spent_start_to_confirm']
control_times_confirm = df_steps[df_steps['Variation'] == 'Control']['time_spent_start_to_confirm']

t_stat_confirm, p_value_confirm = stats.ttest_ind(test_times_confirm, control_times_confirm, equal_var=False)

# Adjust p-value for one-sided test
if t_stat_confirm < 0:
    p_value_one_sided_confirm = p_value_confirm / 2
else:
    p_value_one_sided_confirm = 1 - (p_value_confirm / 2)

# Append start-to-confirm results
t_test_results.append({
    'process_step': 'start_to_confirm',
    't_statistic': t_stat_confirm,
    'p_value': round(p_value_one_sided_confirm, 3),
    'hypothesis': 'Test group has lower time_spent' if p_value_one_sided_confirm < 0.05 else 'No significant difference'
})

# Step 7: Convert the list of results into a DataFrame
t_test_results_df = pd.DataFrame(t_test_results)

# Display the results
t_test_results_df


Unnamed: 0,process_step,t_statistic,p_value,hypothesis
0,step_1,-2.53948,0.006,Test group has lower time_spent
1,step_2,13.221018,1.0,No significant difference
2,step_3,3.634967,1.0,No significant difference
3,confirm,-1.214722,0.112,No significant difference
4,start,-6.265336,0.0,Test group has lower time_spent
5,start_to_confirm,8.766045,1.0,No significant difference


In [11]:
# The whole process duration, from start to confirm for those clients who finished the process.

# Ensure date_time is in datetime format
df['date_time'] = pd.to_datetime(df['date_time'])

# Sort the data for proper time calculations
df = df.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'])

# Extract first 'start' and last 'confirm' for each client, visitor, and visit
start_times = (
    df[df['process_step'] == 'start']
    .groupby(['client_id', 'visitor_id', 'visit_id'])['date_time']
    .first()
    .reset_index(name='start_time')
)

confirm_times = (
    df[df['process_step'] == 'confirm']
    .groupby(['client_id', 'visitor_id', 'visit_id'])['date_time']
    .last()
    .reset_index(name='confirm_time')
)

# Merge start and confirm times
process_times = pd.merge(start_times, confirm_times, on=['client_id', 'visitor_id', 'visit_id'], how='inner')

process_times = process_times.dropna()

# Calculate total time spent in seconds
process_times['total_time_spent'] = (process_times['confirm_time'] - process_times['start_time']).dt.total_seconds()

# Merge with the original dataset to get the Variation (Test/Control)
process_times = pd.merge(process_times, df[['client_id', 'visitor_id', 'visit_id', 'Variation']].drop_duplicates(), 
                         on=['client_id', 'visitor_id', 'visit_id'], how='left')

# Calculate the mean time spent for each group
mean_time_per_group = process_times.groupby('Variation')['total_time_spent'].mean().reset_index()

# Rename columns for clarity
mean_time_per_group.columns = ['Variation', 'mean_total_time_spent']

# Display the results
print(mean_time_per_group)


  Variation  mean_total_time_spent
0   Control             346.583275
1      Test             474.758110


In [12]:
# Hypothesis test for the the whole process duration.

from scipy import stats

# Ensure date_time is in datetime format
df['date_time'] = pd.to_datetime(df['date_time'])

# Sort the data for proper time calculations
df = df.sort_values(by=['client_id', 'visitor_id', 'visit_id', 'date_time'])

# Extract first 'start' and last 'confirm' for each client, visitor, and visit
start_times = (
    df[df['process_step'] == 'start']
    .groupby(['client_id', 'visitor_id', 'visit_id'])['date_time']
    .first()
    .reset_index(name='start_time')
)

confirm_times = (
    df[df['process_step'] == 'confirm']
    .groupby(['client_id', 'visitor_id', 'visit_id'])['date_time']
    .last()
    .reset_index(name='confirm_time')
)

# Merge start and confirm times
process_times = pd.merge(start_times, confirm_times, on=['client_id', 'visitor_id', 'visit_id'], how='inner')

# Calculate total time spent in seconds
process_times['total_time_spent'] = (process_times['confirm_time'] - process_times['start_time']).dt.total_seconds()

# Merge with the original dataset to get the Variation (Test/Control)
process_times = pd.merge(process_times, df[['client_id', 'visitor_id', 'visit_id', 'Variation']].drop_duplicates(), 
                         on=['client_id', 'visitor_id', 'visit_id'], how='left')

process_times = process_times.dropna()

# Separate Test and Control groups
test_times = process_times[process_times['Variation'] == 'Test']['total_time_spent']
control_times = process_times[process_times['Variation'] == 'Control']['total_time_spent']

# Perform the Two-Sample T-test (Unequal variance assumed)
t_stat, p_value = stats.ttest_ind(test_times, control_times, equal_var=False)

# Output results
print(f"T-Statistic: {t_stat:.4f}")
print(f"P-Value: {p_value:.4f}")
if p_value < 0.05:
    print("Reject the null hypothesis: There is a significant difference in the total time spent.")
else:
    print("Fail to reject the null hypothesis: No significant difference in the total time spent.")


T-Statistic: 8.7660
P-Value: 0.0000
Reject the null hypothesis: There is a significant difference in the total time spent.


In [21]:
df.shape

(247698, 15)

In [26]:
df.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation,time_spent
176638,555,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0,Test,
176637,555,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0,Test,32.0
176636,555,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0,Test,99.0
176640,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0,Test,20.0
176633,647,66758770_53988066587,40369564_40101682850_311847,step_1,2017-04-12 15:41:35,12.0,151.0,57.5,M,2.0,30525.8,0.0,4.0,Test,


In [28]:
df.isnull().sum()

client_id               0
visitor_id              0
visit_id                0
process_step            0
date_time               0
clnt_tenure_yr          0
clnt_tenure_mnth        0
clnt_age                0
gendr                   0
num_accts               0
bal                     0
calls_6_mnth            0
logons_6_mnth           0
Variation               0
time_spent          58029
dtype: int64

In [30]:
df = df.dropna()

In [32]:
df.isnull().sum()

client_id           0
visitor_id          0
visit_id            0
process_step        0
date_time           0
clnt_tenure_yr      0
clnt_tenure_mnth    0
clnt_age            0
gendr               0
num_accts           0
bal                 0
calls_6_mnth        0
logons_6_mnth       0
Variation           0
time_spent          0
dtype: int64

In [34]:
df.duplicated().sum()

0

In [36]:
df.to_csv('amirreza_complete_dataframe.csv', index=False) 