# Dataset: Customer retention
### Hypothesis: “Customers who received a discount are more likely to return” 
### Conduct a t-test to check significance
### Report findings in plain English 

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

In [None]:
# Read the customer-retention CSV into a DataFrame.
# `path` is relative, so it is resolved against the kernel's working directory.
path = "data.csv"
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Preview the first rows of the loaded frame to sanity-check the parse.
df.head()

Unnamed: 0,custid,retained,created,firstorder,lastorder,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,favday,city
0,6H6T6N,0,28-09-2012,11-08-2013,11-08-2013,29,100.0,3.448276,14.52,0.0,0,0,0,Monday,DEL
1,APCENR,1,19-12-2010,01-04-2011,19-01-2014,95,92.631579,10.526316,83.69,0.181641,1,1,1,Friday,DEL
2,7UP6MS,0,03-10-2010,01-12-2010,06-07-2011,0,0.0,0.0,33.58,0.059908,0,0,0,Wednesday,DEL
3,7ZEW8G,0,22-10-2010,28-03-2011,28-03-2011,0,0.0,0.0,54.96,0.0,0,0,0,Thursday,BOM
4,8V726M,1,27-11-2010,29-11-2010,28-01-2013,30,90.0,13.333333,111.91,0.00885,0,0,0,Monday,BOM


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,retained,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep
count,30801.0,30801.0,30801.0,30801.0,30801.0,30801.0,30801.0,30801.0,30801.0
mean,0.79452,28.138405,25.554838,5.671139,61.873944,0.037729,0.649037,0.095094,0.038895
std,0.404059,16.75138,29.557106,10.561704,41.007588,0.103932,0.477279,0.29335,0.193347
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,16.0,2.040816,0.0,40.02,0.0,0.0,0.0,0.0
50%,1.0,32.0,13.207547,0.0,50.97,0.0,1.0,0.0,0.0
75%,1.0,42.0,40.0,7.142857,74.28,0.040816,1.0,0.0,0.0
max,1.0,291.0,100.0,100.0,2600.14,3.25,1.0,1.0,1.0


In [None]:
# Count missing values per column.
df.isna().sum()

custid        20
retained       0
created       20
firstorder    20
lastorder     20
esent          0
eopenrate      0
eclickrate     0
avgorder       0
ordfreq        0
paperless      0
refill         0
doorstep       0
favday         0
city           0
dtype: int64

In [None]:
# Check column names: display the column Index of the loaded frame.
df.columns

Index(['custid', 'retained', 'created', 'firstorder', 'lastorder', 'esent',
       'eopenrate', 'eclickrate', 'avgorder', 'ordfreq', 'paperless', 'refill',
       'doorstep', 'favday', 'city'],
      dtype='object')

In [None]:
# Overview: print the frame's (rows, columns) shape, then its column labels,
# one per line of output.
for overview_item in (df.shape, df.columns):
    print(overview_item)

(30801, 15)
Index(['custid', 'retained', 'created', 'firstorder', 'lastorder', 'esent',
       'eopenrate', 'eclickrate', 'avgorder', 'ordfreq', 'paperless', 'refill',
       'doorstep', 'favday', 'city'],
      dtype='object')


In [None]:
# Tally how many customers fall under each value of the `retained` flag.
# dropna=False keeps a NaN bucket in the tally if any flag is missing.
retention_counts = (
    df["retained"]
    .value_counts(dropna=False)
)
retention_counts

retained
1    24472
0     6329
Name: count, dtype: int64

In [None]:
# Partition customers by the `retained` flag: 1 -> retained, 0 -> not retained.
# Each group is selected by its own explicit equality mask.
retained = df.loc[df["retained"].eq(1)]
not_retained = df.loc[df["retained"].eq(0)]


In [None]:
# Average order value (`avgorder`) per group, with missing values removed.
# These are the two samples compared in the avgorder t-test.
retained_avg, not_retained_avg = (
    group["avgorder"].dropna() for group in (retained, not_retained)
)

In [None]:
# Print the mean avgorder of the retained group, then of the not-retained group.
print(*(sample.mean() for sample in (retained_avg, not_retained_avg)))

61.957674893756135 61.55018802338442


In [None]:
# Welch's two-sample t-test on avgorder (retained vs. not retained).
# equal_var=False selects the unequal-variance (Welch) form of the test.
# NOTE(review): the notebook's stated hypothesis is about customers who received
# a discount, but df.columns lists no discount column; this test compares
# avgorder across retention groups instead -- confirm this is the intended proxy.
avg_ttest = stats.ttest_ind(
    retained_avg,
    not_retained_avg,
    equal_var=False,
    nan_policy="omit",
)

In [None]:
# Display the Welch t-test result for avgorder (statistic, p-value, df).
# In the recorded output p is about 0.49, i.e. no significant difference in mean avgorder.
avg_ttest

TtestResult(statistic=np.float64(0.6901421078050292), pvalue=np.float64(0.49012150644845764), df=np.float64(9601.352732263727))

In [None]:
# Hand-check of the Welch-Satterthwaite degrees of freedom for the avgorder test.
# The inputs are derived here from the two samples so the cell runs on a fresh
# kernel instead of relying on var1/n1/var2/n2 left over from earlier kernel state
# (which produced nan in the recorded output).
n1, n2 = len(retained_avg), len(not_retained_avg)
var1 = retained_avg.var(ddof=1)  # unbiased sample variance
var2 = not_retained_avg.var(ddof=1)

# Squared standard error contributed by each group.
se1_sq, se2_sq = var1 / n1, var2 / n2
df_welch = (se1_sq + se2_sq) ** 2 / (se1_sq**2 / (n1 - 1) + se2_sq**2 / (n2 - 1))

# The manual value should reproduce the df scipy reports for the same samples.
assert np.isclose(df_welch, avg_ttest.df), (df_welch, avg_ttest.df)

df_welch

  df_welch = ((var1/n1 + var2/n2)**2) / (


np.float64(nan)

In [None]:
# Order frequency (`ordfreq`) per group, with missing values removed.
# These are the two samples compared in the ordfreq t-test.
retained_freq, not_retained_freq = (
    group["ordfreq"].dropna() for group in (retained, not_retained)
)

In [None]:
# Welch's two-sample t-test on ordfreq (retained vs. not retained);
# equal_var=False selects the unequal-variance form of the test.
freq_ttest = stats.ttest_ind(
    retained_freq,
    not_retained_freq,
    equal_var=False,
    nan_policy="omit",
)

In [None]:
# Show both group means of ordfreq alongside the t-test result as one tuple.
(
    retained_freq.mean(),
    not_retained_freq.mean(),
    freq_ttest,
)

(np.float64(0.038298768468576334),
 np.float64(0.0355234908389951),
 TtestResult(statistic=np.float64(1.950582794390669), pvalue=np.float64(0.05113391459158596), df=np.float64(10248.596826812807)))