In [19]:
import pandas as pd
# Load the raw purchase records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='referral.csv')

In [3]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,user_id,date,country,money_spent,is_referral,device_id
0,2,2015-10-03,FR,65,0,EVDCJTZMVMJDG
1,3,2015-10-03,CA,54,0,WUBZFTVKXGQQX
2,6,2015-10-03,FR,35,0,CBAPCJRTFNUJG
3,7,2015-10-03,UK,73,0,PRGXJZAJKMXRH
4,7,2015-10-03,MX,35,0,PRGXJZAJKMXRH


In [5]:
def count_spent(df):
    """Summarise a set of purchase rows (one day's worth) into three indices.

    Returns a Series holding:
      * 'n_purchase'  -- number of rows, i.e. purchases made that day
      * 'total_spent' -- sum of the `money_spent` column
      * 'n_customer'  -- number of distinct `user_id` values
    """
    return pd.Series({
        'n_purchase': len(df),
        'total_spent': df['money_spent'].sum(),
        'n_customer': len(df['user_id'].unique()),
    })

# One row per calendar date, with the three indices produced by count_spent.
grpby_day = df.groupby(by='date').apply(count_spent)

In [10]:
# Number of purchase rows per country, most frequent first.
df['country'].value_counts()

UK    15493
FR    15396
US    15280
IT    11446
DE    11093
ES     9831
CA     9440
MX     8133
CH     1229
Name: country, dtype: int64

In [20]:
# Remove the device_id column. errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns=['device_id'], errors='ignore')

In [21]:
import datetime
import scipy.stats as ss

# Convert the 'date' column to pandas datetimes; daily_statistics compares
# these dates against a datetime launch date.
df['date'] = pd.to_datetime(df['date'])
def daily_statistics(df, referral_start=None):
    """
    Compare daily sales indices before and after the referral program starts.

    1.  Group by day and compute, via `count_spent`, '#purchase',
        'total spent money' and '#customers' for each day.
    2.  Split the daily rows into two groups: days strictly before the start
        date, and days on or after it.
    3.  For each sales index, compute the mean before/after the program,
        their difference (post - pre), and a one-tailed p-value.

    Parameters
    ----------
    df : pandas.DataFrame
        Purchase records with a 'date' column comparable to `referral_start`,
        plus the `money_spent` and `user_id` columns read by `count_spent`.
    referral_start : datetime-like, optional
        First day of the referral program. When omitted, the module-level
        `dt_referral_starts` is used (looked up at call time), so existing
        `daily_statistics(df)` calls keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'total_spent', 'n_purchase', 'n_customer', with columns
        'mean_pre', 'mean_post', 'mean_diff', 'pvalue'.
    """
    if referral_start is None:
        # Backward-compatible fallback to the notebook-level constant.
        referral_start = dt_referral_starts

    grpby_day = df.groupby('date').apply(count_spent)

    grpby_day_before = grpby_day.loc[grpby_day.index < referral_start, :]
    grpby_day_after = grpby_day.loc[grpby_day.index >= referral_start, :]

    colnames = ['total_spent', 'n_purchase', 'n_customer']
    rows = []
    for col in colnames:
        pre_data = grpby_day_before.loc[:, col]
        post_data = grpby_day_after.loc[:, col]
        pre_mean = pre_data.mean()
        post_mean = post_data.mean()

        # Welch's t-test (unequal variances). ttest_ind returns a two-tailed
        # p-value; because the t distribution is symmetric, halving it gives
        # the one-tailed p-value for the alternative that points in the
        # direction of the observed difference. Read it together with the
        # sign of mean_diff: it is NOT the p-value of a fixed "post > pre"
        # hypothesis when mean_diff is negative.
        result = ss.ttest_ind(pre_data, post_data, equal_var=False)
        pvalue = result.pvalue / 2

        rows.append({'mean_pre': pre_mean,
                     'mean_post': post_mean,
                     'mean_diff': post_mean - pre_mean,
                     'pvalue': pvalue})

    # Fix the column order of the report.
    return pd.DataFrame(rows, index=colnames).loc[:, ['mean_pre', 'mean_post', 'mean_diff', 'pvalue']]
# Start date of the referral program; daily_statistics treats days on or
# after this date as the "post" group.
dt_referral_starts = datetime.datetime(year=2015, month=10, day=31)
# Pre/post comparison over the whole data set.
daily_statistics(df)

Unnamed: 0,mean_pre,mean_post,mean_diff,pvalue
total_spent,71657.0,83714.392857,12057.392857,0.135194
n_purchase,1690.75,1785.714286,94.964286,0.348257
n_customer,1384.464286,1686.964286,302.5,0.059545
