In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats

In [None]:
URL_BASE = 'https://raw.githubusercontent.com/ab-courses/simulator-ab-datasets/main/2022-04-01/'
def read_database(file_name):
    """Download one CSV file of the dataset and return it as a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the CSV file located directly under ``URL_BASE``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from the remote file.
    """
    # A URL is not a file-system path: os.path.join uses the OS-specific
    # separator rules, so the address is assembled with plain string handling.
    url = f"{URL_BASE.rstrip('/')}/{file_name}"
    return pd.read_csv(url)


In [None]:
# Load the sales and web-log tables and parse their `date` column into datetimes.
df_sales = (
    read_database('2022-04-01T12_df_sales.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
df_web_logs = (
    read_database('2022-04-01T12_df_web_logs.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [None]:
df_sales.info()


In [None]:
df_sales.head()

In [None]:
df_sales.describe()

In [None]:
df_web_logs.head()

In [None]:
df_web_logs.info()

In [None]:
df_web_logs.describe()

### Merging tables

How often does a user visit the site and not buy anything that day?

In [None]:
# Add a calendar-day column (time of day dropped) to both tables.
for frame in (df_web_logs, df_sales):
    frame['day'] = frame['date'].dt.date

In [None]:
df_web_logs.head(2)

In [None]:
# Per (user, day): number of logged pages and total purchase amount.
user_day_keys = ['user_id', 'day']
user_day_web = df_web_logs.groupby(user_day_keys)[['page']].count()
user_day_sale = df_sales.groupby(user_day_keys)[['price']].sum()
user_day_web.head(2)

In [None]:
user_day_sale.head(2)

In [None]:
# Outer join on the shared (user_id, day) index keeps user-days present in either table.
df2 = user_day_web.merge(
    user_day_sale,
    left_index=True,
    right_index=True,
    how='outer',
)

In [None]:
df2.head()

In [None]:
df2.isnull().mean()
# Share of missing values per column. Rows of df2 are (user, day) pairs, not customers:
# a missing `price` marks a user-day with a site visit but no purchase
# (reported as about 33% in the original run).

### Tasks

In [None]:
# 40% of orders have a value greater than 1000 each. Suppose a new order costs more than 1000 with probability 0.4.
# Estimate the probability that out of 100 new orders 50 or more cost more than 1000.
# A dedicated seeded generator makes the Monte Carlo estimate reproducible on re-run
# without touching NumPy's global random state.
rng = np.random.default_rng(42)
values = rng.binomial(100, 0.4, size=1_000_000)
answer = (values >= 50).mean()
print('answer', round(answer, 3))

# Exact tail probability P(X >= 50) for X ~ Binomial(100, 0.4), used as a cross-check:
# sf(49) = P(X > 49) = P(X >= 50).
exact_answer = stats.binom.sf(49, 100, 0.4)
print('exact', round(exact_answer, 3))


In [None]:
plt.hist(values)

#### Identify the top 3 products by total revenue.

In [None]:

# `URL_BASE` and `read_database` are defined once in the setup cell at the top of the
# notebook; they are deliberately not redefined here to keep a single definition.
df_sales_detail = read_database('2022-04-01T12_df_sales_detail.csv')
df_sales_detail['date'] = pd.to_datetime(df_sales_detail['date'])



In [None]:
df_sales_detail.head(2)

In [None]:
# Total revenue per good, highest first; keep the names of the three best sellers.
revenue_by_good = df_sales_detail.groupby('good')['price'].sum()
ranked_revenue = revenue_by_good.sort_values(ascending=False)
top_revenue = ranked_revenue.head(3).index.tolist()
print("top 3 goods by revenue",top_revenue)

### Estimate the average time that passes from a user entering the site to making a purchase. 
We will assume that a site visit is related to a purchase if it occurred no earlier than two hours before that purchase.

In [None]:
from datetime import timedelta

In [None]:
# Pair every purchase of a user with every logged visit of the same user.
sale_times = df_sales[['user_id', 'date']].rename(columns={'date': 'date_sale'})
visit_times = df_web_logs[['user_id', 'date']].rename(columns={'date': 'date_enter'})
df3 = sale_times.merge(visit_times, on='user_id', how='inner')

In [None]:
df3.head()

In [None]:
# Keep only visits made within the two hours before the purchase
# (window start inclusive, purchase time itself exclusive).
window_start = df3['date_sale'] - timedelta(hours=2)
entered_in_window = df3['date_enter'] >= window_start
entered_before_sale = df3['date_enter'] < df3['date_sale']
df3 = df3[entered_in_window & entered_before_sale].copy()

In [None]:
# Earliest qualifying visit for each (user, purchase time) pair.
df3_first_visit = (
    df3.groupby(['user_id', 'date_sale'])[['date_enter']]
    .min()
    .reset_index()
)
# Gap between that visit and the purchase, in seconds.
df3_first_visit['delta'] = (
    df3_first_visit['date_sale'].sub(df3_first_visit['date_enter']).dt.total_seconds()
)

# Mean gap converted from seconds to whole minutes.
mean_delta_seconds = df3_first_visit['delta'].mean()
answer = round(mean_delta_seconds / 60)
print('average time that passes from a user entering the site to making a purchase:',answer, 'min')

In [None]:
### function to get data 
def get_data_subset(df, begin_date, end_date, user_ids=None, columns=None):
    """Return a filtered copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``date`` column (and a ``user_id`` column when
        ``user_ids`` is given).
    begin_date, end_date : datetime-like or None
        Half-open interval ``[begin_date, end_date)`` applied to ``df['date']``;
        ``None`` disables the corresponding bound.
    user_ids : list-like or None
        Keep only rows whose ``user_id`` is in this collection. ``None`` or an
        empty collection disables the filter.
    columns : list-like or None
        Columns to keep. ``None`` or an empty collection keeps all columns.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows and columns.
    """
    if begin_date is not None:
        df = df[df['date'] >= begin_date]
    if end_date is not None:
        df = df[df['date'] < end_date]
    # Explicit None/length checks instead of truthiness: `if user_ids:` raises
    # ValueError for numpy arrays, Series and Index objects.
    if user_ids is not None and len(user_ids) > 0:
        df = df[df['user_id'].isin(user_ids)]
    if columns is not None and len(columns) > 0:
        df = df[list(columns)]
    return df.copy()

In [None]:
from datetime import datetime

In [None]:
df_example=get_data_subset(df_sales,datetime(2022,3,1),datetime(2022,4,23))
df_example.head()

In [None]:
def get_response_time(df_web_logs, begin_date, end_date):
    """Return the page load time of every log record in [begin_date, end_date).

    The result has two columns: ``user_id`` and ``metric`` (the renamed
    ``load_time`` column).
    """
    logs = get_data_subset(df_web_logs, begin_date, end_date, None, ['user_id', 'load_time'])
    logs = logs.rename(columns={'load_time': 'metric'})
    return logs[['user_id', 'metric']]

    
df_example2=get_response_time(df_web_logs,datetime(2022,3,1),datetime(2022,4,23))
df_example2.head()

In [None]:
def get_revenue_web(df_sales, df_web_logs, begin_date, end_date):
    """Return the revenue per user for users who appear in the web logs of the period.

    Users are taken from ``df_web_logs`` within [begin_date, end_date); their
    ``price`` values from ``df_sales`` in the same period are summed into
    ``metric``. Users without sales get 0.
    """
    visitors = get_data_subset(df_web_logs, begin_date, end_date, None, ['user_id'])
    visitors = visitors.drop_duplicates()

    sales = get_data_subset(df_sales, begin_date, end_date, None, ['user_id', 'price'])
    revenue = sales.groupby('user_id')[['price']].sum().reset_index()
    revenue = revenue.rename(columns={'price': 'metric'})

    # Left join keeps every visitor; missing revenue becomes 0.
    merged = pd.merge(visitors, revenue, on='user_id', how='left').fillna(0)
    return merged[['user_id', 'metric']]

In [None]:
df_example3=get_revenue_web(df_sales, df_web_logs,datetime(2022,3,1),datetime(2022,4,23))
df_example3.head()

In [None]:
# get revenue for all users 

def get_revenue_all(df_sales, df_web_logs, begin_date, end_date):
    """Return the revenue per user for every user seen in the web logs before end_date.

    Unlike the sales filter, the user list has no lower date bound: it is built
    from all ``df_web_logs`` records earlier than ``end_date``. Sales ``price``
    values within [begin_date, end_date) are summed into ``metric``; users
    without sales get 0.
    """
    known_users = get_data_subset(df_web_logs, None, end_date, None, ['user_id'])
    known_users = known_users.drop_duplicates()

    sales = get_data_subset(df_sales, begin_date, end_date, None, ['user_id', 'price'])
    revenue = sales.groupby('user_id')[['price']].sum().reset_index()
    revenue = revenue.rename(columns={'price': 'metric'})

    # Left join keeps every known user; missing revenue becomes 0.
    merged = pd.merge(known_users, revenue, on='user_id', how='left').fillna(0)
    return merged[['user_id', 'metric']]

In [None]:
df_example4=get_revenue_all(df_sales, df_web_logs,datetime(2022,3,1),datetime(2022,4,23))
df_example4.head()

## Hypothesis testing

### Student's t-distribution

In [None]:
# Student's t densities for several degrees of freedom against the standard normal density.
X = np.linspace(-5, 5, 100)
list_k = [1, 2, 5, 20]
for k in list_k:
    plt.plot(X, stats.t.pdf(X, k), label=f'k={k}')

# Reference curve: N(0, 1), drawn dashed and thick.
Y = stats.norm.pdf(X, 0, 1)
plt.plot(X, Y, '--', label='norm', linewidth=4)

plt.title('Students t-distribution')
plt.xlabel('x')
plt.legend()
plt.grid()
plt.show()


### Student's test 

In [None]:
def get_student_parameter(data_one: np.array, data_two: np.array) -> float:
    """Calculates the parameter of the Student distribution function.

    This is the Welch–Satterthwaite approximation of the degrees of freedom
    for two samples with possibly different sizes and variances:

        k = (s1^2/n1 + s2^2/n2)^2
            / ((s1^2/n1)^2 / (n1 - 1) + (s2^2/n2)^2 / (n2 - 1))

    where s^2 is the unbiased sample variance (``ddof=1``), matching the
    variance estimate used in ``get_ttest_statistic``.

    Each sample needs at least two observations; otherwise the sample
    variance and the ``n - 1`` terms are undefined.
    """
    # Each length comes from its own sample (the second used to reuse data_one).
    len_one, len_two = len(data_one), len(data_two)
    var_one, var_two = np.var(data_one, ddof=1), np.var(data_two, ddof=1)
    # Squared standard errors of the two sample means.
    sq_err_one, sq_err_two = var_one / len_one, var_two / len_two
    k = (
        (sq_err_one + sq_err_two) ** 2
        / (
            sq_err_one ** 2 / (len_one - 1)
            + sq_err_two ** 2 / (len_two - 1)
        )
    )
    return k


def get_ttest_statistic(data_one: np.array, data_two: np.array) -> float:
    """Calculates Student's statistic

    The value is ``(mean(data_two) - mean(data_one))`` divided by the square
    root of the summed squared standard errors of both sample means, where
    each variance is the unbiased estimate (``ddof=1``).
    """
    def _mean_and_squared_error(sample):
        # Sample mean and the squared standard error of that mean.
        return np.mean(sample), np.std(sample, ddof=1) ** 2 / len(sample)

    mean_one, sq_err_one = _mean_and_squared_error(data_one)
    mean_two, sq_err_two = _mean_and_squared_error(data_two)
    return (mean_two - mean_one) / np.sqrt(sq_err_two + sq_err_one)

In [None]:
# Compare the manual results with SciPy's ready-made implementation of the test.
sample_size = 100
alpha = 0.05

np.random.seed(44)
data_control = np.random.normal(1, 1, sample_size)
data_pilot = np.random.normal(1.2, 1, sample_size)

k = get_student_parameter(data_control, data_pilot)
print(f'student_parameter = {k}')

critical_region_bounds = stats.t.ppf([alpha/2, 1 - alpha/2], df=k)
print(f'critical_region_bounds = {critical_region_bounds}')

ttest_statistic = get_ttest_statistic(data_control, data_pilot)
print(f'ttest_statistic = {ttest_statistic}')

# Two-sided p-value: twice the lower-tail probability of -|t|.
pvalue = stats.t.cdf(-np.abs(ttest_statistic), df=k) * 2
print(f'pvalue = {pvalue}')

# The manual computation is Welch's test (separate variances and approximated
# degrees of freedom), so SciPy must be asked for the same variant:
# the default equal_var=True runs the pooled-variance test instead.
ttest_result_real = stats.ttest_ind(data_pilot, data_control, equal_var=False)
ttest_statistic_real, pvalue_real = ttest_result_real
print(f'stats.ttest_ind = {ttest_result_real}')

# The assert message is shown only on failure, so it describes the failure.
assert np.isclose(ttest_statistic_real, ttest_statistic, 1e-9), \
    'manual t-statistic differs from stats.ttest_ind'
assert np.isclose(pvalue_real, pvalue, 1e-6), \
    'manual p-value differs from stats.ttest_ind'


In [None]:
# Critical regions of the two-sided criterion on the t density under H0.
X = np.linspace(-3, 3, 1000)
Y = stats.t.pdf(X, k)

plt.plot(X, Y, label=f'St (k={k:0.0f})')

lower_bound, upper_bound = critical_region_bounds
critical_mask_list = [X < lower_bound, X > upper_bound]
# Only the first shaded tail gets a legend entry, so the legend shows it once.
region_labels = ['critical region', '']
for mask, region_label in zip(critical_mask_list, region_labels):
    X_ = X[mask]
    Y_upper = Y[mask]
    Y_down = np.zeros(len(Y_upper))
    plt.fill_between(
        X_, Y_down, Y_upper,
        color='r', alpha=0.3,
        label=region_label
    )

# Observed statistic marked on the horizontal axis.
plt.scatter([ttest_statistic], [0], color='k', label='t-statistic')

plt.title('Distribution of statistics for H0')
plt.xlabel('t')
plt.legend()
plt.grid()
plt.show()


### Two-sided and one-sided criteria

In [None]:
# Critical region of the one-sided criterion: the upper tail beyond the (1 - alpha) quantile.
sample_size = 100
alpha = 0.05

k = get_student_parameter(data_control, data_pilot)
critical_region_bound = stats.t.ppf([1 - alpha], df=k)

X = np.linspace(-3, 3, 1000)
Y = stats.t.pdf(X, k)
plt.plot(X, Y, label=f'St (k={k:0.0f})')

# Shade the part of the density to the right of the bound.
mask = X > critical_region_bound
X_, Y_upper = X[mask], Y[mask]
Y_down = np.zeros(len(Y_upper))
plt.fill_between(
    X_, Y_down, Y_upper,
    label='critical region',
    color='r', alpha=0.3
)

# Observed statistic marked on the horizontal axis.
plt.scatter([ttest_statistic], [0], color='k', label='t-statistic')

plt.title('Distribution of statistics for H0')
plt.xlabel('t')
plt.legend()
plt.grid()
plt.show()

### P-value 

In [None]:
# Two independent normal samples of 100 points with means 0 and 0.4 and unit spread,
# compared with SciPy's two-sample t-test.
x = np.random.normal(loc=0, scale=1, size=100)
y = np.random.normal(loc=0.4, scale=1, size=100)
stats.ttest_ind(x, y)