## Import Libs and Dataset

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import urllib

# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx'
df = pd.read_excel(url)

# Display the first few rows of the dataset
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 361878 entries, 0 to 541893
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    361878 non-null  object        
 1   StockCode    361878 non-null  object        
 2   Description  361878 non-null  object        
 3   Quantity     361878 non-null  int64         
 4   InvoiceDate  361878 non-null  datetime64[ns]
 5   UnitPrice    361878 non-null  float64       
 6   CustomerID   361878 non-null  float64       
 7   Country      361878 non-null  object        
 8   TotalAmount  361878 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 27.6+ MB


## Data Pre-processing

In [2]:
# Remove rows with missing CustomerID
df = df.dropna(subset=['CustomerID'])

# Filter transactions for UK customers only
df = df[df['Country'] == 'United Kingdom']

# Convert InvoiceDate to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Add a new column for total amount
df['TotalAmount'] = df['Quantity'] * df['UnitPrice']

df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalAmount
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


## Design A/B Testing

In [6]:
# Remove rows with missing CustomerID
df = df.dropna(subset=['CustomerID'])

# Filter transactions for UK customers only
df = df[df['Country'] == 'United Kingdom']

# Convert InvoiceDate to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Add a new column for total amount
df['TotalAmount'] = df['Quantity'] * df['UnitPrice']

df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalAmount
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


## Define A/B Testing

In [8]:
# Randomly assign customers to control (A) or treatment (B) groups
np.random.seed(42)
df['Group'] = np.random.choice(['A', 'B'], size=len(df))

# Calculate total amount spent by each customer
customer_sales = df.groupby(['CustomerID', 'Group'])['TotalAmount'].sum().reset_index()

# Separate the control and treatment groups
control_group = customer_sales[customer_sales['Group'] == 'A']
treatment_group = customer_sales[customer_sales['Group'] == 'B']

# Display the first few rows of each group
control_group.head(), treatment_group.head()

(   CustomerID Group  TotalAmount
 0     12346.0     A    -77183.60
 2     12747.0     A      1684.68
 4     12748.0     A     14970.01
 6     12749.0     A      1928.06
 8     12820.0     A       551.18,
    CustomerID Group  TotalAmount
 1     12346.0     B     77183.60
 3     12747.0     B      2511.33
 5     12748.0     B     14102.09
 7     12749.0     B      1940.14
 9     12820.0     B       391.16)

## Analyze Results

In [9]:
# Calculate the mean and standard deviation of total amount spent for both groups
mean_control = control_group['TotalAmount'].mean()
std_control = control_group['TotalAmount'].std()

mean_treatment = treatment_group['TotalAmount'].mean()
std_treatment = treatment_group['TotalAmount'].std()

print(f'Mean Total Amount (Control): {mean_control}')
print(f'Std Total Amount (Control): {std_control}')
print(f'Mean Total Amount (Treatment): {mean_treatment}')
print(f'Std Total Amount (Treatment): {std_treatment}')

# Perform an independent t-test
t_stat, p_value = stats.ttest_ind(control_group['TotalAmount'], treatment_group['TotalAmount'])

print(f'T-statistic: {t_stat}')
print(f'P-value: {p_value}')

Mean Total Amount (Control): 901.435618668038
Std Total Amount (Control): 4660.865253635615
Mean Total Amount (Treatment): 837.53280436457
Std Total Amount (Treatment): 4254.270019732484
T-statistic: 0.6317676296433713
P-value: 0.5275572863569251


In [10]:
alpha = 0.05

if p_value < alpha:
    print("The difference between the control and treatment groups is statistically significant.")
else:
    print("The difference between the control and treatment groups is not statistically significant.")

The difference between the control and treatment groups is not statistically significant.
