In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind


In [2]:
# Read the customer retention records from CSV into a DataFrame.
# Adjust the path if the file lives somewhere else.
df = pd.read_csv(filepath_or_buffer="customer_retention.csv")

# Show the first five rows as a quick sanity check.
df.head(n=5)


Unnamed: 0,customer_id,discount_received,returned
0,1,1,1
1,2,1,1
2,3,1,0
3,4,1,1
4,5,1,1


In [3]:
# Print the schema summary: column dtypes, non-null counts, and memory usage.
df.info()
# Display descriptive statistics (count, mean, std, min, quartiles, max).
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   customer_id        30 non-null     int64
 1   discount_received  30 non-null     int64
 2   returned           30 non-null     int64
dtypes: int64(3)
memory usage: 852.0 bytes


Unnamed: 0,customer_id,discount_received,returned
count,30.0,30.0,30.0
mean,15.5,0.5,0.5
std,8.803408,0.508548,0.508548
min,1.0,0.0,0.0
25%,8.25,0.0,0.0
50%,15.5,0.5,0.5
75%,22.75,1.0,1.0
max,30.0,1.0,1.0


In [4]:
# `returned` values for customers flagged as having received a discount
discount_group = df.loc[df["discount_received"].eq(1), "returned"]

# `returned` values for customers flagged as NOT having received a discount
no_discount_group = df.loc[df["discount_received"].eq(0), "returned"]

# Number of customers in each group
discount_group.size, no_discount_group.size


(15, 15)

In [5]:
# Mean of `returned` in each group (discount first, then no discount); the
# notebook's printed summary reports these as the groups' average return rates.
discount_group.mean(), no_discount_group.mean()


(0.7333333333333333, 0.26666666666666666)

In [6]:
# Compare the mean of `returned` between the two groups with an independent
# two-sample t-test.
# NOTE(review): `returned` looks like a 0/1 flag and the groups are small, so a
# test built for proportions (e.g. Fisher's exact) may fit better — confirm.
t_stat, p_value = ttest_ind(
    discount_group,
    no_discount_group,
    equal_var=False  # Welch's t-test (safer when variances differ)
)

# Display the test statistic and the p-value.
t_stat, p_value


(2.792034123261129, 0.009332430702599726)

In [7]:
# Significance threshold that the p-value is compared against.
alpha = 0.05

# Report the outcome: significant only when the p-value is strictly below alpha.
print(
    "The result is statistically significant."
    if p_value < alpha
    else "The result is NOT statistically significant."
)


The result is statistically significant.


In [8]:
# Plain-language summary of the comparison. Both the stated significance level
# and the reject / fail-to-reject wording are derived from `alpha` (defined with
# the significance check) so they cannot drift from the threshold actually used.
# `{alpha * 100:g}` renders 0.05 as "5" and, e.g., 0.025 as "2.5".
print(f"""
Customers who received a discount had an average return rate of {discount_group.mean():.2f}.
Customers who did not receive a discount had an average return rate of {no_discount_group.mean():.2f}.

The t-test produced a p-value of {p_value:.4f}.

At a {alpha * 100:g}% significance level, this means we {'reject' if p_value < alpha else 'fail to reject'} the null hypothesis.
""")



Customers who received a discount had an average return rate of 0.73.
Customers who did not receive a discount had an average return rate of 0.27.

The t-test produced a p-value of 0.0093.

At a 5% significance level, this means we reject the null hypothesis.

