In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from matplotlib.ticker import PercentFormatter

In [None]:
# Load the per-user attribute table and preview three randomly chosen rows.
user = pd.read_csv(filepath_or_buffer="user_table.csv")
user.sample(n=3)

In [None]:
# Read `timestamp` as plain text instead of asking read_csv to parse it.
# The notebook later inspects the raw strings with `.str` and then parses them
# explicitly with pd.to_datetime(..., errors='coerce'); relying on
# `parse_dates` here only worked when parsing silently failed on malformed
# values, and would break the `.str` check if parsing ever succeeded.
test = pd.read_csv("test_results.csv", dtype={"timestamp": str})
test.sample(3)

In [None]:
# Find invalid rows: every row whose timestamp cannot be parsed with the
# expected '%Y-%m-%d %H:%M:%S' layout (out-of-range fields such as a minute or
# second of 60, other malformed strings, or missing values).
# Parsing with errors='coerce' turns each of these into NaT, so this catches
# more than a ":60:" substring match and works whether the column currently
# holds strings or datetimes.
test[
    pd.to_datetime(
        test["timestamp"], format='%Y-%m-%d %H:%M:%S', errors='coerce'
    ).isna()
]

In [None]:
# Attach user attributes to every test record; the left join keeps all rows of
# `test` even when no matching `user_id` exists in `user`.
df = test.merge(user, how='left', on='user_id')

In [None]:
# Print the merged frame's column dtypes and non-null counts.
df.info()

In [None]:
# Convert `timestamp` to datetime using the fixed layout; values that do not
# match become NaT (errors='coerce') instead of raising.
df = df.assign(
    timestamp=lambda frame: pd.to_datetime(
        frame["timestamp"],
        format='%Y-%m-%d %H:%M:%S',
        errors='coerce',
    )
)

In [None]:
# Preview three randomly sampled rows of the merged frame.
df.sample(3)

In [None]:
# Share of rows for each (test, price) combination; used to spot rows whose
# price does not match their experiment arm.
df.value_counts(subset=['test', 'price'], normalize=True)

In [None]:
# Keep only rows whose price matches the assigned arm: control (test == 0)
# must see 39 and experiment (test == 1) must see 59.
# `.copy()` makes the result an independent frame, so adding columns to it
# later does not trigger SettingWithCopyWarning.
df = df[
    ((df['test'] == 0) & (df['price'] == 39))
    | ((df['test'] == 1) & (df['price'] == 59))
].copy()

In [None]:
# Per-row revenue: `converted` times `price` (presumably `converted` is a 0/1
# flag, so this is the price for converters and 0 otherwise — verify).
# `assign` returns a new frame, which avoids SettingWithCopyWarning when `df`
# is a filtered slice of an earlier frame.
df = df.assign(revenue=df['converted'] * df['price'])


In [None]:
# Fraction of missing values in each column.
df.isnull().mean()

# Visualization

In [None]:
# List the column labels available for the analysis below.
df.columns

## test

In [None]:
from scipy.stats import ttest_ind

In [None]:
# Mean of `converted` (the conversion rate) for each experiment arm.
df.groupby('test')[['converted']].mean()

In [None]:
# Mean revenue per row for each experiment arm.
df.groupby('test')[['revenue']].mean()

In [None]:
# Share of rows in each experiment arm.
df['test'].value_counts(normalize=True)

- Null hypothesis H0: experimental group's average revenue is equal to or less than control group's average revenue
- Alternative hypothesis H1: experimental group's average revenue is higher than control group's average revenue

In [None]:
# Revenue values for the control (test == 0) and experiment (test == 1) arms.
ctr_val = df.loc[df['test'] == 0, 'revenue'].values
exp_val = df.loc[df['test'] == 1, 'revenue'].values

# Welch's t-test (equal_var=False). The control arm is passed first, so the
# statistic reflects mean(control) - mean(experiment).
welch_result = ttest_ind(ctr_val, exp_val, axis=0, equal_var=False)
print(welch_result)

Real-World Data Differences: In most real-world cases, groups often have different variances. For example, in your data, the users exposed to $39 might have a different variability in conversion or revenue than those exposed to $59.

In [None]:
from scipy.stats import levene
# Levene's test for equality of variances between the control and experiment
# revenue arrays; only the p-value is reported.
stat, p = levene(ctr_val, exp_val)
print("Levene's test p-value:", p)


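If p > 0.05, you fail to reject the null hypothesis of equal variances, so the variances can be assumed equal. If p ≤ 0.05, the variances differ and Welch's t-test (`equal_var=False`) is the appropriate choice.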

**Why Use a t-test on Revenue?**
The t-test is used to compare the means of two groups. In this context:

- You are comparing average revenue between users exposed to the $39 price (control) and the $59 price (experiment).
- The t-test checks whether the difference in average revenue is statistically significant or could have occurred by chance.

**Why Use Welch’s t-test (equal_var=False)?**

Welch’s t-test (unequal variance) is more robust and commonly used when:

- Group sizes differ: The number of users in control (price $39) and experiment (price $59) groups may not be equal.
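- Variances differ: The variability in revenue for users shown $39 might differ from those shown $59. For example:
    - Per-user revenue is either 0 or the price, so the non-zero values in the $59 group sit further from 0, which can make its variance higher.
    - The $39 group has more data points, which makes its variance estimate more precise but does not by itself widen the distribution; a different conversion rate, however, does change the spread.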


**Is This Test Two-Sided?**

By default, the ttest_ind function in scipy performs a two-sided test, which means:

- It tests for any difference in means, regardless of direction (greater or smaller).
- Null Hypothesis: The mean revenue of both groups is the same.
- Alternative Hypothesis: The mean revenue of both groups is different (could be greater or smaller).

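If you want to test a specific direction (e.g., $59 leads to higher revenue), you can use a one-sided test by dividing the p-value by 2 and checking the t-statistic's sign: with the control group passed first, the alternative is supported only when the t-statistic is negative (otherwise the one-sided p-value is 1 − p/2). Equivalently, pass `alternative='less'` to `ttest_ind`.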

**How to Interpret Results?**
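- t-statistic: Indicates the direction and magnitude of the difference. `ttest_ind(a, b)` is based on mean(a) − mean(b), so with the control group passed first, positive values mean the first group (price $39) has the higher mean revenue, while negative values mean the second group (price $59) does.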
- p-value:
    - If p<0.05 (common threshold), reject the null hypothesis: The difference in revenue is statistically significant.
    - If p≥0.05, fail to reject the null hypothesis: The difference in revenue is not statistically significant.

In [None]:
# Split the frame into the two experiment arms.
ctr = df.loc[df['test'] == 0]
exp = df.loc[df['test'] == 1]

# Revenue per row in each arm: price multiplied by the conversion flag.
revenue_ctr = ctr["price"].mul(ctr["converted"])
revenue_exp = exp["price"].mul(exp["converted"])

# Welch's t-test (unequal variances). Control is passed first, so a negative
# statistic means the experiment arm has the higher mean revenue.
t_stat, p_value = ttest_ind(
    revenue_ctr,
    revenue_exp,
    equal_var=False,
)

t_stat, p_value


- A negative t-statistic indicates that the mean revenue for the control group ($39 price) is lower than that of the experimental group ($59 price), because the control group is passed first to `ttest_ind`.
- There is a statistically significant difference in the mean revenue between the two groups.
- Decision: It appears that selling the software at $59 is the better strategy in terms of revenue per user, as it leads to significantly higher average revenue compared to $39.

In [None]:
# One-sided test: check if the $59 price leads to higher revenue.
# H1: mean(experiment) > mean(control).
# `t_stat` comes from ttest_ind(control, experiment), i.e. it is based on
# mean(control) - mean(experiment), so H1 is supported when t_stat is NEGATIVE.
# In that case the one-sided p-value is half of the two-sided one; otherwise
# the observed difference points the wrong way and the p-value is 1 - p/2.
if t_stat < 0:
    one_sided_p_value = p_value / 2
else:
    one_sided_p_value = 1 - (p_value / 2)

print("One-sided p-value:", one_sided_p_value)


In [None]:
# # Compute conversion counts for visualization
# conversion_counts = df.groupby(['price', 'converted']).size().unstack(fill_value=0)

# # Plot pie charts for each price group
# fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# for i, price in enumerate(conversion_counts.index):
#     axes[i].pie(
#         conversion_counts.loc[price],
#         labels=["Not Converted", "Converted"],
#         autopct='%1.1f%%',
#         startangle=90,
#         colors=['lightcoral', 'lightgreen']
#     )
#     axes[i].set_title(f"Conversion Rate for Price ${price}")

# plt.tight_layout()
# plt.show()

In [None]:
col_to_plot = "test"

# One row per experiment arm: conversion rate, total revenue, number of rows,
# and average revenue per row (total revenue / count).
data = (
    df.groupby([col_to_plot])
    .agg(
        conversion_rate=("converted", "mean"),
        revenue=("revenue", "sum"),
        count=("user_id", "count"),
    )
    .reset_index()
    .assign(avg_rev=lambda x: x["revenue"] / x["count"])
)

# (column to plot, title prefix) for each panel. The third panel shows
# avg_rev, so it is labelled as an average rather than plain "Revenue".
panels = [
    ("count", "Traffic"),
    ("conversion_rate", "Conversion Rate"),
    ("avg_rev", "Average Revenue per User"),
]

fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for axis, (metric, label) in zip(ax, panels):
    sns.barplot(data=data, x=col_to_plot, y=metric, ax=axis)
    axis.set_title(f"{label} by {col_to_plot}")
# Conversion rate is a fraction in [0, 1]; show it as a percentage.
ax[1].yaxis.set_major_formatter(PercentFormatter(xmax=1))
plt.tight_layout()
plt.show()


## source

In [None]:
col_to_plot = "source"

# One row per (experiment arm, source): conversion rate, total revenue,
# number of rows, and average revenue per row (total revenue / count).
data = (
    df.groupby(["test", col_to_plot])
    .agg(
        conversion_rate=("converted", "mean"),
        revenue=("revenue", "sum"),
        count=("user_id", "count"),
    )
    .reset_index()
    .assign(avg_rev=lambda x: x["revenue"] / x["count"])
)

# (column to plot, title prefix) for each panel. The third panel shows
# avg_rev, so it is labelled as an average rather than plain "Revenue".
panels = [
    ("count", "Traffic"),
    ("conversion_rate", "Conversion Rate"),
    ("avg_rev", "Average Revenue per User"),
]

fig, ax = plt.subplots(3, 1, figsize=(20, 12))
for axis, (metric, label) in zip(ax, panels):
    sns.barplot(data=data, x=col_to_plot, y=metric, hue="test", ax=axis)
    axis.set_title(f"{label} by {col_to_plot}")
# Conversion rate is a fraction in [0, 1]; show it as a percentage.
ax[1].yaxis.set_major_formatter(PercentFormatter(xmax=1))
plt.tight_layout()
plt.show()


## operative_system

In [None]:
col_to_plot = "operative_system"

# One row per (experiment arm, operating system): conversion rate, total
# revenue, number of rows, and average revenue per row (total revenue / count).
data = (
    df.groupby(["test", col_to_plot])
    .agg(
        conversion_rate=("converted", "mean"),
        revenue=("revenue", "sum"),
        count=("user_id", "count"),
    )
    .reset_index()
    .assign(avg_rev=lambda x: x["revenue"] / x["count"])
)

# (column to plot, title prefix) for each panel. The third panel shows
# avg_rev, so it is labelled as an average rather than plain "Revenue".
panels = [
    ("count", "Traffic"),
    ("conversion_rate", "Conversion Rate"),
    ("avg_rev", "Average Revenue per User"),
]

fig, ax = plt.subplots(3, 1, figsize=(20, 12))
for axis, (metric, label) in zip(ax, panels):
    sns.barplot(data=data, x=col_to_plot, y=metric, hue="test", ax=axis)
    axis.set_title(f"{label} by {col_to_plot}")
# Conversion rate is a fraction in [0, 1]; show it as a percentage.
ax[1].yaxis.set_major_formatter(PercentFormatter(xmax=1))
plt.tight_layout()
plt.show()


## device

In [None]:
# List the column labels of `df` again before plotting by device.
df.columns

In [None]:
col_to_plot = "device"

# One row per (experiment arm, device): conversion rate, total revenue,
# number of rows, and average revenue per row (total revenue / count).
data = (
    df.groupby(["test", col_to_plot])
    .agg(
        conversion_rate=("converted", "mean"),
        revenue=("revenue", "sum"),
        count=("user_id", "count"),
    )
    .reset_index()
    .assign(avg_rev=lambda x: x["revenue"] / x["count"])
)

# (column to plot, title prefix) for each panel. The third panel shows
# avg_rev, so it is labelled as an average rather than plain "Revenue".
panels = [
    ("count", "Traffic"),
    ("conversion_rate", "Conversion Rate"),
    ("avg_rev", "Average Revenue per User"),
]

fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for axis, (metric, label) in zip(ax, panels):
    sns.barplot(data=data, x=col_to_plot, y=metric, hue="test", ax=axis)
    axis.set_title(f"{label} by {col_to_plot}")
# Conversion rate is a fraction in [0, 1]; show it as a percentage.
ax[1].yaxis.set_major_formatter(PercentFormatter(xmax=1))
plt.tight_layout()
plt.show()


**Actionable Insights**
- Based on the analysis, recommend:
    - Whether the price increase leads to overall higher revenue.
    - Which user segments respond best or worst to the price change.
- Address potential trade-offs between higher revenue and lower conversion rates.