In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
# Set seaborn's global plot theme: "whitegrid" axes style with the "muted" colour palette.
sns.set_theme(style="whitegrid", palette="muted")

import scipy.stats as stats


# 1. Load Dataset

In [None]:
# Load the experiment table from a CSV file located relative to the notebook's working directory.
experiment_df = pd.read_csv('test_table.csv')

In [None]:
# Preview the first rows to check column names and value formats.
experiment_df.head()

In [None]:
# Print dtypes and non-null counts to spot missing values and columns that need conversion.
experiment_df.info()

In [None]:
# For every column of the experiment table, print its number of distinct values
# (as returned by Series.unique) and the first seven of them as a sample.
for column_name, column_values in experiment_df.items():
    distinct = column_values.unique()
    n_distinct = len(distinct)
    preview = str(distinct[:7])
    print(f"{column_name:<20} Unique: {n_distinct:<5} | Sample: {preview}")

In [None]:
# Parse 'date' into pandas datetimes (overwrites the column in place) so it can be
# grouped and plotted on a time axis later in the notebook.
experiment_df['date'] = pd.to_datetime(experiment_df['date'])

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
experiment_df.describe(include='all')

In [None]:
# Load the user table from a CSV file located relative to the notebook's working directory.
# It is joined to the experiment table on 'user_id' further down.
user_df = pd.read_csv('user_table.csv')

In [None]:
# Show three random rows of the user table. The fixed random_state keeps the displayed
# sample identical across "Restart Kernel -> Run All" executions (42 is an arbitrary seed).
user_df.sample(3, random_state=42)

In [None]:
# For every column of the user table, print its number of distinct values
# (as returned by Series.unique) and the first seven of them as a sample.
for column_name, column_values in user_df.items():
    distinct = column_values.unique()
    n_distinct = len(distinct)
    preview = str(distinct[:7])
    print(f"{column_name:<20} Unique: {n_distinct:<5} | Sample: {preview}")

In [None]:
# Print dtypes and non-null counts of the user table to spot missing values.
user_df.info()

In [None]:
# Summary statistics for every user-table column; include='all' also covers non-numeric columns.
user_df.describe(include='all')

In [None]:
# Combine experiment rows with user attributes. The inner join keeps only user_ids
# that appear in both tables, so rows without a match on either side are dropped.
df = pd.merge(
    left=experiment_df,
    right=user_df,
    how='inner',
    on='user_id',
)

In [None]:
# Show four random rows of the merged table. The fixed random_state keeps the displayed
# sample identical across "Restart Kernel -> Run All" executions (42 is an arbitrary seed).
df.sample(4, random_state=42)

In [None]:
# Check dtypes and non-null counts of the merged table (row count shows what the inner join kept).
df.info()

# 2. EDA

In [None]:
# (column, title, x-axis label) for each count plot; one subplot row is drawn per entry.
features = [
    ('test', "Count plot by Variant Group", "Variant Group"),
    ('country', "Count plot of Country by Variant Group", "Country"),
    ('device', "Count plot of Device by Variant Group", "Device"),
    ('browser', "Count plot of Browser by Variant Group", "Browser"),
    ('sex', "Count plot of Sex by Variant Group", "Sex"),
    ('ads_channel', "Count plot of Ads Channel by Variant Group", "Ads Channel"),
    ('browser_language', "Count plot of Browser Language by Variant Group", "Browser Language"),
    ('age', "Count plot of Age by Variant Group", "Age")
]

# One row per feature, 4 inches tall each (15 x 32 for the eight features above),
# so the figure keeps its proportions if entries are added or removed.
fig, axes = plt.subplots(len(features), 1, figsize=(15, 4 * len(features)))

# Draw the row counts of each feature, split by the 'test' column (the variant group).
for ax, (col, title, xlabel) in zip(axes, features):
    sns.countplot(data=df, x=col, hue='test', ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)

# Prevent titles and tick labels of neighbouring rows from overlapping.
plt.tight_layout()
plt.show()


In [None]:
# (column, title, x-axis label) for each conversion plot; one subplot row is drawn per entry.
plot_settings = [
    ('test', "Conversion Rate by Variant Group", "Variant Group"),
    ('country', "Conversion Rate for Country by Variant Group", "Country"),
    ('device', "Conversion Rate for Device by Variant Group", "Device"),
    ('browser', "Conversion Rate for Browser by Variant Group", "Browser"),
    ('sex', "Conversion Rate for Sex by Variant Group", "Sex"),
    ('ads_channel', "Conversion Rate for Ads Channel by Variant Group", "Ads Channel"),
    ('browser_language', "Conversion Rate for Browser Language by Variant Group", "Browser Language"),
    ('age', "Conversion Rate for Age by Variant Group", "Age")
]

# Stack the plots vertically, one row per entry. The grid is sized from the settings list
# (15 x 32 for the eight entries above) so it cannot drift out of sync with it.
fig, axes = plt.subplots(len(plot_settings), 1, figsize=(15, 4 * len(plot_settings)))

# sns.barplot aggregates with the mean by default, so each bar is the mean of 'conversion'
# per category and variant group (the conversion rate, assuming 'conversion' is a 0/1 flag).
for ax, (x_var, title, xlabel) in zip(axes, plot_settings):
    sns.barplot(data=df, x=x_var, y='conversion', hue='test', ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Conversion Rate")

# Prevent titles and tick labels of neighbouring rows from overlapping.
plt.tight_layout()
plt.show()


# 3. Statistical Testing

## a. T-Test


In [None]:
# Restrict the analysis to rows whose country is not Spain.
# NOTE(review): the reason for excluding Spain is not stated in this notebook - confirm and document.
outside_spain = df['country'] != 'Spain'
test_data = df.loc[outside_spain]

# Conversion values of the two variant groups as arrays (test == 1: treatment, test == 0: control).
is_treatment = test_data['test'] == 1
is_control = test_data['test'] == 0
trt_val = test_data.loc[is_treatment, 'conversion'].values
ctrl_val = test_data.loc[is_control, 'conversion'].values

In [None]:
# Two-sample t-test on the conversion values of treatment vs. control;
# equal_var=False selects Welch's variant, which does not assume equal group variances.
t_stat, p = stats.ttest_ind(trt_val, ctrl_val, equal_var=False)

print(f"T-statistic: {t_stat}", f"P-value: {p}", sep="\n")

# Decide at the 5% significance level.
verdict = (
    "Reject the null hypothesis: Significant difference in conversion rates."
    if p < 0.05
    else "Fail to reject the null hypothesis: No significant difference."
)
print(verdict)

The localized translations are underperforming significantly compared to the non-localized translations.

## b. Chi-square goodness-of-fit test

In [None]:
# Check whether the rows are split evenly between the two variant groups.
n_rows = len(test_data)

# Observed group sizes versus the sizes an exact 50/50 split would give.
observed_counts = test_data['test'].value_counts().tolist()
expected_counts = [n_rows / 2] * 2

# Chi-square goodness-of-fit test of observed against expected group sizes.
chi2_stat, p_value = stats.chisquare(f_obs=observed_counts, f_exp=expected_counts)

# Show the test statistic followed by its p-value.
print(chi2_stat, p_value)

The observed counts in the test groups are significantly different from the expected 50/50 split. This means the group distribution is not balanced as assumed.

In [None]:
# Number of rows per day and variant group, reshaped to long form for plotting.
data = test_data.groupby(['date', 'test']).size().reset_index(name='count')

# One line per variant group. No explicit `color` is passed: seaborn ignores `color`
# whenever `hue` is assigned, so the line colours come from the hue mapping.
fig, ax = plt.subplots(figsize=(15, 5))
sns.lineplot(data=data, x='date', y='count', hue='test', marker='o', linestyle='-', ax=ax)
ax.set_title("Assignment")
ax.set_xlabel("Date")
ax.set_ylabel("Count")
ax.grid(True)
plt.show()


In [None]:
# NOTE(review): every line of this cell is commented out. It is a hand-unrolled near-duplicate
# of the loop-driven conversion-rate bar plots in the EDA section; consider deleting the cell.
# # Create subplots to compare categorical and numerical features side by side
# fig, axes = plt.subplots(8, 1, figsize=(15, 32))  # 8 rows, 1 column

# sns.barplot(data=df, x='test', y='conversion', ax=axes[0])
# axes[0].set_title("Conversion Rate by Variant Group")
# axes[0].set_xlabel("Variant Group")
# axes[0].set_ylabel("Conversion Rate")

# sns.barplot(data=df, x='country', y='conversion', hue='test', ax=axes[1])
# axes[1].set_title("Conversion Rate for Country by Variant Group")
# axes[1].set_xlabel("Country")
# axes[1].set_ylabel("Conversion Rate")

# sns.barplot(data=df, x='device', y='conversion', hue='test', ax=axes[2])
# axes[2].set_title("Conversion Rate for Device by Variant Group")
# axes[2].set_xlabel("Device")
# axes[2].set_ylabel("Conversion Rate")

# sns.barplot(data=df, x='browser', y='conversion', hue='test', ax=axes[3])
# axes[3].set_title("Conversion Rate for Browser by Variant Group")
# axes[3].set_xlabel("Browser")
# axes[3].set_ylabel("Conversion Rate")

# sns.barplot(data=df, x='sex', y='conversion', hue='test', ax=axes[4])
# axes[4].set_title("Conversion Rate for Sex by Variant Group")
# axes[4].set_xlabel("Sex")
# axes[4].set_ylabel("Conversion Rate")

# sns.barplot(data=df, x='ads_channel', y='conversion', hue='test', ax=axes[5])
# axes[5].set_title("Conversion Rate for Ads Channel by Variant Group")
# axes[5].set_xlabel("Ads Channel")
# axes[5].set_ylabel("Conversion Rate")

# sns.barplot(data=df, x='browser_language', y='conversion', hue='test', ax=axes[6])
# axes[6].set_title("Conversion Rate for Browser Language by Variant Group")
# axes[6].set_xlabel("Browser Language")
# axes[6].set_ylabel("Conversion Rate")

# sns.barplot(data=df, x='age', y='conversion', hue='test', ax=axes[7])
# axes[7].set_title("Conversion Rate for Age by Variant Group")
# axes[7].set_xlabel("Age")
# axes[7].set_ylabel("Conversion Rate")

# # Adjust layout
# plt.tight_layout()
# plt.show()