In [6]:
import pandas as pd
df_test = pd.read_csv('/kaggle/input/ab-testing-datasets/test_group.csv')
df_control = pd.read_csv('/kaggle/input/ab-testing-datasets/control_group.csv')

In [7]:
print(df_control.head())
print('-----------------------------')
print(df_control.info())
print('-----------------------------')
print(df_control.describe())

  Campaign Name;Date;Spend [USD];# of Impressions;Reach;# of Website Clicks;# of Searches;# of View Content;# of Add to Cart;# of Purchase
0  Control Campaign;1.08.2019;2280;82702;56930;70...                                                                                      
1  Control Campaign;2.08.2019;1757;121040;102513;...                                                                                      
2  Control Campaign;3.08.2019;2343;131711;110862;...                                                                                      
3  Control Campaign;4.08.2019;1940;72878;61235;30...                                                                                      
4             Control Campaign;5.08.2019;1835;;;;;;;                                                                                      
-----------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 1 columns):
 #   Column                       

In [8]:
print(df_test.head())
print('-----------------------------')
print(df_test.info())
print('-----------------------------')
print(df_test.describe())

  Campaign Name;Date;Spend [USD];# of Impressions;Reach;# of Website Clicks;# of Searches;# of View Content;# of Add to Cart;# of Purchase
0  Test Campaign;1.08.2019;3008;39550;35820;3038;...                                                                                      
1  Test Campaign;2.08.2019;2542;100719;91236;4657...                                                                                      
2  Test Campaign;3.08.2019;2365;70263;45198;7885;...                                                                                      
3  Test Campaign;4.08.2019;2710;78451;25937;4216;...                                                                                      
4  Test Campaign;5.08.2019;2297;114295;95138;5863...                                                                                      
-----------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 1 columns):
 #   Column                       

In [11]:
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest

print("--- 1. Data Cleaning and Preparation ---")

try:
    df_test = pd.read_csv('/kaggle/input/ab-testing-datasets/test_group.csv', sep=';')
    df_control = pd.read_csv('/kaggle/input/ab-testing-datasets/control_group.csv', sep=';')
    print("CSV files loaded.")
except FileNotFoundError:
    print("CRITICAL ERROR: Make sure the file paths are correct.")
    print("'/kaggle/input/ab-testing-datasets/test_group.csv'")
    print("'/kaggle/input/ab-testing-datasets/control_group.csv'")
    exit()

if df_test.shape[1] < 2 or df_control.shape[1] < 2:
    print("\nCRITICAL ERROR: Data was not separated into columns correctly.")
    print("The program cannot continue. Please check the CSV file format and separator.")
    exit()
else:
    print("Data validation successful: Columns are correctly separated.\n")

df_combined = pd.concat([df_test, df_control], ignore_index=True)
print("Control and Test datasets combined.")

original_columns = df_combined.columns
df_combined.columns = df_combined.columns.str.strip() \
    .str.replace('# of ', '', regex=False) \
    .str.replace('\[USD\]', '', regex=True) \
    .str.replace(' ', '_')
print("Column names cleaned.")
print(f"Old columns: {list(original_columns)}")
print(f"New columns: {list(df_combined.columns)}\n")

print("Handling missing values...")
print(f"Rows before dropping NaN: {len(df_combined)}")
df_combined.dropna(inplace=True)
print(f"Rows after dropping NaN: {len(df_combined)}\n")

print("Converting data types...")
cols_to_convert = [col for col in df_combined.columns if col not in ['Campaign_Name', 'Date']]
for col in cols_to_convert:
    df_combined[col] = pd.to_numeric(df_combined[col])
print("Data types after conversion:")
print(df_combined.info())
print("\n" + "="*50 + "\n")

print("--- 2. Exploratory Analysis & Key Metrics ---")

campaign_summary = df_combined.groupby('Campaign_Name')[cols_to_convert].sum()
print("Overall Performance Metrics by Campaign:\n")
print(campaign_summary)
print("\n")

campaign_summary['Conversion_Rate'] = (campaign_summary['Purchase'] / campaign_summary['Reach']) * 100
campaign_summary['CPA'] = campaign_summary['Spend_'] / campaign_summary['Purchase']
campaign_summary['CTR'] = (campaign_summary['Website_Clicks'] / campaign_summary['Impressions']) * 100

print("Key Performance Indicators (KPIs) by Campaign:\n")
print(campaign_summary[['Conversion_Rate', 'CPA', 'CTR']].round(2))
print("\n" + "="*50 + "\n")

print("--- 3. Hypothesis Testing ---")

control_purchases = campaign_summary.loc['Control Campaign', 'Purchase']
control_reach = campaign_summary.loc['Control Campaign', 'Reach']
test_purchases = campaign_summary.loc['Test Campaign', 'Purchase']
test_reach = campaign_summary.loc['Test Campaign', 'Reach']

count = [test_purchases, control_purchases]
nobs = [test_reach, control_reach]

z_stat, p_value = proportions_ztest(count=count, nobs=nobs, alternative='larger')

print(f"Control Group: {int(control_purchases)} purchases from {int(control_reach)} users reached.")
print(f"Test Group:    {int(test_purchases)} purchases from {int(test_reach)} users reached.\n")
print(f"Z-statistic: {z_stat:.4f}")
print(f"P-value: {p_value:.10f}")
print("\n" + "="*50 + "\n")

print("--- 4. Interpretation and Final Recommendation ---")
alpha = 0.05

if p_value < alpha:
    print(f"Conclusion: Since the p-value ({p_value:.4f}) is less than our alpha ({alpha}), we REJECT the null hypothesis.")
    print("The observed increase in conversion rate for the Test Campaign is statistically significant.\n")
else:
    print(f"Conclusion: Since the p-value ({p_value:.4f}) is greater than our alpha ({alpha}), we FAIL TO REJECT the null hypothesis.")
    print("We do not have enough evidence to say the Test Campaign is significantly better.\n")


--- 1. Data Cleaning and Preparation ---
CSV files loaded.
Data validation successful: Columns are correctly separated.

Control and Test datasets combined.
Column names cleaned.
Old columns: ['Campaign Name', 'Date', 'Spend [USD]', '# of Impressions', 'Reach', '# of Website Clicks', '# of Searches', '# of View Content', '# of Add to Cart', '# of Purchase']
New columns: ['Campaign_Name', 'Date', 'Spend_', 'Impressions', 'Reach', 'Website_Clicks', 'Searches', 'View_Content', 'Add_to_Cart', 'Purchase']

Handling missing values...
Rows before dropping NaN: 60
Rows after dropping NaN: 59

Converting data types...
Data types after conversion:
<class 'pandas.core.frame.DataFrame'>
Index: 59 entries, 0 to 59
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Campaign_Name   59 non-null     object 
 1   Date            59 non-null     object 
 2   Spend_          59 non-null     int64  
 3   Impressions     59 non-nul