In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

In [3]:
email_opened = pd.read_csv("email_opened_table.csv")
email_table = pd.read_csv("email_table.csv")
link_clicked = pd.read_csv("link_clicked_table.csv")

In [4]:
if not email_opened['email_id'].isin(email_table['email_id']).all():
    print("Warning: Some email_ids in email_opened_table not found in email_table")
if not link_clicked['email_id'].isin(email_table['email_id']).all():
    print("Warning: Some email_ids in link_clicked_table not found in email_table")

In [5]:
print("\nMissing values in email_table:\n", email_table.isnull().sum())
print("\nMissing values in email_opened:\n", email_opened.isnull().sum())
print("\nMissing values in link_clicked:\n", link_clicked.isnull().sum())


Missing values in email_table:
 email_id               0
email_text             0
email_version          0
hour                   0
weekday                0
user_country           0
user_past_purchases    0
dtype: int64

Missing values in email_opened:
 email_id    0
dtype: int64

Missing values in link_clicked:
 email_id    0
dtype: int64


In [6]:
email_table['opened'] = email_table['email_id'].isin(email_opened['email_id']).astype(int)
email_table['clicked'] = email_table['email_id'].isin(link_clicked['email_id']).astype(int)

In [8]:
total_emails = len(email_table)
emails_opened = email_table['opened'].sum()
emails_clicked = email_table['clicked'].sum()
open_rate = (emails_opened / total_emails) * 100
ctr = (emails_clicked / total_emails) * 100

print(f"\nTotal Emails Sent: {total_emails}")
print(f"Emails Opened: {emails_opened}")
print(f"Emails Clicked: {emails_clicked}")
print(f"Open Rate: {open_rate:.2f}%")
print(f"Click-Through Rate: {ctr:.2f}%")


Total Emails Sent: 100000
Emails Opened: 10345
Emails Clicked: 2119
Open Rate: 10.35%
Click-Through Rate: 2.12%


In [9]:
def plot_bar(feature, target):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=feature, y=target, data=email_table)
    plt.title(f'{target.capitalize()} Rate by {feature.capitalize()}')
    plt.xticks(rotation=45)
    plt.savefig(f'{target}_by_{feature}.png')
    plt.close()

In [10]:
features_to_plot = ['email_text', 'email_version', 'user_country', 'weekday', 'user_past_purchases']
for feature in features_to_plot:
    plot_bar(feature, 'opened')
    plot_bar(feature, 'clicked')

In [11]:
email_table['hour'] = email_table['hour'].astype(int)
plot_bar('hour', 'opened')
plot_bar('hour', 'clicked')

In [12]:
print("\nClick Rate by Segment:")
for feature in features_to_plot + ['hour']:
    print(f"\n{feature}:")
    print(email_table.groupby(feature)['clicked'].mean() * 100)


Click Rate by Segment:

email_text:
email_text
long_email     1.853767
short_email    2.387177
Name: clicked, dtype: float64

email_version:
email_version
generic         1.513673
personalized    2.729409
Name: clicked, dtype: float64

user_country:
user_country
ES    0.832748
FR    0.800400
UK    2.467526
US    2.435981
Name: clicked, dtype: float64

weekday:
weekday
Friday       1.403682
Monday       2.290608
Saturday     1.784611
Sunday       1.675123
Thursday     2.444491
Tuesday      2.488864
Wednesday    2.761999
Name: clicked, dtype: float64

user_past_purchases:
user_past_purchases
0       0.050443
1       1.119919
2       1.534213
3       1.656040
4       2.140929
5       2.222960
6       3.205640
7       3.073872
8       3.960847
9       4.550971
10      4.655099
11      5.602061
12      6.567797
13      6.574394
14      9.116022
15     11.702128
16     11.764706
17      8.333333
18      2.857143
19     20.000000
20      0.000000
21     50.000000
22    100.000000
Name: click

In [13]:
pivot = email_table.pivot_table(values='clicked', index='email_version', columns='user_country', aggfunc='mean') * 100
print("\nClick Rate by Email Version and Country:")
print(pivot)


Click Rate by Email Version and Country:
user_country         ES        FR        UK        US
email_version                                        
generic        0.562588  0.536459  1.826209  1.729898
personalized   1.102204  1.068118  3.108393  3.150740


In [14]:
print("\nKey Patterns:")
print("- Short emails had higher click rates than long emails.")
print("- Personalized emails outperformed generic ones, especially in the US and UK.")
print("- Users with more past purchases (5+) had higher click rates.")
print("- Emails sent on weekdays, particularly Wednesday, had higher click rates than weekends.")
print("- Emails sent in the morning (8-11 AM) had higher open and click rates.")


Key Patterns:
- Short emails had higher click rates than long emails.
- Personalized emails outperformed generic ones, especially in the US and UK.
- Users with more past purchases (5+) had higher click rates.
- Emails sent on weekdays, particularly Wednesday, had higher click rates than weekends.
- Emails sent in the morning (8-11 AM) had higher open and click rates.


In [15]:
label_encoder_text = LabelEncoder()
email_table['email_text'] = label_encoder_text.fit_transform(email_table['email_text'])
label_encoder_version = LabelEncoder()
email_table['email_version'] = label_encoder_version.fit_transform(email_table['email_version'])

In [16]:
onehot_encoder_country = OneHotEncoder(sparse_output=False)
country_encoded = onehot_encoder_country.fit_transform(email_table[['user_country']])
country_encoded_df = pd.DataFrame(
    country_encoded,
    columns=onehot_encoder_country.get_feature_names_out(['user_country'])
).astype(int)
email_table = pd.concat([email_table.drop(['user_country'], axis=1), country_encoded_df], axis=1)

In [17]:
onehot_encoder_weekday = OneHotEncoder(sparse_output=False)
weekday_encoded = onehot_encoder_weekday.fit_transform(email_table[['weekday']])
weekday_encoded_df = pd.DataFrame(
    weekday_encoded,
    columns=onehot_encoder_weekday.get_feature_names_out(['weekday'])
).astype(int)
email_table = pd.concat([email_table.drop(['weekday'], axis=1), weekday_encoded_df], axis=1)

In [18]:
email_table['hour_bin'] = pd.cut(
    email_table['hour'],
    bins=[0, 6, 12, 18, 24],
    labels=['Night', 'Morning', 'Afternoon', 'Evening'],
    include_lowest=True
)
plot_bar('hour_bin', 'opened')
plot_bar('hour_bin', 'clicked')

In [19]:
onehot_encoder_hour_bin = OneHotEncoder(sparse_output=False)
hour_bin_encoded = onehot_encoder_hour_bin.fit_transform(email_table[['hour_bin']])
hour_bin_encoded_df = pd.DataFrame(
    hour_bin_encoded,
    columns=onehot_encoder_hour_bin.get_feature_names_out(['hour_bin'])
).astype(int)
email_table = pd.concat([email_table.drop(['hour', 'hour_bin'], axis=1), hour_bin_encoded_df], axis=1)

In [20]:
X = email_table.drop(['email_id', 'opened', 'clicked'], axis=1)
y = email_table['clicked']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [22]:
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    class_weight='balanced',
    random_state=42
)
model.fit(X_train, y_train)

In [24]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f"\nModel Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"AUC-ROC: {auc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Model Performance:
Accuracy: 0.79
AUC-ROC: 0.69

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.80      0.88     19576
           1       0.04      0.40      0.08       424

    accuracy                           0.79     20000
   macro avg       0.51      0.60      0.48     20000
weighted avg       0.96      0.79      0.87     20000



In [25]:
# Feature importance
importances = pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
print("\nFeature Importance:")
print(importances.sort_values(by='importance', ascending=False))



Feature Importance:
                feature  importance
2   user_past_purchases    0.561124
1         email_version    0.065198
0            email_text    0.057236
4       user_country_FR    0.027987
3       user_country_ES    0.026012
16     hour_bin_Morning    0.024984
17       hour_bin_Night    0.023244
6       user_country_US    0.022945
14   hour_bin_Afternoon    0.022798
5       user_country_UK    0.021338
7        weekday_Friday    0.019998
9      weekday_Saturday    0.019948
10       weekday_Sunday    0.019674
12      weekday_Tuesday    0.019618
8        weekday_Monday    0.019450
13    weekday_Wednesday    0.019310
11     weekday_Thursday    0.018325
15     hour_bin_Evening    0.010811


In [26]:
# Simulate targeting top users based on predicted probabilities
test_results = pd.DataFrame({'prob': y_prob, 'clicked': y_test})
test_results = test_results.sort_values(by='prob', ascending=False)
N = len(email_table)  # Total emails sent
top_n = test_results.head(N)
new_ctr = top_n['clicked'].mean() * 100

print(f"\nCTR Estimation:")
print(f"Baseline CTR: {ctr:.2f}%")
print(f"Model CTR (targeting top {N} users): {new_ctr:.2f}%")
print(f"CTR Improvement: {new_ctr - ctr:.2f}%")
print("Testing Method: Conduct A/B testing by sending emails to a model-targeted group (high-probability users) and a random group, then compare CTRs.")


CTR Estimation:
Baseline CTR: 2.12%
Model CTR (targeting top 100000 users): 2.12%
CTR Improvement: 0.00%
Testing Method: Conduct A/B testing by sending emails to a model-targeted group (high-probability users) and a random group, then compare CTRs.


In [None]:
# Requirement 2: Build Machine Learning Model
# Preprocess data
# Label encode email_text and email_version


# One-hot encode user_country


# One-hot encode weekday


# Create hour_bin and one-hot encode




# Prepare features and target


# Split data


# Train Random Forest model with class weighting


# Predict and evaluate


# Requirement 3: Estimate CTR Improvement


# Save final dataset for reference
email_table.to_csv('processed_email_table.csv', index=False)

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from scipy.stats import chi2_contingency
import warnings
warnings.filterwarnings('ignore')

# Function to plot bar charts
def plot_bar(feature, target, data, save=True):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=feature, y=target, data=data)
    plt.title(f'{target.capitalize()} Rate by {feature.capitalize()}')
    plt.xticks(rotation=45)
    if save:
        plt.savefig(f'{target}_by_{feature}.png')
    plt.close()

# Load data with error handling
try:
    email_opened = pd.read_csv("email_opened_table.csv")
    email_table = pd.read_csv("email_table.csv")
    link_clicked = pd.read_csv("link_clicked_table.csv")
except FileNotFoundError as e:
    print(f"Error: {e}. Please ensure CSV files are in the working directory.")
    exit(1)

# Data validation
if not email_opened['email_id'].isin(email_table['email_id']).all():
    print("Warning: Some email_ids in email_opened_table not found in email_table")
if not link_clicked['email_id'].isin(email_table['email_id']).all():
    print("Warning: Some email_ids in link_clicked_table not found in email_table")

# Check for missing values
print("\nMissing values in email_table:\n", email_table.isnull().sum())
print("\nMissing values in email_opened:\n", email_opened.isnull().sum())
print("\nMissing values in link_clicked:\n", link_clicked.isnull().sum())

# Add opened and clicked columns
email_table['opened'] = email_table['email_id'].isin(email_opened['email_id']).astype(int)
email_table['clicked'] = email_table['email_id'].isin(link_clicked['email_id']).astype(int)

# Requirement 1: Calculate Open Rate and Click-Through Rate
total_emails = len(email_table)
emails_opened = email_table['opened'].sum()
emails_clicked = email_table['clicked'].sum()
open_rate = (emails_opened / total_emails) * 100
ctr = (emails_clicked / total_emails) * 100

print(f"\nTotal Emails Sent: {total_emails}")
print(f"Emails Opened: {emails_opened}")
print(f"Emails Clicked: {emails_clicked}")
print(f"Open Rate: {open_rate:.2f}%")
print(f"Click-Through Rate: {ctr:.2f}%")

# Requirement 4: Identify Patterns (Exploratory Data Analysis)
# Plot open and click rates
features_to_plot = ['email_text', 'email_version', 'user_country', 'weekday', 'user_past_purchases']
for feature in features_to_plot:
    plot_bar(feature, 'opened', email_table)
    plot_bar(feature, 'clicked', email_table)

# Ensure hour is integer and plot
email_table['hour'] = email_table['hour'].astype(int)
plot_bar('hour', 'opened', email_table)
plot_bar('hour', 'clicked', email_table)

# Summarize click rates by segment
print("\nClick Rate by Segment:")
for feature in features_to_plot + ['hour']:
    print(f"\n{feature}:")
    print(email_table.groupby(feature)['clicked'].mean() * 100)

# Analyze interactions
pivot = email_table.pivot_table(values='clicked', index='email_version', columns='user_country', aggfunc='mean') * 100
print("\nClick Rate by Email Version and Country:")
print(pivot)

# Statistical test for significance (e.g., email_version vs. clicked)
contingency_table = pd.crosstab(email_table['email_version'], email_table['clicked'])
chi2, p, _, _ = chi2_contingency(contingency_table)
print(f"\nChi-Square Test (email_version vs. clicked): p-value = {p:.4f}")

# Key patterns (specific values from output)
print("\nKey Patterns (Based on Data Analysis):")
print("- Short emails had a 2.39% click rate, compared to 1.85% for long emails.")
print("- Personalized emails had a 2.73% click rate, significantly higher than 1.51% for generic emails (p < 0.0001).")
print("- Users in the US (2.44%) and UK (2.47%) had higher click rates than those in ES (0.83%) and FR (0.80%).")
print("- Users with 5+ past purchases had click rates above 3%, with 15 purchases at 11.70% and 19 at 20.00%.")
print("- Emails sent on Wednesdays (2.76%) and in the morning (9-11 AM: 2.58-2.82%) had the highest click rates.")

# Requirement 2: Build Machine Learning Model
# Preprocess data
label_encoder_text = LabelEncoder()
email_table['email_text'] = label_encoder_text.fit_transform(email_table['email_text'])
label_encoder_version = LabelEncoder()
email_table['email_version'] = label_encoder_version.fit_transform(email_table['email_version'])

# One-hot encode user_country
onehot_encoder_country = OneHotEncoder(sparse_output=False)
country_encoded = onehot_encoder_country.fit_transform(email_table[['user_country']])
country_encoded_df = pd.DataFrame(
    country_encoded,
    columns=onehot_encoder_country.get_feature_names_out(['user_country'])
).astype(int)
email_table = pd.concat([email_table.drop(['user_country'], axis=1), country_encoded_df], axis=1)

# One-hot encode weekday
onehot_encoder_weekday = OneHotEncoder(sparse_output=False)
weekday_encoded = onehot_encoder_weekday.fit_transform(email_table[['weekday']])
weekday_encoded_df = pd.DataFrame(
    weekday_encoded,
    columns=onehot_encoder_weekday.get_feature_names_out(['weekday'])
).astype(int)
email_table = pd.concat([email_table.drop(['weekday'], axis=1), weekday_encoded_df], axis=1)

# Create and encode hour_bin
email_table['hour_bin'] = pd.cut(
    email_table['hour'],
    bins=[0, 6, 12, 18, 24],
    labels=['Night', 'Morning', 'Afternoon', 'Evening'],
    include_lowest=True
)
plot_bar('hour_bin', 'opened', email_table)
plot_bar('hour_bin', 'clicked', email_table)

onehot_encoder_hour_bin = OneHotEncoder(sparse_output=False)
hour_bin_encoded = onehot_encoder_hour_bin.fit_transform(email_table[['hour_bin']])
hour_bin_encoded_df = pd.DataFrame(
    hour_bin_encoded,
    columns=onehot_encoder_hour_bin.get_feature_names_out(['hour_bin'])
).astype(int)
email_table = pd.concat([email_table.drop(['hour', 'hour_bin'], axis=1), hour_bin_encoded_df], axis=1)

# Feature engineering: Add interaction terms
email_table['version_country_US'] = email_table['email_version'] * email_table['user_country_US']
email_table['version_country_UK'] = email_table['email_version'] * email_table['user_country_UK']

# Scale user_past_purchases
scaler = StandardScaler()
email_table['user_past_purchases_scaled'] = scaler.fit_transform(email_table[['user_past_purchases']])

# Prepare features and target
X = email_table.drop(['email_id', 'opened', 'clicked', 'user_past_purchases'], axis=1)  # Use scaled purchases
y = email_table['clicked']

# Check class imbalance
print("\nClass Imbalance in Target (clicked):")
class_counts = pd.Series(y).value_counts()
class_percentages = pd.Series(y).value_counts(normalize=True) * 100
print(pd.DataFrame({
    'Class': class_counts.index,
    'Count': class_counts.values,
    'Percentage': class_percentages.values
}))

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTEENN to balance training data
smoteenn = SMOTEENN(random_state=42)
X_train_balanced, y_train_balanced = smoteenn.fit_resample(X_train, y_train)

# Train XGBoost model
model = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=10,  # Reduced to balance precision and recall
    eval_metric='aucpr',
    random_state=42
)
model.fit(X_train_balanced, y_train_balanced)

# Predict and evaluate
y_prob = model.predict_proba(X_test)[:, 1]

# Find F1-optimal threshold
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
print(f"\nF1-Optimal Threshold: {optimal_threshold:.3f}")

# Evaluate with optimal threshold
y_pred = (y_prob >= optimal_threshold).astype(int)
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f"\nModel Performance (F1-Optimal Threshold):")
print(f"Accuracy: {accuracy:.2f}")
print(f"AUC-ROC: {auc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Precision-Recall Curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.')
plt.axvline(x=recall[optimal_idx], color='r', linestyle='--', label=f'F1-Optimal Threshold ({optimal_threshold:.3f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.savefig('precision_recall_curve.png')
plt.close()

# Feature importance
importances = pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
print("\nFeature Importance:")
print(importances.sort_values(by='importance', ascending=False))

# Requirement 3: Estimate CTR Improvement
# Simulate targeting top 20% of users with F1-optimal threshold
N = int(0.2 * len(email_table))  # 20,000 emails
test_results = pd.DataFrame({'prob': y_prob, 'clicked': y_test, 'index': X_test.index})
high_prob_users = test_results[test_results['prob'] >= optimal_threshold]
top_n = high_prob_users.head(N)  # Limit to N users
new_ctr_optimal = top_n['clicked'].mean() * 100 if not top_n.empty else 0.0

# Also test 80th percentile threshold for comparison
threshold_80 = np.percentile(test_results['prob'], 80)
high_prob_users_80 = test_results[test_results['prob'] >= threshold_80]
top_n_80 = high_prob_users_80.head(N)
new_ctr_80 = top_n_80['clicked'].mean() * 100 if not top_n_80.empty else 0.0

print(f"\nCTR Estimation:")
print(f"Baseline CTR: {ctr:.2f}%")
print(f"Model CTR (F1-Optimal Threshold {optimal_threshold:.3f}, top {N} users): {new_ctr_optimal:.2f}%")
print(f"CTR Improvement (F1-Optimal): {new_ctr_optimal - ctr:.2f}%")
print(f"Model CTR (80th Percentile Threshold {threshold_80:.3f}, top {N} users): {new_ctr_80:.2f}%")
print(f"CTR Improvement (80th Percentile): {new_ctr_80 - ctr:.2f}%")
print("Testing Method: Conduct A/B testing by sending emails to a model-targeted group (high-probability users) and a random group, then compare CTRs.")

# Save predictions and results
test_results.to_csv('model_predictions.csv', index=False)
email_table.to_csv('processed_email_table.csv', index=False)


Missing values in email_table:
 email_id               0
email_text             0
email_version          0
hour                   0
weekday                0
user_country           0
user_past_purchases    0
dtype: int64

Missing values in email_opened:
 email_id    0
dtype: int64

Missing values in link_clicked:
 email_id    0
dtype: int64

Total Emails Sent: 100000
Emails Opened: 10345
Emails Clicked: 2119
Open Rate: 10.35%
Click-Through Rate: 2.12%

Click Rate by Segment:

email_text:
email_text
long_email     1.853767
short_email    2.387177
Name: clicked, dtype: float64

email_version:
email_version
generic         1.513673
personalized    2.729409
Name: clicked, dtype: float64

user_country:
user_country
ES    0.832748
FR    0.800400
UK    2.467526
US    2.435981
Name: clicked, dtype: float64

weekday:
weekday
Friday       1.403682
Monday       2.290608
Saturday     1.784611
Sunday       1.675123
Thursday     2.444491
Tuesday      2.488864
Wednesday    2.761999
Name: clicked, dty

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, precision_recall_curve, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from scipy.stats import chi2_contingency
import warnings
warnings.filterwarnings('ignore')

# Function to plot bar charts
def plot_bar(feature, target, data, save=True):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=feature, y=target, data=data)
    plt.title(f'{target.capitalize()} Rate by {feature.capitalize()}')
    plt.xticks(rotation=45)
    if save:
        plt.savefig(f'{target}_by_{feature}.png')
    plt.close()

# Load data with error handling
try:
    email_opened = pd.read_csv("email_opened_table.csv")
    email_table = pd.read_csv("email_table.csv")
    link_clicked = pd.read_csv("link_clicked_table.csv")
except FileNotFoundError as e:
    print(f"Error: {e}. Please ensure CSV files are in the working directory.")
    exit(1)

# Data validation
if not email_opened['email_id'].isin(email_table['email_id']).all():
    print("Warning: Some email_ids in email_opened_table not found in email_table")
if not link_clicked['email_id'].isin(email_table['email_id']).all():
    print("Warning: Some email_ids in link_clicked_table not found in email_table")

# Check for missing values
print("\nMissing values in email_table:\n", email_table.isnull().sum())
print("\nMissing values in email_opened:\n", email_opened.isnull().sum())
print("\nMissing values in link_clicked:\n", link_clicked.isnull().sum())

# Add opened and clicked columns
email_table['opened'] = email_table['email_id'].isin(email_opened['email_id']).astype(int)
email_table['clicked'] = email_table['email_id'].isin(link_clicked['email_id']).astype(int)

# Requirement 1: Calculate Open Rate and Click-Through Rate
total_emails = len(email_table)
emails_opened = email_table['opened'].sum()
emails_clicked = email_table['clicked'].sum()
open_rate = (emails_opened / total_emails) * 100
ctr = (emails_clicked / total_emails) * 100

print(f"\nTotal Emails Sent: {total_emails}")
print(f"Emails Opened: {emails_opened}")
print(f"Emails Clicked: {emails_clicked}")
print(f"Open Rate: {open_rate:.2f}%")
print(f"Click-Through Rate: {ctr:.2f}%")

# Requirement 4: Identify Patterns (Exploratory Data Analysis)
# Plot open and click rates
features_to_plot = ['email_text', 'email_version', 'user_country', 'weekday', 'user_past_purchases']
for feature in features_to_plot:
    plot_bar(feature, 'opened', email_table)
    plot_bar(feature, 'clicked', email_table)

# Ensure hour is integer and plot
email_table['hour'] = email_table['hour'].astype(int)
plot_bar('hour', 'opened', email_table)
plot_bar('hour', 'clicked', email_table)

# Summarize click rates by segment
print("\nClick Rate by Segment:")
for feature in features_to_plot + ['hour']:
    print(f"\n{feature}:")
    print(email_table.groupby(feature)['clicked'].mean() * 100)

# Analyze interactions
pivot = email_table.pivot_table(values='clicked', index='email_version', columns='user_country', aggfunc='mean') * 100
print("\nClick Rate by Email Version and Country:")
print(pivot)

# Statistical test for significance (e.g., email_version vs. clicked)
contingency_table = pd.crosstab(email_table['email_version'], email_table['clicked'])
chi2, p, _, _ = chi2_contingency(contingency_table)
print(f"\nChi-Square Test (email_version vs. clicked): p-value = {p:.4f}")

# Key patterns (specific values from output)
print("\nKey Patterns (Based on Data Analysis):")
print("- Short emails had a 2.39% click rate, compared to 1.85% for long emails.")
print("- Personalized emails had a 2.73% click rate, significantly higher than 1.51% for generic emails (p < 0.0001).")
print("- Users in the US (2.44%) and UK (2.47%) had higher click rates than those in ES (0.83%) and FR (0.80%).")
print("- Users with 5+ past purchases had click rates above 3%, with 15 purchases at 11.70% and 19 at 20.00%.")
print("- Emails sent on Wednesdays (2.76%) and in the morning (9-11 AM: 2.58-2.82%) had the highest click rates.")

# Requirement 2: Build Machine Learning Model
# Preprocess data
label_encoder_text = LabelEncoder()
email_table['email_text'] = label_encoder_text.fit_transform(email_table['email_text'])
label_encoder_version = LabelEncoder()
email_table['email_version'] = label_encoder_version.fit_transform(email_table['email_version'])

# One-hot encode user_country
onehot_encoder_country = OneHotEncoder(sparse_output=False)
country_encoded = onehot_encoder_country.fit_transform(email_table[['user_country']])
country_encoded_df = pd.DataFrame(
    country_encoded,
    columns=onehot_encoder_country.get_feature_names_out(['user_country'])
).astype(int)
email_table = pd.concat([email_table.drop(['user_country'], axis=1), country_encoded_df], axis=1)

# One-hot encode weekday
onehot_encoder_weekday = OneHotEncoder(sparse_output=False)
weekday_encoded = onehot_encoder_weekday.fit_transform(email_table[['weekday']])
weekday_encoded_df = pd.DataFrame(
    weekday_encoded,
    columns=onehot_encoder_weekday.get_feature_names_out(['weekday'])
).astype(int)
email_table = pd.concat([email_table.drop(['weekday'], axis=1), weekday_encoded_df], axis=1)

# Create and encode hour_bin
email_table['hour_bin'] = pd.cut(
    email_table['hour'],
    bins=[0, 6, 12, 18, 24],
    labels=['Night', 'Morning', 'Afternoon', 'Evening'],
    include_lowest=True
)
plot_bar('hour_bin', 'opened', email_table)
plot_bar('hour_bin', 'clicked', email_table)

onehot_encoder_hour_bin = OneHotEncoder(sparse_output=False)
hour_bin_encoded = onehot_encoder_hour_bin.fit_transform(email_table[['hour_bin']])
hour_bin_encoded_df = pd.DataFrame(
    hour_bin_encoded,
    columns=onehot_encoder_hour_bin.get_feature_names_out(['hour_bin'])
).astype(int)
email_table = pd.concat([email_table.drop(['hour', 'hour_bin'], axis=1), hour_bin_encoded_df], axis=1)

# Feature engineering: Add interaction terms
email_table['version_country_US'] = email_table['email_version'] * email_table['user_country_US']
email_table['version_country_UK'] = email_table['email_version'] * email_table['user_country_UK']
email_table['version_purchases'] = email_table['email_version'] * email_table['user_past_purchases']

# Scale user_past_purchases
scaler = StandardScaler()
email_table['user_past_purchases_scaled'] = scaler.fit_transform(email_table[['user_past_purchases']])

# Prepare features and target
X = email_table.drop(['email_id', 'opened', 'clicked', 'user_past_purchases'], axis=1)  # Use scaled purchases
y = email_table['clicked']

# Check class imbalance
print("\nClass Imbalance in Target (clicked):")
class_counts = pd.Series(y).value_counts()
class_percentages = pd.Series(y).value_counts(normalize=True) * 100
print(pd.DataFrame({
    'Class': class_counts.index,
    'Count': class_counts.values,
    'Percentage': class_percentages.values
}))

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define balancing pipeline (Random Undersampling + SMOTE)
resampler = Pipeline([
    ('undersample', RandomUnderSampler(sampling_strategy=0.5, random_state=42)),  # Reduce non-clicks
    ('smote', SMOTE(sampling_strategy=1.0, random_state=42))  # Balance clicks
])
X_train_balanced, y_train_balanced = resampler.fit_resample(X_train, y_train)

# Train XGBoost model
xgboost_model = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=3,  # Further reduced
    eval_metric='aucpr',
    random_state=42
)
xgboost_model.fit(X_train_balanced, y_train_balanced)

# Train Balanced Random Forest as alternative
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    class_weight='balanced',
    random_state=42
)
rf_model.fit(X_train_balanced, y_train_balanced)

# Predict and evaluate (XGBoost)
y_prob_xgb = xgboost_model.predict_proba(X_test)[:, 1]
precision_xgb, recall_xgb, thresholds_xgb = precision_recall_curve(y_test, y_prob_xgb)
f1_scores_xgb = 2 * (precision_xgb * recall_xgb) / (precision_xgb + recall_xgb + 1e-10)
optimal_idx_xgb = np.argmax(f1_scores_xgb)
optimal_threshold_xgb = thresholds_xgb[optimal_idx_xgb]

# Predict and evaluate (Random Forest)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(y_test, y_prob_rf)
f1_scores_rf = 2 * (precision_rf * recall_rf) / (precision_rf + recall_rf + 1e-10)
optimal_idx_rf = np.argmax(f1_scores_rf)
optimal_threshold_rf = thresholds_rf[optimal_idx_rf]

# Select best model based on F1-score
y_pred_xgb = (y_prob_xgb >= optimal_threshold_xgb).astype(int)
f1_xgb = f1_score(y_test, y_pred_xgb)
y_pred_rf = (y_prob_rf >= optimal_threshold_rf).astype(int)
f1_rf = f1_score(y_test, y_pred_rf)

best_model = xgboost_model if f1_xgb >= f1_rf else rf_model
best_threshold = optimal_threshold_xgb if f1_xgb >= f1_rf else optimal_threshold_rf
best_y_prob = y_prob_xgb if f1_xgb >= f1_rf else y_prob_rf
best_model_name = "XGBoost" if f1_xgb >= f1_rf else "Balanced Random Forest"
print(f"\nBest Model: {best_model_name}")
print(f"F1-Optimal Threshold: {best_threshold:.3f}")

# Evaluate best model
y_pred = (best_y_prob >= best_threshold).astype(int)
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, best_y_prob)
cv_auc = cross_val_score(best_model, X, y, cv=5, scoring='roc_auc').mean()

print(f"\nModel Performance (F1-Optimal Threshold):")
print(f"Accuracy: {accuracy:.2f}")
print(f"AUC-ROC: {auc:.2f}")
print(f"Cross-Validated AUC-ROC: {cv_auc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Precision-Recall Curve
plt.figure(figsize=(8, 6))
plt.plot(recall_xgb, precision_xgb, label='XGBoost', marker='.')
plt.plot(recall_rf, precision_rf, label='Random Forest', marker='.')
plt.axvline(x=recall_xgb[optimal_idx_xgb] if f1_xgb >= f1_rf else recall_rf[optimal_idx_rf], 
            color='r', linestyle='--', label=f'F1-Optimal Threshold ({best_threshold:.3f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.savefig('precision_recall_curve.png')
plt.close()

# Feature importance
importances = pd.DataFrame({'feature': X.columns, 'importance': best_model.feature_importances_})
print("\nFeature Importance:")
print(importances.sort_values(by='importance', ascending=False))

# Requirement 3: Estimate CTR Improvement
N = int(0.2 * len(email_table))  # 20,000 emails
test_results = pd.DataFrame({'prob': best_y_prob, 'clicked': y_test, 'index': X_test.index})
high_prob_users = test_results[test_results['prob'] >= best_threshold]
top_n = high_prob_users.head(N) if len(high_prob_users) >= N else high_prob_users  # Ensure enough users
new_ctr_optimal = top_n['clicked'].mean() * 100 if not top_n.empty else 0.0

threshold_80 = np.percentile(test_results['prob'], 80)
high_prob_users_80 = test_results[test_results['prob'] >= threshold_80]
top_n_80 = high_prob_users_80.head(N) if len(high_prob_users_80) >= N else high_prob_users_80
new_ctr_80 = top_n_80['clicked'].mean() * 100 if not top_n_80.empty else 0.0

print(f"\nCTR Estimation:")
print(f"Baseline CTR: {ctr:.2f}%")
print(f"Model CTR (F1-Optimal Threshold {best_threshold:.3f}, top {len(top_n)} users): {new_ctr_optimal:.2f}%")
print(f"CTR Improvement (F1-Optimal): {new_ctr_optimal - ctr:.2f}%")
print(f"Model CTR (80th Percentile Threshold {threshold_80:.3f}, top {len(top_n_80)} users): {new_ctr_80:.2f}%")
print(f"CTR Improvement (80th Percentile): {new_ctr_80 - ctr:.2f}%")
print("Testing Method: Conduct A/B testing by sending emails to a model-targeted group (high-probability users) and a random group, then compare CTRs.")

# Save predictions and results
test_results.to_csv('model_predictions.csv', index=False)
email_table.to_csv('processed_email_table.csv', index=False)


Missing values in email_table:
 email_id               0
email_text             0
email_version          0
hour                   0
weekday                0
user_country           0
user_past_purchases    0
dtype: int64

Missing values in email_opened:
 email_id    0
dtype: int64

Missing values in link_clicked:
 email_id    0
dtype: int64

Total Emails Sent: 100000
Emails Opened: 10345
Emails Clicked: 2119
Open Rate: 10.35%
Click-Through Rate: 2.12%

Click Rate by Segment:

email_text:
email_text
long_email     1.853767
short_email    2.387177
Name: clicked, dtype: float64

email_version:
email_version
generic         1.513673
personalized    2.729409
Name: clicked, dtype: float64

user_country:
user_country
ES    0.832748
FR    0.800400
UK    2.467526
US    2.435981
Name: clicked, dtype: float64

weekday:
weekday
Friday       1.403682
Monday       2.290608
Saturday     1.784611
Sunday       1.675123
Thursday     2.444491
Tuesday      2.488864
Wednesday    2.761999
Name: clicked, dty

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, precision_recall_curve, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from scipy.stats import chi2_contingency
from sklearn.feature_selection import SelectKBest, f_classif
import uuid
import warnings
warnings.filterwarnings('ignore')

# Function to plot bar charts
def plot_bar(feature, target, data, save=True):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=feature, y=target, data=data)
    plt.title(f'{target.capitalize()} Rate by {feature.capitalize()}')
    plt.xticks(rotation=45)
    if save:
        plt.savefig(f'{target}_by_{feature}.png')
    plt.close()

# Load data with error handling
try:
    email_opened = pd.read_csv("email_opened_table.csv")
    email_table = pd.read_csv("email_table.csv")
    link_clicked = pd.read_csv("link_clicked_table.csv")
except FileNotFoundError as e:
    print(f"Error: {e}. Please ensure CSV files are in the working directory.")
    exit(1)

# Data validation
if not email_opened['email_id'].isin(email_table['email_id']).all():
    print("Warning: Some email_ids in email_opened_table not found in email_table")
if not link_clicked['email_id'].isin(email_table['email_id']).all():
    print("Warning: Some email_ids in link_clicked_table not found in email_table")

# Check for missing values
print("\nMissing values in email_table:\n", email_table.isnull().sum())
print("\nMissing values in email_opened:\n", email_opened.isnull().sum())
print("\nMissing values in link_clicked:\n", link_clicked.isnull().sum())

# Add opened and clicked columns
email_table['opened'] = email_table['email_id'].isin(email_opened['email_id']).astype(int)
email_table['clicked'] = email_table['email_id'].isin(link_clicked['email_id']).astype(int)

# Requirement 1: Calculate Open Rate and Click-Through Rate
total_emails = len(email_table)
emails_opened = email_table['opened'].sum()
emails_clicked = email_table['clicked'].sum()
open_rate = (emails_opened / total_emails) * 100
ctr = (emails_clicked / total_emails) * 100

print(f"\nTotal Emails Sent: {total_emails}")
print(f"Emails Opened: {emails_opened}")
print(f"Emails Clicked: {emails_clicked}")
print(f"Open Rate: {open_rate:.2f}%")
print(f"Click-Through Rate: {ctr:.2f}%")

# Requirement 4: Identify Patterns (Exploratory Data Analysis)
# Plot open and click rates
features_to_plot = ['email_text', 'email_version', 'user_country', 'weekday', 'user_past_purchases']
for feature in features_to_plot:
    plot_bar(feature, 'opened', email_table)
    plot_bar(feature, 'clicked', email_table)

# Ensure hour is integer and plot
email_table['hour'] = email_table['hour'].astype(int)
plot_bar('hour', 'opened', email_table)
plot_bar('hour', 'clicked', email_table)

# Summarize click rates by segment
print("\nClick Rate by Segment:")
for feature in features_to_plot + ['hour']:
    print(f"\n{feature}:")
    print(email_table.groupby(feature)['clicked'].mean() * 100)

# Analyze interactions
pivot = email_table.pivot_table(values='clicked', index='email_version', columns='user_country', aggfunc='mean') * 100
print("\nClick Rate by Email Version and Country:")
print(pivot)

# Statistical test for significance (e.g., email_version vs. clicked)
contingency_table = pd.crosstab(email_table['email_version'], email_table['clicked'])
chi2, p, _, _ = chi2_contingency(contingency_table)
print(f"\nChi-Square Test (email_version vs. clicked): p-value = {p:.4f}")

# Key patterns (specific values from output)
print("\nKey Patterns (Based on Data Analysis):")
print("- Short emails had a 2.39% click rate, compared to 1.85% for long emails.")
print("- Personalized emails had a 2.73% click rate, significantly higher than 1.51% for generic emails (p < 0.0001).")
print("- Users in the US (2.44%) and UK (2.47%) had higher click rates than those in ES (0.83%) and FR (0.80%).")
print("- Users with 5+ past purchases had click rates above 3%, with 15 purchases at 11.70% and 19 at 20.00%.")
print("- Emails sent on Wednesdays (2.76%) and in the morning (9-11 AM: 2.58-2.82%) had the highest click rates.")

# Requirement 2: Build Machine Learning Model
# Preprocess data
label_encoder_text = LabelEncoder()
email_table['email_text'] = label_encoder_text.fit_transform(email_table['email_text'])
label_encoder_version = LabelEncoder()
email_table['email_version'] = label_encoder_version.fit_transform(email_table['email_version'])

# One-hot encode user_country
onehot_encoder_country = OneHotEncoder(sparse_output=False)
country_encoded = onehot_encoder_country.fit_transform(email_table[['user_country']])
country_encoded_df = pd.DataFrame(
    country_encoded,
    columns=onehot_encoder_country.get_feature_names_out(['user_country'])
).astype(int)
email_table = pd.concat([email_table.drop(['user_country'], axis=1), country_encoded_df], axis=1)

# One-hot encode weekday
onehot_encoder_weekday = OneHotEncoder(sparse_output=False)
weekday_encoded = onehot_encoder_weekday.fit_transform(email_table[['weekday']])
weekday_encoded_df = pd.DataFrame(
    weekday_encoded,
    columns=onehot_encoder_weekday.get_feature_names_out(['weekday'])
).astype(int)
email_table = pd.concat([email_table.drop(['weekday'], axis=1), weekday_encoded_df], axis=1)

# Create and encode hour_bin
email_table['hour_bin'] = pd.cut(
    email_table['hour'],
    bins=[0, 6, 12, 18, 24],
    labels=['Night', 'Morning', 'Afternoon', 'Evening'],
    include_lowest=True
)
plot_bar('hour_bin', 'opened', email_table)
plot_bar('hour_bin', 'clicked', email_table)

onehot_encoder_hour_bin = OneHotEncoder(sparse_output=False)
hour_bin_encoded = onehot_encoder_hour_bin.fit_transform(email_table[['hour_bin']])
hour_bin_encoded_df = pd.DataFrame(
    hour_bin_encoded,
    columns=onehot_encoder_hour_bin.get_feature_names_out(['hour_bin'])
).astype(int)
email_table = pd.concat([email_table.drop(['hour', 'hour_bin'], axis=1), hour_bin_encoded_df], axis=1)

# Feature engineering: Add interaction terms
email_table['version_country_US'] = email_table['email_version'] * email_table['user_country_US']
email_table['version_country_UK'] = email_table['email_version'] * email_table['user_country_UK']
email_table['version_purchases'] = email_table['email_version'] * email_table['user_past_purchases']

# Scale user_past_purchases
scaler = StandardScaler()
email_table['user_past_purchases_scaled'] = scaler.fit_transform(email_table[['user_past_purchases']])

# Prepare features and target
X = email_table.drop(['email_id', 'opened', 'clicked', 'user_past_purchases'], axis=1)
y = email_table['clicked']

# Feature selection to reduce overreliance on purchase features
selector = SelectKBest(score_func=f_classif, k=15)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()].tolist()
print("\nSelected Features:", selected_features)
X = pd.DataFrame(X_selected, columns=selected_features, index=X.index)

# Check class imbalance
print("\nClass Imbalance in Target (clicked):")
class_counts = pd.Series(y).value_counts()
class_percentages = pd.Series(y).value_counts(normalize=True) * 100
print(pd.DataFrame({
    'Class': class_counts.index,
    'Count': class_counts.values,
    'Percentage': class_percentages.values
}))

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define balancing pipeline (Random Undersampling + SMOTE)
resampler = Pipeline([
    ('undersample', RandomUnderSampler(sampling_strategy=4.0, random_state=42)),  # 4:1 ratio
    ('smote', SMOTE(sampling_strategy=1.0, random_state=42))  # 1:1 final ratio
])
X_train_balanced, y_train_balanced = resampler.fit_resample(X_train, y_train)

# Train Balanced Random Forest
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    min_samples_split=20,  # Increase to reduce overfitting
    class_weight='balanced',
    random_state=42
)
model.fit(X_train_balanced, y_train_balanced)

# Predict and evaluate
y_prob = model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
optimal_idx = np.argmax(f1_scores)
f1_threshold = thresholds[optimal_idx]

# Find CTR-optimized threshold (maximize CTR with at least 10,000 users)
test_results = pd.DataFrame({'prob': y_prob, 'clicked': y_test, 'index': X_test.index})
N = int(0.2 * len(email_table))  # 20,000 users
min_users = 10000
ctr_scores = []
for t in thresholds:
    high_prob_users = test_results[test_results['prob'] >= t]
    if len(high_prob_users) >= min_users:
        top_n = high_prob_users.head(N) if len(high_prob_users) >= N else high_prob_users
        ctr_score = top_n['clicked'].mean() * 100 if not top_n.empty else 0.0
        ctr_scores.append((t, ctr_score, len(top_n)))
ctr_scores = sorted(ctr_scores, key=lambda x: x[1], reverse=True)
ctr_threshold = ctr_scores[0][0] if ctr_scores else f1_threshold
ctr_users = ctr_scores[0][2] if ctr_scores else len(test_results[test_results['prob'] >= f1_threshold])
new_ctr_optimal = ctr_scores[0][1] if ctr_scores else test_results[test_results['prob'] >= f1_threshold]['clicked'].mean() * 100

print(f"\nF1-Optimal Threshold: {f1_threshold:.3f}")
print(f"CTR-Optimized Threshold: {ctr_threshold:.3f}")

# Evaluate with F1-optimal threshold
y_pred = (y_prob >= f1_threshold).astype(int)
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
cv_auc = cross_val_score(model, X, y, cv=5, scoring='roc_auc').mean()

print(f"\nModel Performance (F1-Optimal Threshold):")
print(f"Accuracy: {accuracy:.2f}")
print(f"AUC-ROC: {auc:.2f}")
print(f"Cross-Validated AUC-ROC: {cv_auc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Precision-Recall Curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.')
plt.axvline(x=recall[optimal_idx], color='r', linestyle='--', label=f'F1-Optimal Threshold ({f1_threshold:.3f})')
plt.axvline(x=ctr_threshold, color='b', linestyle='--', label=f'CTR-Optimized Threshold ({ctr_threshold:.3f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.savefig('precision_recall_curve.png')
plt.close()

# Feature importance
importances = pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
print("\nFeature Importance:")
print(importances.sort_values(by='importance', ascending=False))

# CTR Estimation
high_prob_users_f1 = test_results[test_results['prob'] >= f1_threshold]
top_n_f1 = high_prob_users_f1.head(N) if len(high_prob_users_f1) >= N else high_prob_users_f1
new_ctr_f1 = top_n_f1['clicked'].mean() * 100 if not top_n_f1.empty else 0.0

threshold_80 = np.percentile(test_results['prob'], 80)
high_prob_users_80 = test_results[test_results['prob'] >= threshold_80]
top_n_80 = high_prob_users_80.head(N) if len(high_prob_users_80) >= N else high_prob_users_80
new_ctr_80 = top_n_80['clicked'].mean() * 100 if not top_n_80.empty else 0.0

print(f"\nCTR Estimation:")
print(f"Baseline CTR: {ctr:.2f}%")
print(f"Model CTR (F1-Optimal Threshold {f1_threshold:.3f}, top {len(top_n_f1)} users): {new_ctr_f1:.2f}%")
print(f"CTR Improvement (F1-Optimal): {new_ctr_f1 - ctr:.2f}%")
print(f"Model CTR (CTR-Optimal Threshold {ctr_threshold:.3f}, top {ctr_users} users): {new_ctr_optimal:.2f}%")
print(f"CTR Improvement (CTR-Optimal): {new_ctr_optimal - ctr:.2f}%")
print(f"Model CTR (80th Percentile Threshold {threshold_80:.3f}, top {len(top_n_80)} users): {new_ctr_80:.2f}%")
print(f"CTR Improvement (80th Percentile): {new_ctr_80 - ctr:.2f}%")
print("Testing Method: Conduct A/B testing by sending emails to a model-targeted group (high-probability users) and a random group, then compare CTRs.")

# Save predictions and results
test_results.to_csv('model_predictions.csv', index=False)
email_table.to_csv('processed_email_table.csv', index=False)


Missing values in email_table:
 email_id               0
email_text             0
email_version          0
hour                   0
weekday                0
user_country           0
user_past_purchases    0
dtype: int64

Missing values in email_opened:
 email_id    0
dtype: int64

Missing values in link_clicked:
 email_id    0
dtype: int64

Total Emails Sent: 100000
Emails Opened: 10345
Emails Clicked: 2119
Open Rate: 10.35%
Click-Through Rate: 2.12%

Click Rate by Segment:

email_text:
email_text
long_email     1.853767
short_email    2.387177
Name: clicked, dtype: float64

email_version:
email_version
generic         1.513673
personalized    2.729409
Name: clicked, dtype: float64

user_country:
user_country
ES    0.832748
FR    0.800400
UK    2.467526
US    2.435981
Name: clicked, dtype: float64

weekday:
weekday
Friday       1.403682
Monday       2.290608
Saturday     1.784611
Sunday       1.675123
Thursday     2.444491
Tuesday      2.488864
Wednesday    2.761999
Name: clicked, dty

InvalidParameterError: The 'sampling_strategy' parameter of RandomUnderSampler must be a float in the range (0.0, 1.0], a str among {'not minority', 'not majority', 'auto', 'all', 'majority'}, an instance of 'collections.abc.Mapping' or a callable. Got 4.0 instead.