In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from matplotlib.ticker import PercentFormatter
import seaborn as sns 

In [None]:
# Load the three campaign tables (sent emails, open events, click events)
# from CSV files in the working directory.
email, opened, link = (
    pd.read_csv(csv_path)
    for csv_path in (
        "email_table.csv",
        "email_opened_table.csv",
        "link_clicked_table.csv",
    )
)

In [None]:
# Peek at a few random rows; seeded so the preview is stable across re-runs.
email.sample(3, random_state=42)

In [None]:
# Column dtypes and non-null counts of the raw email table.
email.info()

In [None]:
# Peek at a few random rows; seeded so the preview is stable across re-runs.
opened.sample(2, random_state=42)

In [None]:
# Column dtypes and non-null counts of the open-events table.
opened.info()

In [None]:
# Peek at a few random rows; seeded so the preview is stable across re-runs.
link.sample(2, random_state=42)

In [None]:
# Column dtypes and non-null counts of the click-events table.
link.info()

In [None]:
# Funnel columns on the email table:
#   size          - constant 1 per row, so sums give email counts
#   email_opened  - 1 if the email_id appears in the open-events table, else 0
#   clicked_link  - 1 if the email_id appears in the click-events table, else 0
email['size'] = 1
email['email_opened'] = email['email_id'].isin(opened["email_id"]).astype(int)
email['clicked_link'] = email['email_id'].isin(link["email_id"]).astype(int)

In [None]:
# Spot-check the new flag columns on a few random rows (seeded for stable output).
email.sample(3, random_state=42)

In [None]:
# Re-check dtypes and non-null counts now that the flag columns were added.
email.info()

In [None]:
# Summary statistics of the numeric columns; the mean of each 0/1 flag is its overall rate.
email.describe()

In [None]:
# One line per column: name, number of distinct values, first five distinct values.
for col, column_values in email.items():
    uniques = column_values.unique()
    print(f"{col:<20}{len(uniques):<10}{uniques[:5]}")

In [None]:
# Work on a copy so later changes to `df` do not modify `email`.
df = email.copy()

In [None]:
# Share of emails sent with each email_text variant.
df['email_text'].value_counts(normalize=True)

In [None]:
# List the columns available for the analysis below.
df.columns

In [None]:
# Overall open rate and click rate across all sent emails; drawn later as
# dashed reference lines on the per-segment charts.
grp_open_rate, grp_link_rate = (
    df[flag_col].mean() for flag_col in ('email_opened', 'clicked_link')
)

In [None]:
# Three side-by-side panels for email_text: volume, open rate, click rate.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))

# Panel 0: number of emails sent per text variant.
sns.countplot(x='email_text', data=df, ax=ax[0])
ax[0].set(
    xlabel='Email Text',
    ylabel='Count',
    title='Count Plot of Email Text',
)

# Panel 1: mean of the 0/1 open flag per variant, i.e. the open rate.
sns.barplot(x='email_text', y='email_opened', data=df, ax=ax[1])
ax[1].set(
    xlabel='Email Text',
    ylabel='Email Open Rate',
    title='Email Open Rate vs. Email Text',
)

# Panel 2: mean of the 0/1 click flag per variant, i.e. the click-through rate.
sns.barplot(x='email_text', y='clicked_link', data=df, ax=ax[2])
ax[2].set(
    xlabel='Email Text',
    ylabel='Click Through Rate',
    title='Click Through Rate vs. Email Text',
)

plt.tight_layout()
plt.show()

- Short emails tend to have a higher open rate and a significantly higher click-through rate compared to long emails.
- Possible Reason: Users might prefer concise content that can be quickly consumed. Long emails may be perceived as overwhelming or time-consuming to read, leading to lower engagement.

In [None]:
# Funnel by email text: share of sent emails that reach each step.
# 'size' is 1 on every row, so its mean is always 100% (the "sent" baseline).
funnel_steps = ["size", "email_opened", "clicked_link"]
data = (
    df.groupby(["email_text"])[funnel_steps]
    .mean()
    .reset_index()
    .melt(
        id_vars="email_text",
        value_vars=funnel_steps,
        var_name="funnel_step",
        value_name="conversion_rate",
    )
)

fig, ax = plt.subplots(figsize=(20, 4))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.barplot(data=data, x="funnel_step", y="conversion_rate", hue="email_text", ax=ax)
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))

# Annotate through the bar containers rather than ax.patches: depending on the
# seaborn version, ax.patches may also contain zero-size rectangles created for
# the legend, which would produce stray "0.00%" labels at the origin.
for container in ax.containers:
    ax.bar_label(
        container,
        labels=[f"{rate * 100:.2f}%" for rate in container.datavalues],
        padding=3,
    )

ax.set_title("Conversion Rates", fontsize=16)
ax.set_ylabel("Conversion Rate (%)", fontsize=14)
ax.set_xlabel("")
ax.set_ylim(0, 1.1)  # headroom above the 100% bars for their labels
plt.tight_layout()
plt.show()


In [None]:
# Volume, open rate and click rate per sending hour, split by email text.
col_to_plot = 'hour'
data = (
    df.groupby([col_to_plot, 'email_text'])
    .agg({'size': 'sum', 'email_opened': 'mean', 'clicked_link': 'mean'})
    .reset_index()
)

# (metric column, y-axis label, overall rate to mark; None for the count panel)
panel_specs = [
    ('size', "Size", None),
    ('email_opened', "Email Open Rate", grp_open_rate),
    ('clicked_link', "Link Click Rate", grp_link_rate),
]

fig, ax = plt.subplots(3, 1, figsize=(20, 12))
for panel, (metric, y_label, overall_rate) in zip(ax, panel_specs):
    sns.barplot(data=data, x=col_to_plot, y=metric, hue="email_text", ax=panel)
    if overall_rate is not None:
        # Dashed red line marks the overall rate across all emails.
        panel.axhline(y=overall_rate, color='r', linestyle='--')
        panel.yaxis.set_major_formatter(PercentFormatter(xmax=1))
    # Legend sits just outside the right edge so it never covers the bars.
    panel.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
    panel.set_ylabel(y_label)

plt.tight_layout()
plt.show()

In [None]:
# Volume, open rate and click rate per email version, split by email text.
col_to_plot = 'email_version'
data = (
    df.groupby([col_to_plot, 'email_text'])
    .agg({'size': 'sum', 'email_opened': 'mean', 'clicked_link': 'mean'})
    .reset_index()
)

# (metric column, y-axis label, overall rate to mark; None for the count panel)
panel_specs = [
    ('size', "Size", None),
    ('email_opened', "Email Open Rate", grp_open_rate),
    ('clicked_link', "Link Click Rate", grp_link_rate),
]

fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for panel, (metric, y_label, overall_rate) in zip(ax, panel_specs):
    sns.barplot(data=data, x=col_to_plot, y=metric, hue="email_text", ax=panel)
    if overall_rate is not None:
        # Dashed red line marks the overall rate across all emails.
        panel.axhline(y=overall_rate, color='r', linestyle='--')
        panel.yaxis.set_major_formatter(PercentFormatter(xmax=1))
    # Legend sits just outside the right edge so it never covers the bars.
    panel.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
    panel.set_ylabel(y_label)

plt.tight_layout()
plt.show()

In [None]:
# Volume, open rate and click rate per weekday, split by email text.
col_to_plot = 'weekday'
data = df.groupby([col_to_plot, 'email_text']).agg({'size':'sum','email_opened':'mean', 'clicked_link':'mean'}).reset_index()

# groupby sorts its keys, so day names would otherwise be plotted alphabetically
# (Friday, Monday, Saturday, ...). Force calendar order on the x-axis instead.
# Assumes full English day names; any other values (e.g. if this cell is re-run
# after `weekday` has been label-encoded) are appended in sorted order so that
# nothing is silently dropped from the chart.
calendar_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
days_present = set(data[col_to_plot])
weekday_order = (
    [day for day in calendar_days if day in days_present]
    + sorted(days_present - set(calendar_days))
)

fig, ax = plt.subplots(3,1, figsize=(20,12))
sns.barplot(data=data, x=col_to_plot, y='size', hue="email_text", order=weekday_order, ax=ax[0])
ax[0].legend(loc='upper left', bbox_to_anchor=(1.01,1))
ax[0].set_ylabel("Size")

sns.barplot(data=data, x=col_to_plot, y='email_opened', hue="email_text", order=weekday_order, ax=ax[1])
ax[1].axhline(y=grp_open_rate,color='r',linestyle='--')  # overall open rate
ax[1].legend(loc='upper left', bbox_to_anchor=(1.01,1))
ax[1].set_ylabel("Email Open Rate")
ax[1].yaxis.set_major_formatter(PercentFormatter(xmax=1))

sns.barplot(data=data, x=col_to_plot, y='clicked_link', hue="email_text", order=weekday_order, ax=ax[2])
ax[2].axhline(y=grp_link_rate,color='r',linestyle='--')  # overall click rate
ax[2].legend(loc='upper left', bbox_to_anchor=(1.01,1))
ax[2].set_ylabel("Link Click Rate")
ax[2].yaxis.set_major_formatter(PercentFormatter(xmax=1))


plt.tight_layout()
plt.show()

- Wednesday and Thursday seem to be the best days for both opening the email and clicking on the link. This suggests that emails sent on these days might be more effective in reaching users.
- Friday shows the lowest engagement, which could be due to the fact that users are winding down for the weekend and less likely to engage with emails.

In [None]:
# Volume, open rate and click rate per user country, split by email text.
col_to_plot = 'user_country'
data = (
    df.groupby([col_to_plot, 'email_text'])
    .agg({'size': 'sum', 'email_opened': 'mean', 'clicked_link': 'mean'})
    .reset_index()
)

# (metric column, y-axis label, overall rate to mark; None for the count panel)
panel_specs = [
    ('size', "Size", None),
    ('email_opened', "Email Open Rate", grp_open_rate),
    ('clicked_link', "Link Click Rate", grp_link_rate),
]

fig, ax = plt.subplots(3, 1, figsize=(20, 12))
for panel, (metric, y_label, overall_rate) in zip(ax, panel_specs):
    sns.barplot(data=data, x=col_to_plot, y=metric, hue="email_text", ax=panel)
    if overall_rate is not None:
        # Dashed red line marks the overall rate across all emails.
        panel.axhline(y=overall_rate, color='r', linestyle='--')
        panel.yaxis.set_major_formatter(PercentFormatter(xmax=1))
    # Legend sits just outside the right edge so it never covers the bars.
    panel.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
    panel.set_ylabel(y_label)

plt.tight_layout()
plt.show()

- **The US and UK also perform better in terms of clicks on the link within the email**. This suggests that not only are users in these countries more likely to open the email, but they also find the content compelling enough to click through to the company site.
- The low CTRs in Spain and France could indicate that **although users are opening the emails, the content or call-to-action within the email might not be engaging enough to prompt a click**.

- For US and UK:
    - Optimize further: Since these countries are performing well, **try segmenting these users further (e.g., by email version, past purchases, etc.) and target them with optimized content**.
- For Spain and France:
    - Test new strategies: Conduct A/B tests with different **email subjects**, **content formats**, or **personalized messages** to understand what drives better engagement.
    - Localized content: Ensure that the emails are culturally tailored and include **relevant content that resonates with these users**.
    - Consider alternate channels: If emails aren't resonating with users in these countries, **explore other marketing channels** (e.g., social media, SMS) to complement the email campaign.

In [None]:
# Volume, open rate and click rate per past-purchase count, split by email text.
col_to_plot = 'user_past_purchases'
data = (
    df.groupby([col_to_plot, 'email_text'])
    .agg({'size': 'sum', 'email_opened': 'mean', 'clicked_link': 'mean'})
    .reset_index()
)

# (metric column, y-axis label, overall rate to mark; None for the count panel)
panel_specs = [
    ('size', "Size", None),
    ('email_opened', "Email Open Rate", grp_open_rate),
    ('clicked_link', "Link Click Rate", grp_link_rate),
]

fig, ax = plt.subplots(3, 1, figsize=(20, 12))
for panel, (metric, y_label, overall_rate) in zip(ax, panel_specs):
    sns.barplot(data=data, x=col_to_plot, y=metric, hue="email_text", ax=panel)
    if overall_rate is not None:
        # Dashed red line marks the overall rate across all emails.
        panel.axhline(y=overall_rate, color='r', linestyle='--')
        panel.yaxis.set_major_formatter(PercentFormatter(xmax=1))
    # Legend sits just outside the right edge so it never covers the bars.
    panel.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
    panel.set_ylabel(y_label)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_curve, auc

In [None]:

# Integer-encode the four categorical feature columns for the tree model.
#
# Each column gets its own fitted LabelEncoder, stored in `encoders`, so the
# integer codes can be mapped back to the original labels later, e.g.
# encoders['weekday'].classes_ or encoders['user_country'].inverse_transform(...).
# Refitting one shared encoder would keep only the last column's mapping.
#
# NOTE: this overwrites the string columns of `df` in place. Re-run
# `df = email.copy()` before re-running any of the plotting cells above.
categorical_cols = ['weekday', 'user_country', 'email_text', 'email_version']
encoders = {}
for cat_col in categorical_cols:
    encoder = LabelEncoder()
    df[cat_col] = encoder.fit_transform(df[cat_col])
    encoders[cat_col] = encoder


In [None]:
# Model inputs: the campaign and user attributes as features, and the 0/1
# click flag as the target.
feature_cols = [
    'email_text',
    'email_version',
    'hour',
    'weekday',
    'user_country',
    'user_past_purchases',
]
X = df[feature_cols]
y = df['clicked_link']

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the binary target
# keeps the click / no-click ratio the same in the train and test sets, which
# matters if clicks are rare (presumably the case here — confirm with y.mean()).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Seed the forest as well: without random_state every run builds different
# trees, so the metrics quoted in the notebook text could not be reproduced.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Hard-label predictions and per-class precision / recall / F1.
# zero_division=0 reports 0 without a warning when a class is never predicted.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Impurity-based feature importances of the fitted forest, largest first.
importances = model.feature_importances_
importance_by_feature = pd.Series(importances, index=X.columns).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(20, 4))
# Draw on the axes created above so the labels below attach to this chart.
sns.barplot(
    x=importance_by_feature.values,
    y=importance_by_feature.index,
    orient='h',
    ax=ax,
)
ax.set_title('Random Forest Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

In [None]:
# Score the hold-out set with class probabilities; column 1 is the
# probability of the positive class (clicked_link == 1).
y_probs = model.predict_proba(X_test)[:, 1]

# ROC points at every score threshold, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

# Draw the curve against the diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
ax.set(
    title='Receiver Operating Characteristic (ROC) Curve',
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
)
ax.legend(loc='lower right')
ax.grid(True)
plt.show()

In [None]:
# Display the unrounded AUC computed in the ROC cell.
roc_auc

The AUC (Area Under the Curve) of the model is approximately 0.55, which is slightly above random guessing (an AUC of 0.5). This indicates that the model has some predictive power, but it's not performing very well in distinguishing between users who click the link and those who do not.

### Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the Random Forest: 3 * 4 * 3 * 3 = 108 combinations,
# each cross-validated on 5 folds, i.e. 540 model fits.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(
    # Seeded so that re-running the search selects the same parameters.
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    # Run the fits on all available cores; they are independent of each other.
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
# Mean cross-validated ROC AUC of the best combination, to compare with the
# untuned model's AUC.
print(f"Best cross-validated ROC AUC: {grid_search.best_score_:.3f}")


A/B Testing: Once you have a model, you can implement it on a live email campaign by sending personalized emails based on the model’s predictions. Track the CTR of users in the predicted group versus those in the control group to measure the improvement.

Based on the model built previously, **the company can send emails only to those users who are predicted to have a higher probability of clicking the link**.

To test it, we can conduct A/B test to see if the built model can actually help improve the click-through-rate.

- Randomly assign users to two groups, Control group and Experiment group.
- In **Control group, still use the old email-campaign strategy, i.e., just send emails to all users in Control group**.
- In the Experiment group, **use my model to predict whether each user will click the link, and only send emails to those users whose predicted result is positive.**
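- Perform a one-tailed, unpaired t-test (for click-through proportions with large samples, a two-proportion z-test is the equivalent choice) to test whether the Experiment group's population proportion is higher than the Control group's population proportion.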


**Tools and Techniques for Optimizing Email Content**
- A/B Testing Tools:
    - Tools like Mailchimp, Optimizely, or SendGrid allow you to run A/B tests to optimize subject lines, CTAs, and other content elements.
- Analytics and Tracking:
    - Use email marketing tools that provide detailed analytics (open rates, click rates, bounce rates, etc.) to track the performance of your emails.
    - Analyze where users are clicking and how they interact with the content.
- Behavioral Trigger Emails:
    - Send emails based on user actions (e.g., abandoned cart emails, re-engagement emails, or product recommendation emails based on past purchases).
    - Use tools like Klaviyo, ActiveCampaign, or HubSpot for automated, behavior-driven campaigns.

A/B Testing: Experiment with different subject lines to find out which works best for your audience.

**Define the Hypothesis**
Start by defining what you want to test and the hypothesis you're trying to validate. For example:

- Hypothesis 1: "Using a personalized subject line will increase open rates."
- Hypothesis 2: "A short email format will lead to a higher click-through rate."

- T-Test: Used to compare the means of two groups (e.g., open rates for Email A vs Email B).
- Chi-Square Test: Used for categorical data (e.g., number of clicks in different groups).
- Z-Test: A variant of the T-Test for large samples.

When there is a noticeable difference in open rate between your test groups in an A/B test, it can affect the subsequent click-through rate (CTR) comparison. This happens when CTR is calculated only among users who have opened the email (the click-to-open rate): if the open rate differs significantly between groups, the users who opened in each group are different underlying populations, so comparing their click rates can lead to misleading conclusions. A CTR calculated over all emails sent, as in the analysis above, does not have this problem, because its denominator is fixed by the random assignment.

**Adjusting the A/B Test Design**
Propensity score matching can be used to balance the open rate between the groups before comparing the CTR. This involves calculating the probability (propensity) that a user opens the email based on their characteristics and then matching users with similar probabilities between the two groups.

**Randomization and Stratification**
To ensure fair comparisons, you should also consider ensuring that users are randomly assigned to test groups, or use stratified sampling based on key features (such as user engagement, past behavior, etc.). This minimizes any inherent differences between the groups and ensures that the results of CTR comparisons are not influenced by confounding factors.

**Sequential Testing**
If you're running multiple tests over time, you can use sequential testing to monitor the results at regular intervals and ensure that you don’t over-collect data or introduce biases over time.