In [0]:
#!pip install xgboost
#!pip install scipy
#!pip install Jinja2
#!pip install category-encoders
#!pip install Boruta

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, average_precision_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, matthews_corrcoef
from scipy.stats import mannwhitneyu
import category_encoders as ce
from boruta import BorutaPy


**Modeling logic**

At first glance, the task appears to be about creating a dataset and training a predictive model such that, given a user profile and the characteristics of an offer, the model can estimate the probability that a transaction will occur. In other words, we want to model:

**_P(Transaction ∣ User Profile,Offer)_**


However, this is not sufficient for understanding the **true effectiveness of offers**. What we really care about is the **causal impact** of giving an offer, i.e., whether the offer actually changes user behavior compared to what they would have done without the offer.  

This brings us to the concept of **uplift modeling**. Instead of just predicting the probability of a transaction under treatment (receiving the offer), we want to measure the **difference** between:  

1. The probability that a transaction occurs if the user receives the offer.  
2. The probability that a transaction occurs if the same user does **not** receive the offer.  

The second probability depends only on the user’s intrinsic profile and past behavior (their “baseline” propensity to transact).  

Formally, the **uplift** can be written as:

**Uplift(x)=P(Transaction∣x,Offer) − P(Transaction∣x,No Offer)**

Thus, the problem is not just predictive, but **causal**: we want to isolate the incremental effect of the offer on transactions. A positive uplift indicates that the offer increases the likelihood of a transaction, while a negative uplift means the offer may actually discourage transactions or simply attract users who would have transacted anyway (cannibalization).

Here I first start with some data analysis. 


**Plan**:
- Data Analysis for both datasets: **labeled_data_with_offer.csv** and **user_profiles_transaction_classes.csv**
- Feature engineering if needed
- Treating missing values
- Categorized to numerical data convertion
- Data split and modeling (take care of data leakage if any)
- Metric choice (Recall, precision, AUC, AVG.precision)
- scale_pos_weight setting if needed
- Important feature analysis using Boruta

In [0]:

# ------------------------------Only to load from databricks volumes -------------------------------
#user_files = glob.glob("/Volumes/workspace/default/data/user_profiles_transaction_classes.csv/part-*.csv")
#user_transaction_profile_df = pd.concat((pd.read_csv(f) for f in user_files), ignore_index=True)

#offer_files = glob.glob("/Volumes/workspace/default/data/labeled_data_with_offer.csv/part-*.csv")
#offer_related_transaction_profile_df = pd.concat((pd.read_csv(f) for f in offer_files), ignore_index=True)

#user_transaction_profile_df.to_csv("data/user_profiles_transaction_classes.csv")
#offer_related_transaction_profile_df.to_csv("data/offer_user_profiles_transaction_classes.csv")
#----------------------------------------------------------------------------------------------------

user_transaction_profile_df = pd.read_csv("data/processed/user_profiles_transaction_classes.csv")
offer_related_transaction_profile_df = pd.read_csv("data/processed/offer_user_profiles_transaction_classes.csv")


## Data Analysis

### User profile <> Transaction analysis

In [0]:
user_transaction_profile_df

In [0]:
offer_related_transaction_profile_df.shape

In [0]:
counts = (
    user_transaction_profile_df
    .groupby(["gender", "class"])
    .size()
    .reset_index(name="count")
)

counts["proportion"] = counts.groupby("gender")["count"].transform(lambda x: x / x.sum())

print(counts)

In [0]:
def plot_distribution_per_class(df, feature, class_name):
    plt.figure(figsize=(8,5))

    for c in df[class_name].unique():
        subset = df[df[class_name] == c]
        plt.hist(
            subset[feature].dropna(),
            bins=30,
            density=True,
            alpha=0.5,
            label=f"Class {c}"
        )
        
        mean_val = subset[feature].mean()
        plt.axvline(mean_val, color="blue" if c==0 else "red", linestyle="--", linewidth=1.5, label=f"Mean (Class {c})")
        
        median_val = subset[feature].median()
        plt.axvline(median_val, color="blue" if c==0 else "red", linestyle="-.", linewidth=1.5, label=f"Median (Class {c})")

    plt.xlabel(feature)
    plt.ylabel("Density")
    plt.title(f"Probability density of {feature} per Class")
    plt.legend()
    plt.show()

In [0]:
plot_distribution_per_class(user_transaction_profile_df, "credit_card_limit",  "class")

In [0]:
group0 = user_transaction_profile_df.loc[user_transaction_profile_df["class"] == 0, "credit_card_limit"].dropna()
group1 = user_transaction_profile_df.loc[user_transaction_profile_df["class"] == 1, "credit_card_limit"].dropna()

print("Group0 size:", len(group0))
print("Group1 size:", len(group1))
print("Group0 unique values:", group0.unique())
print("Group1 unique values:", group1.unique())
print("Group0 dtypes:", group0.dtype)
print("Group1 dtypes:", group1.dtype)
print("Group0 has inf:", np.isinf(group0).any())
print("Group1 has inf:", np.isinf(group1).any())


 -->**This shows that the data are very likely manufactured data and are not real. Same values repeats over and over**

In [0]:
result = mannwhitneyu(group0, group1, alternative="two-sided")
print(result.statistic, result.pvalue)


--> **This p-value shows that the data rank for credit limit are infact different and statistically significant**

In [0]:
group0 = user_transaction_profile_df.loc[user_transaction_profile_df["class"] == 0, "age"].dropna()
group1 = user_transaction_profile_df.loc[user_transaction_profile_df["class"] == 1, "age"].dropna()

print("Group0 size:", len(group0))
print("Group1 size:", len(group1))
print("Group0 unique values:", group0.unique())
print("Group1 unique values:", group1.unique())
print("Group0 dtypes:", group0.dtype)
print("Group1 dtypes:", group1.dtype)
print("Group0 has inf:", np.isinf(group0).any())
print("Group1 has inf:", np.isinf(group1).any())

In [0]:
plot_distribution_per_class(user_transaction_profile_df, "age",  "class")

In [0]:
result = mannwhitneyu(group0, group1, alternative="two-sided")
print(result.statistic, result.pvalue)

--> **This p-value shows that the data rank for age are infact different and statistically significant**

Note: These features are very abnormal. Very high credit limits, people with age of 120. In real world these values dont make sense

In [0]:
# Lets evaluate the variable correlation here:
corr = user_transaction_profile_df.drop(["class", "registered_on"], axis = 1).corr()
corr.style.background_gradient(cmap='coolwarm')

Lets analyze the registration years as well. We want to answer: do people who registered earlier have a higher probability of having a transaction?

In [0]:
user_transaction_profile_df["registered_on"] = pd.to_datetime(user_transaction_profile_df["registered_on"], format="%Y%m%d")
user_transaction_profile_df["year"] = user_transaction_profile_df["registered_on"].dt.year

In [0]:
plot_distribution_per_class(user_transaction_profile_df, "year", "class")

This is interesting because it shows that those erlier users are more faithful to the company than those who are newer and they buy more

### User profile <> Offer <> Transaction analysis

#### User profile analysis per class

In [0]:
offer_related_transaction_profile_df.head()

In [0]:
plot_distribution_per_class(offer_related_transaction_profile_df, "age", "offer_led_to_transaction")

In [0]:
group0 = offer_related_transaction_profile_df.loc[offer_related_transaction_profile_df["offer_led_to_transaction"] == 0, "age"].dropna()
group1 = offer_related_transaction_profile_df.loc[offer_related_transaction_profile_df["offer_led_to_transaction"] == 1, "age"].dropna()

print("Group0 size:", len(group0))
print("Group1 size:", len(group1))
print("Group0 unique values:", group0.unique())
print("Group1 unique values:", group1.unique())
print("Group0 dtypes:", group0.dtype)
print("Group1 dtypes:", group1.dtype)
print("Group0 has inf:", np.isinf(group0).any())
print("Group1 has inf:", np.isinf(group1).any())

In [0]:
result = mannwhitneyu(group0, group1, alternative="two-sided")
print(result.statistic, result.pvalue)

In [0]:
plot_distribution_per_class(offer_related_transaction_profile_df, "credit_card_limit", "offer_led_to_transaction")

In [0]:
# Here I want to know if there is a gender difference in the offer led to transaction
gender_success_rate = (
    offer_related_transaction_profile_df
    .dropna(subset=["gender"])
    .groupby("gender")["offer_led_to_transaction"]
    .mean()
    .reset_index(name="success_rate")
)

plt.figure(figsize=(6,4))
sns.barplot(
    data=gender_success_rate,
    x="gender",
    y="success_rate"
)
plt.title("Proportion of Offers Leading to Transaction by Gender")
plt.xlabel("Gender")
plt.ylabel("Proportion of Transactions (Success Rate)")
plt.ylim(0,1)
plt.show()

There is no gender difference in accepting or not accepting an offer. Both genders used the offers more than not using

In [0]:
## Analyzing the proportion of Offers Leading to Transaction by Offer Type
offer_success_rate = (
    offer_related_transaction_profile_df
    .groupby("offer_type")["offer_led_to_transaction"]
    .mean()
    .reset_index(name="success_rate")
)

plt.figure(figsize=(7,5))
ax = sns.barplot(
    data=offer_success_rate,
    x="offer_type",
    y="success_rate",
    palette="plasma"
)

for p in ax.patches:
    ax.annotate(
        f"{p.get_height():.2%}",
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha="center", va="bottom",
        fontsize=12, color="black", weight="bold"
    )

plt.title("Proportion of Offers Leading to Transaction by Offer Type", fontsize=14, weight="bold")
plt.xlabel("Offer Type", fontsize=12)
plt.ylabel("Success Rate", fontsize=12)
plt.ylim(0, 1)
plt.xticks(fontsize=11)
plt.yticks(np.linspace(0,1,6), [f"{x:.0%}" for x in np.linspace(0,1,6)], fontsize=11)  # show % on y-axis
sns.despine()

plt.show()


This is interesting. The offers which have a discount or those of Buy One, Get One (BOGO) has higher conversion rate to a transaction.

In [0]:
plot_distribution_per_class(offer_related_transaction_profile_df, "min_value", "offer_led_to_transaction")

In [0]:
plot_distribution_per_class(offer_related_transaction_profile_df, "discount_value", "offer_led_to_transaction")

In [0]:
## Analyzing the average Discount Value by Offer Led to Transaction
discount_means = (
    offer_related_transaction_profile_df
    .groupby("offer_led_to_transaction")["discount_value"]
    .mean()
    .reset_index()
)

plt.figure(figsize=(6,5))
ax = sns.barplot(
    data=discount_means,
    x="offer_led_to_transaction",
    y="discount_value",
    palette="viridis"
)

for p in ax.patches:
    ax.annotate(
        f"{p.get_height():.2f}",
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha="center", va="bottom",
        fontsize=12, color="black", weight="bold"
    )

plt.title("Average Discount Value by Offer Led to Transaction", fontsize=14, weight="bold")
plt.xlabel("Offer Led to Transaction", fontsize=12)
plt.ylabel("Average Discount Value", fontsize=12)
plt.xticks([0,1], ["No (0)", "Yes (1)"], fontsize=11)
plt.yticks(fontsize=11)
sns.despine()

plt.show()

Interstingly, those offers which led to transaction have higher vdiscount values.

In [0]:
# Analyzing which communication channel set has been more effective in marketing and offer campaign
offer_success_rate = (
    offer_related_transaction_profile_df
    .groupby("channels")["offer_led_to_transaction"]
    .mean()
    .reset_index(name="success_rate")
)

plt.figure(figsize=(7,5))
ax = sns.barplot(
    data=offer_success_rate,
    x="channels",
    y="success_rate",
    palette="plasma"
)

for p in ax.patches:
    ax.annotate(
        f"{p.get_height():.2%}",
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha="center", va="bottom",
        fontsize=12, color="black", weight="bold"
    )

plt.title("Proportion of channel set Leading to Transaction by Offer Type", fontsize=14, weight="bold")
plt.xlabel("Channels", fontsize=12)
plt.ylabel("Success Rate", fontsize=12)
plt.ylim(0, 1)
plt.xticks(fontsize=9)
plt.yticks(np.linspace(0,1,6), [f"{x:.0%}" for x in np.linspace(0,1,6)], fontsize=11)
sns.despine()

plt.show()


Only web and mail are enough to grab users attention

In [0]:
# Analyzing how the duration of the offer has affected the success rate
discount_means = (
    offer_related_transaction_profile_df
    .groupby("offer_led_to_transaction")["duration"]
    .mean()
    .reset_index()
)

plt.figure(figsize=(6,5))
ax = sns.barplot(
    data=discount_means,
    x="offer_led_to_transaction",
    y="duration",
    palette="viridis"
)

for p in ax.patches:
    ax.annotate(
        f"{p.get_height():.2f}",
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha="center", va="bottom",
        fontsize=12, color="black", weight="bold"
    )

plt.title("Average Offer duration Led to Transaction", fontsize=14, weight="bold")
plt.xlabel("Offer Led to Transaction", fontsize=12)
plt.ylabel("Average offer duration", fontsize=12)
plt.xticks([0,1], ["No (0)", "Yes (1)"], fontsize=11)
plt.yticks(fontsize=11)
sns.despine()

plt.show()

Those offers that have higher duration more likely lead to a transaction.

In [0]:
corr = offer_related_transaction_profile_df.drop(["offer_led_to_transaction", "offer_id", "account_id", "gender", "channels", "offer_type", ], axis = 1).corr()
corr.style.background_gradient(cmap='coolwarm')

This is mostly a good news because we do not have the risc of singularity and model unstability. For trees, we dont have the effect of biased featurte importance, or redundant random splits

## Modeling

I will train two models, one for offer-userprofile transaction probability and the other one which accounts for no-offer probability using the user profile characteristics. 

In [0]:
## I will keep the X_holdout and y_holdout for my final uplift test

y = offer_related_transaction_profile_df["offer_led_to_transaction"]
X = offer_related_transaction_profile_df.drop(columns=["offer_led_to_transaction", "offer_id", "account_id"])

## ---- separating a test dataset for uplift calculation -----------------

X_train_full, X_holdout, y_train_full, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## -----------------------------------------------------------------------

#### 1. Offer - transaction `predictor`

**Categorical to numeric data conversion**

Here I have several choices:
1. Simple category to number conversion
  * **Pros**: simple
  * **Cons**: the numbers have no meaning at all, and no order exist.

2. one-hot encoding
  * **Pros**: preserves information about categories 
  * **Cons**: a) adds high dimensionality to the data; b) makes data sparse; c) when used with trees, it cause the tree to grow in the direction of zeros, to split redundantly, and to overfit due to tree complexity.

3. using target-encoding
  * **Pros**: when there is a strong relation between categories and the target, it helps the model to learn better and faster.
  * **Cons**: Data leackage: if not handled correctly, it might leak target information and overestimate the model performance.

  I will choose target-encoding due to the analysis results I had earlier that shows some categories are infact related to a specific target. For features like gender no encoding type will help due to the lack of correlation with the target.


  **Algorithm selection**
  - I chose a boosting algorithm called **Tree Xgboost**. It is an evolution of the Gradient Boosting algorithm which offers some benefits in addition to all the ensemble and error correction abilities of boosting methods:
    - It uses Quantile-based Candidate Splits which improves the training time (it borrowed this feature from histogram binning of the LightGBM).
    - Sparcity-aware-split-finding which enables the Xgboost to handle NaN values natively.
    - Parallel Learning
  
  **Cross-validation**
  - I used Stratified-k-fold due to its ability to create stratified folds which is more suitable for imbalanced classes. I could have also used Repeated class-wise stratified K-Fold in order to shuffle more and have a more realistic and rebust estimation

  **No probability callibration**
  - No probability callibration needed due to binary logistic loss (log loss).

  **k-fold where k=5**
   - The higher k, the lower the bias and higher the variance. Also the lower k, the higher model bias and higher the variance. This depends on the data size. Generally a k=10 or 5 is more common to use.

In [0]:
def train_and_test(X, y, categorical_cols, threshold = 0.5):
    """
    Train and evaluate an XGBoost classifier with stratified 5-fold CV.
    
    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix
    y : pd.Series
        Target variable (binary)
    categorical_cols : list
        List of categorical column names to be target-encoded
    threshold : float
        Classification threshold (default 0.5)
    """

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    avg_precision, precisions, recalls, f1s, aucs, tprs, pr_aucs, mccs  = [], [], [], [], [], [], [], []
    mean_fpr = np.linspace(0, 1, 100)
    mean_recall = np.linspace(0, 1, 100)
    precisions_interp = []

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train_kfold, X_test_kfold = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train_kfold, y_test_kfold = y.iloc[train_index], y.iloc[test_index]
        neg, pos = np.bincount(y_train_kfold)
        scale_pos_weight_param = neg/pos

        print(f"\nFold {fold+1}:")
        print(f"  Train set class distribution: {np.bincount(y_train_kfold)}")
        print(f"  Test set class distribution: {np.bincount(y_test_kfold)}")

        # Target encoding (fit only on training fold) so that I can avoid target leackage
        target_encoder = ce.TargetEncoder(cols=categorical_cols)
        X_train_enc = target_encoder.fit_transform(X_train_kfold, y_train_kfold)
        X_test_enc = target_encoder.transform(X_test_kfold)


        ## I will use binary logistic because it is naturally callibrated loss
        model = XGBClassifier(
            random_state=42,
            eval_metric="logloss",
            reg_alpha=1,
            reg_lambda=1,
            max_depth=4,
            n_estimators=200,
            scale_pos_weight = scale_pos_weight_param
        )
        model.fit(X_train_enc, y_train_kfold)

        
        y_pred_proba = model.predict_proba(X_test_enc)[:, 1]
        y_pred = (y_pred_proba > threshold).astype(int)

        precisions.append(precision_score(y_test_kfold, y_pred))
        recalls.append(recall_score(y_test_kfold, y_pred))
        f1s.append(f1_score(y_test_kfold, y_pred))
        mccs.append(matthews_corrcoef(y_test_kfold, y_pred))
        avg_precision.append(average_precision_score(y_test_kfold, y_pred_proba))
        auc = roc_auc_score(y_test_kfold, y_pred_proba)
        aucs.append(auc)

        fpr, tpr, _ = roc_curve(y_test_kfold, y_pred_proba)
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        prec, rec, _ = precision_recall_curve(y_test_kfold, y_pred_proba)
        pr_auc = average_precision_score(y_test_kfold, y_pred_proba)
        pr_aucs.append(pr_auc)

        precisions_interp.append(np.interp(mean_recall, rec[::-1], prec[::-1])) 


    plt.figure(figsize=(7, 6))
    for i, tpr in enumerate(tprs):
        plt.plot(mean_fpr, tpr, alpha=0.3, label=f"Fold {i+1}")
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
    plt.plot(mean_fpr, np.mean(tprs, axis=0), color="b", lw=2,
            label=f"Mean ROC (AUC={np.mean(aucs):.3f})")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Cross-Validated AUC-ROC Curve")
    plt.legend()
    plt.show()

    plt.figure(figsize=(7, 6))
    for i, prec in enumerate(precisions_interp):
        plt.plot(mean_recall, prec, alpha=0.3, label=f"Fold {i+1}")
    plt.plot(mean_recall, np.mean(precisions_interp, axis=0), color="r", lw=2,
            label=f"Mean PR (AP={np.mean(pr_aucs):.3f})")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Cross-Validated Precision-Recall Curve")
    plt.legend()
    plt.show()

    print("Average ROC AUC:", np.mean(aucs))
    print("Average PR AUC:", np.mean(pr_aucs))
    print("Average precision:", np.mean(precisions))
    print("Average recall:", np.mean(recalls))
    print("Average f1:", np.mean(f1s))
    print("Mathews Correlation Coefficient (MCC):", np.mean(mccs))

In [0]:
train_and_test(X_train_full, y_train_full, ["gender", "channels", "offer_type"], threshold= 0.3)

Here I am using Boruta as it is an improved version of feature permutation importance approach. It is not the same but has a similar idea

In [0]:
## Feature importance using Boruta.
def boruta_feature_importance(X: pd.DataFrame, y: pd.Series, max_iter: int = 100, random_state: int = 42):
    """
    Run Boruta for feature importance and plot results.
    
    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix
    y : pd.Series
        Target variable (binary/multi-class)
    max_iter : int
        Maximum number of iterations for Boruta
    random_state : int
        Random seed for reproducibility
    
    Returns
    -------
    feature_ranks : pd.DataFrame
        DataFrame with features and their Boruta ranks
    """
    
    rf = RandomForestClassifier(n_estimators=100, random_state=random_state, n_jobs=-1)
    boruta = BorutaPy(rf, n_estimators="auto", verbose=0, random_state=random_state, max_iter=max_iter)
    
    boruta.fit(X.values, y.values)
    
    feature_ranks = pd.DataFrame({
        "feature": X.columns,
        "rank": boruta.ranking_,
        "support": boruta.support_,
        "tentative": boruta.support_weak_
    }).sort_values(by="rank")
    
    def categorize(row):
        if row["support"]:
            return "Strong"
        elif row["tentative"]:
            return "Tentative"
        else:
            return "Weak"
    
    feature_ranks["category"] = feature_ranks.apply(categorize, axis=1)
    
    plt.figure(figsize=(10,6))
    sns.barplot(
        data=feature_ranks,
        x="rank", y="feature", hue="category",
        dodge=False, palette={"Strong":"green", "Weak":"red", "Tentative":"orange"}
    )
    
    plt.title("Boruta Feature Importance (Strong, Weak, Tentative)", fontsize=14, weight="bold")
    plt.xlabel("Boruta Rank (1 = Most Important)")
    plt.ylabel("Features")
    plt.legend(title="Feature Category")
    plt.tight_layout()
    plt.show()
    
    return feature_ranks

In [0]:
categorical_cols = ["gender", "offer_type", "channels"]
target_encoder = ce.TargetEncoder(cols=categorical_cols)
X_train_enc = target_encoder.fit_transform(X_train_full, y_train_full)
X_train_enc_clean = X_train_enc.dropna()
y_train_enc_clean = y_train_full.loc[X_train_enc_clean.index]

boruta_feature_importance(X_train_enc_clean, y_train_enc_clean)

#### 2. Profile - transaction `predictor`

In [0]:
user_transaction_profile_df["registered_on"] = pd.to_datetime(user_transaction_profile_df["registered_on"], format="%Y%m%d")
user_transaction_profile_df["year"] = user_transaction_profile_df["registered_on"].dt.year

user_transaction_profile_df.columns

In [0]:
X = user_transaction_profile_df.drop(["account_id", "registered_on", 'class'], axis = 1)
y = user_transaction_profile_df['class']
train_and_test(X, y, ["gender"], threshold = 0.3)

In [0]:
target_encoder = ce.TargetEncoder(cols=["gender"])
X_train_enc = target_encoder.fit_transform(X, y)
X_train_enc_clean = X_train_enc.dropna()
y_train_enc_clean = y.loc[X_train_enc_clean.index]

boruta_feature_importance(X_train_enc_clean, y_train_enc_clean)

**Risks and shortcomming in the current modeling strategy:**
* Possible dataleackage due to sasonal patterns: the reaction to an offer or general tendency to buy can change depending on the month, day of month or economical situations of the region. Due to the lack of the transaction date and user profile region, some future information may have been leacked to the train data. The correct way to do this in this problem would be to separate the the test data in a temporal manner, and using only the future data in the test set.
* The current model could be improved by hyperparameter tuning (using Random Rearch or Baysian tuning) but I didnt proceed with this step due to the lack of the time.
* Use of the boosting algorithm can make the model worst if there is a significant proportion of data that are outliers. Boosting puts more weight on misclassified points. If those misclassifications are actually due to noise or mislabeled data, the model wastes capacity and degrades performance.
* Like any other tree method, it can not extrapolate beyound the training data. This is crucial when there are complex patterns in data related to offer-to-transaction classes.
* The size of the uplift model is very small and therefore I think it is not reliable


## Model impact on holdout data

In [0]:
# First lets train the model and generate the results with the model and create a cost comparison compared to naive way of sending offers to everyone.
neg, pos = np.bincount(y_train_full)
scale_pos_weight_param = neg/pos

categorical_cols = ["gender", "offer_type", "channels"]
target_encoder = ce.TargetEncoder(cols=categorical_cols)
X_train_enc = target_encoder.fit_transform(X_train_full.drop('Unnamed: 0', axis = 1), y_train_full)

model = XGBClassifier(
            random_state=42,
            eval_metric="logloss",
            reg_alpha=1,
            reg_lambda=1,
            max_depth=4,
            n_estimators=200,
            scale_pos_weight = scale_pos_weight_param
        )
model.fit(X_train_enc, y_train_full)

In [0]:
X_holdout = X_holdout.drop('Unnamed: 0', axis = 1)
X_holdout_transformed = target_encoder.transform(X_holdout)

In [0]:
predictions = model.predict(X_holdout_transformed)

In [0]:
baseline_conversion_rate = y_holdout.mean()

true_positives = np.sum((predictions == 1) & (y_holdout == 1))
predicted_positives = np.sum(predictions == 1)

model_conversion_rate = true_positives / predicted_positives if predicted_positives > 0 else 0

total_users = len(y_holdout)
saved_offers = total_users - predicted_positives

cost_per_offer = 0.05  
savings = saved_offers * cost_per_offer

print(f"Baseline conversion rate (everyone targeted): {baseline_conversion_rate:.2%}")
print(f"Model conversion rate (targeted): {model_conversion_rate:.2%}")
print(f"Total users: {total_users}")
print(f"Predicted positives (offers sent): {predicted_positives}")
print(f"True positives captured: {true_positives}")
print(f"Saved offers: {saved_offers} → Estimated savings = ${savings:,.2f}")

This shows that if we had a set of users whose buying tendency was unknown to us, this model would help us not to send an offer to arround 5020 individuals. Here I used a threshold of 0.5 as my default threshold. The extent to which we would increase or decrease this threshold depends on our marketing goals. 

Custo de envio de um SMS: 10 centavos

Custo de envio de um email: 0.4 centavos

Nesses 12658 usuários, a gente teria economizado cerca de **R$522** reais

Ifood tem 55 milhões de clientes. Vamos pensar que 1MM desses usuários fossem escolhidos de forma aleatória e que a companhia enviasse email e SMS para todos esses usuários sem usar nenhum modelo.

1000,000×5,249/12,658 ​≈ **414678.000** envios poupados

Economia de 414678.000 * R$0.1 + 414678.000 * R$0.004 = **R$43.126** economizado.


## Uplifting

In [0]:
user_profile_cols = ["age", "credit_card_limit", "registered_on"]

categorical_cols_profile = ["gender"]
X = user_transaction_profile_df.drop(["account_id", "registered_on", 'class', 'Unnamed: 0', 'year'], axis = 1)
y = user_transaction_profile_df['class']
neg, pos = np.bincount(y)
scale_pos_weight_param_profile = neg/pos

target_encoder = ce.TargetEncoder(cols=categorical_cols_profile)
X_train_enc_profile = target_encoder.fit_transform(X, y)

user_transaction_model = XGBClassifier(
            random_state=42,
            eval_metric="logloss",
            reg_alpha=1,
            reg_lambda=1,
            max_depth=4,
            n_estimators=200,
            scale_pos_weight = scale_pos_weight_param_profile
)
user_transaction_model.fit(X_train_enc_profile, y)

In [0]:
X_holdout_uplift_test = X_holdout[X_train_enc_profile.columns]

In [0]:
X_holdout_uplift_test = target_encoder.transform(X_holdout_uplift_test)
predictions_profile_only_model = user_transaction_model.predict(X_holdout_uplift_test)

### Uplift calculation

In [0]:
uplift_df = pd.DataFrame({
    "p_with_offer": predictions,
    "p_without_offer": predictions_profile_only_model,
})

uplift_df["uplift"] = uplift_df["p_with_offer"] - uplift_df["p_without_offer"]

avg_uplift = uplift_df["uplift"].mean()

print(f"Average uplift (overall effect of sending offers): {avg_uplift:.2%}")

This shows that sending offer on average increases the transaction probability by **8.3%**

### The impact  of the model on choosing the offer

Here, I will take instances from the hold out dataset and shuffle only the offer features and see how the model True Positives change. This shows the value that the model can bring compared to the baseline approach when the company randomly sends the offers.

In [0]:
X_holdout_transformed

In [0]:
X_holdout_transformed_shuffled = X_holdout_transformed.copy()
columns_to_shuffle = ['channels', 'discount_value', 'duration', 'min_value', 'offer_type']
for column in columns_to_shuffle:
    X_holdout_transformed_shuffled[column] = X_holdout_transformed_shuffled[column].sample(frac=1).values


In [0]:
suffled_offers_predictions = model.predict(X_holdout_transformed_shuffled)

In [0]:
predicted_positives = np.sum((suffled_offers_predictions == 1) & (y_holdout == 1))
conversion_rate_for_random_offers = predicted_positives / np.sum(y_holdout == 1) if predicted_positives > 0 else 0

In [0]:
print(f"Conversion rate when the company sent random offers: {conversion_rate_for_random_offers:.2%}")

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import imageio

# Example DataFrame
df = pd.DataFrame({
    "age": [55, 72, 49, 118, 79],
    "credit_card_limit": [63000, 96000, 60000, None, 113000],
    "gender": [0.82, 0.83, 0.82, 0.76, 0.82],
    "registered_on": [20151013, 20160503, 20151025, 20160926, 20160706],
    "channels": ["web,email,mobile", "web,mobile", "mobile,email", "web", "email"],
    "discount_value": [10, 5, 5, 5, 5],
    "duration": [7, 7, 10, 7, 7],
    "min_value": [10, 5, 20, 5, 5],
    "offer_type": ["informational", "bogo", "Discount", "informational", "bogo"]
})

cols_to_shuffle = ["channels", "discount_value", "duration", "min_value", "offer_type"]

frames = []

for i in range(5):  # create 5 shuffle states
    temp = df.copy()
    # Shuffle only selected columns
    for col in cols_to_shuffle:
        temp[col] = np.random.permutation(temp[col].values.copy())
    
    # Plot table for visualization
    fig, ax = plt.subplots(figsize=(10, 3))
    ax.axis("off")
    ax.axis("tight")
    ax.table(cellText=temp.values, colLabels=temp.columns, loc="center", cellLoc='center')
    plt.tight_layout()
    
    # Save frame as image
    fname = f"imgs/frame_{i}.png"
    plt.savefig(fname)
    plt.close(fig)
    frames.append(imageio.imread(fname))

# Save as GIF
imageio.mimsave("imgs/shuffle_demo.gif", frames, duration=1)
