In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import warnings
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

# --- Load Data ---
email_table = pd.read_csv("C:\\Users\\ashvi\\Desktop\\Quantacus.Ai\\email_table.csv")
email_opened_table = pd.read_csv("C:\\Users\\ashvi\\Desktop\\Quantacus.Ai\\email_opened_table.csv")
link_clicked_table = pd.read_csv("C:\\Users\\ashvi\\Desktop\\Quantacus.Ai\\link_clicked_table.csv")

# --- Feature Engineering ---
email_table['clicked'] = email_table['email_id'].isin(link_clicked_table['email_id']).astype(int)
email_table['email_opened'] = email_table['email_id'].isin(email_opened_table['email_id']).astype(int)
email_table['opened_but_not_clicked'] = ((email_table['email_opened'] == 1) & (email_table['clicked'] == 0)).astype(int)

# --- Drop Less Useful Features (based on SHAP) ---
selected_columns = [
    'email_opened',
    'opened_but_not_clicked',
    'user_past_purchases',
    'email_text',
    'hour',
    'weekday',
    'email_version'
]

# --- Label Encoding ---
encoded_data = email_table.copy()
label_encoders = {}
categorical_cols = ['email_text', 'email_version', 'weekday']

for col in categorical_cols:
    le = LabelEncoder()
    encoded_data[col] = le.fit_transform(encoded_data[col])
    label_encoders[col] = le

# --- Interaction Features ---
encoded_data['text_version_interaction'] = encoded_data['email_text'] * 10 + encoded_data['email_version']

# --- Define X and y ---
feature_set = selected_columns + ['text_version_interaction']
X = encoded_data[feature_set]
y = encoded_data['clicked']

# --- Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Apply SMOTE (or RandomOverSampler) ---
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# # --- Model Training ---
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_resampled, y_resampled)

# # --- Evaluation ---
# y_pred = model.predict(X_test)
# y_proba = model.predict_proba(X_test)[:, 1]

# print("Classification Report:\n", classification_report(y_test, y_pred))
# print("ROC AUC Score:", roc_auc_score(y_test, y_proba))
# print("Precision-Recall AUC:", average_precision_score(y_test, y_proba))

# # Define and train the XGBoost model
# xgb_model = XGBClassifier(n_estimators=100, random_state=42, scale_pos_weight=10, eval_metric='logloss')
# xgb_model.fit(X_train, y_train)

xgb_model = XGBClassifier(
    n_estimators=100,
    random_state=42,
    eval_metric='logloss'
)
xgb_model.fit(X_resampled, y_resampled)


# Predict and evaluate
y_pred = xgb_model.predict(X_test)
y_proba = xgb_model.predict_proba(X_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))


Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     19576
           1       0.72      0.99      0.83       424

    accuracy                           0.99     20000
   macro avg       0.86      0.99      0.91     20000
weighted avg       0.99      0.99      0.99     20000

ROC AUC Score: 0.9929626598029162


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_recall_curve, classification_report, roc_auc_score, recall_score, make_scorer
from xgboost import XGBClassifier

# 1) Load data
email_df = pd.read_csv("C:\\Users\\ashvi\\Desktop\\Quantacus.Ai\\email_table.csv")
opened_df = pd.read_csv("C:\\Users\\ashvi\\Desktop\\Quantacus.Ai\\email_opened_table.csv")
clicked_df =  pd.read_csv("C:\\Users\\ashvi\\Desktop\\Quantacus.Ai\\link_clicked_table.csv")

# 2) Create labels
email_df["opened"]  = email_df["email_id"].isin(opened_df["email_id"]).astype(int)
email_df["clicked"] = email_df["email_id"].isin(clicked_df["email_id"]).astype(int)

# 3) Feature matrix & target
cat_cols = ["email_text", "email_version", "weekday", "user_country"]
num_cols = ["hour", "user_past_purchases", "opened"]

encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
X_cat = encoder.fit_transform(email_df[cat_cols])
X_num = email_df[num_cols].values
X = np.hstack([X_cat, X_num])
y = email_df["clicked"].values

# 4) Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5) Hyperparameter tuning (focus on clicked recall)
neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
param_grid = {
    "scale_pos_weight": [
        1,
        neg/pos,
        2*(neg/pos)
    ],
    "max_depth": [4, 6, 8],
    "n_estimators": [50, 100, 200],
    "gamma": [0, 1, 5]
}
scorer = make_scorer(recall_score, pos_label=1)

grid = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
    param_grid,
    scoring=scorer,
    cv=3,
    n_jobs=-1
)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
print("Best params:", grid.best_params_)

# 6) Predict probabilities on test set
y_prob = best_model.predict_proba(X_test)[:, 1]

# 7) Optimize decision threshold for F1 on 'clicked' class
precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)
f1_scores = 2 * precisions * recalls / (precisions + recalls + 1e-9)
best_idx = np.nanargmax(f1_scores)
best_thresh = thresholds[best_idx]
print(f"Best threshold for F1: {best_thresh:.2f}, F1 = {f1_scores[best_idx]:.3f}")

# 8) Final predictions & evaluation
y_pred = (y_prob >= best_thresh).astype(int)

print("\nClassification Report at optimal threshold:")
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))


Best params: {'gamma': 5, 'max_depth': 4, 'n_estimators': 50, 'scale_pos_weight': np.float64(46.1976401179941)}
Best threshold for F1: 0.92, F1 = 0.400

Classification Report at optimal threshold:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     19576
           1       0.30      0.59      0.40       424

    accuracy                           0.96     20000
   macro avg       0.65      0.78      0.69     20000
weighted avg       0.98      0.96      0.97     20000

ROC AUC Score: 0.9660936259069635


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
