In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')
# Step 1: Generate synthetic dataset
# Seed NumPy's global RNG so every run reproduces the same table.
np.random.seed(42)
num_rows = 20000

# Columns are drawn one after another from the seeded stream; the order of
# these statements therefore determines the values each column receives.
data = {}
data['user_id'] = range(num_rows)
data['age'] = np.random.randint(low=18, high=65, size=num_rows)  # upper bound is exclusive
data['gender'] = np.random.choice(['Male', 'Female'], size=num_rows)
data['purchasing_pattern'] = np.random.choice(['High', 'Medium', 'Low'], size=num_rows)
data['online_behavior_score'] = np.random.uniform(low=0, high=1, size=num_rows)
data['ad_clicks'] = np.random.randint(low=0, high=10, size=num_rows)  # used as the target later on

# Materialise the frame and persist it without the index column.
df = pd.DataFrame(data)
df.to_csv('targeted_advertising_data.csv', index=False)
print("Sample dataset saved as 'targeted_advertising_data.csv'.")

# Step 2: Load and explore the dataset
# Re-read the table from the CSV so later steps work from the persisted file.
df = pd.read_csv('targeted_advertising_data.csv')
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())

# Step 3: Data preprocessing
# Column roles. 'user_id' is a row identifier and 'ad_clicks' is the target,
# so neither may appear in the feature matrix (keeping 'ad_clicks' in X would
# hand the label to the models directly).
categorical_cols = ['gender', 'purchasing_pattern']
numeric_cols = ['age', 'online_behavior_score']
target_col = 'ad_clicks'

y = df[target_col]

# One-hot encode the categorical columns (first level dropped). The encoder
# only records the category list, so fitting it on all rows leaks no statistics.
le = OneHotEncoder(drop='first')
X_encoded = le.fit_transform(df[categorical_cols])
X = pd.concat(
    [
        # Cast to float up front so the scaled values can be written back
        # without changing the column dtype.
        df[numeric_cols].astype(float),
        pd.DataFrame(X_encoded.toarray(), columns=le.get_feature_names_out(), index=df.index),
    ],
    axis=1,
)

# Split BEFORE scaling and resampling so the held-out rows never influence
# the scaler statistics or the synthetic samples. Stratify to keep the class
# proportions of the test set equal to those of the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only; apply the same transform to test.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Balance the dataset using SMOTE (training split only: the test set must
# contain real observations exclusively).
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train on the resampled data; evaluate on the untouched X_test / y_test.
X_train, y_train = X_resampled, y_resampled

# Step 4: Train and tune models
models = {
    # The grid below searches both 'l1' and 'l2' penalties. The default lbfgs
    # solver does not support 'l1' (those fits fail and are scored NaN), so
    # use saga, which handles both. saga needs more iterations to converge
    # than the previous max_iter=100 allowed.
    'Logistic Regression': LogisticRegression(
        random_state=42, max_iter=1000, multi_class='ovr', solver='saga'
    ),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Hyper-parameter grid per model, keyed by the same names as `models`.
param_grid = {
    'Logistic Regression': {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
    'Random Forest': {'n_estimators': [100, 200, 500], 'max_depth': [10, 20, 30]},
    'XGBoost': {'n_estimators': [100, 200, 300], 'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.2]}
}

# Track the winner across models. Selection uses the cross-validated score so
# the test set is only used for reporting, not for choosing the model.
best_name = None
best_model = None
best_cv_score = -np.inf

for name, model in models.items():
    print(f"--- Tuning {name} ---")
    grid_search = GridSearchCV(model, param_grid[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    tuned_model = grid_search.best_estimator_

    # Held-out evaluation of this model's best configuration.
    y_pred = tuned_model.predict(X_test)
    print(classification_report(y_test, y_pred))
    auc_score = roc_auc_score(y_test, tuned_model.predict_proba(X_test), multi_class='ovr')
    print(f"Best AUC: {auc_score}")
    print(f"Best Model Parameters: {grid_search.best_params_}")
    print("\n")

    # A NaN score (every fit failed) compares False and is skipped.
    if grid_search.best_score_ > best_cv_score:
        best_name = name
        best_model = tuned_model
        best_cv_score = grid_search.best_score_

# Save the best model (the one with the highest cross-validated accuracy,
# rather than whichever model happened to be tuned last).
if best_model is None:
    raise RuntimeError("No model produced a valid cross-validation score; nothing to save.")
print(f"Selected model: {best_name} (CV accuracy: {best_cv_score:.4f})")
joblib.dump(best_model, 'best_ad_model.sav')
print("Best model saved as 'best_ad_model.sav'.")


Sample dataset saved as 'targeted_advertising_data.csv'.
   user_id  age  gender purchasing_pattern  online_behavior_score  ad_clicks
0        0   56    Male             Medium               0.043978          5
1        1   46    Male                Low               0.820686          0
2        2   32    Male             Medium               0.199031          5
3        3   60  Female                Low               0.531554          4
4        4   25    Male                Low               0.248763          1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   user_id                20000 non-null  int64  
 1   age                    20000 non-null  int64  
 2   gender                 20000 non-null  object 
 3   purchasing_pattern     20000 non-null  object 
 4   online_behavior_score  20000 non-null  float64
 5   ad_c