IMPLEMENTING  XGBOOST MODEL FOR CTR PREDICTION

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Loading the dataset
data = pd.read_csv(r" dataset.csv ")

# Encoding categorical data
label_encoder = LabelEncoder()
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = label_encoder.fit_transform(data[col])

X = data.drop('Clicked on Ad', axis=1)
y = data['Clicked on Ad']

# Standardize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create instance of XGBoost model
model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Perform Grid Search with cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=model_xgb, param_grid=param_grid, cv=kf, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Cross-validation scores with the best model
scores = cross_val_score(best_model, X, y, cv=kf, scoring='accuracy')

# Model Training with the best model
best_model.fit(X_train, y_train)

# Make predictions using the best model
y_pred = best_model.predict(X_test)

# Performance metrics
average_accuracy = np.mean(scores)
print("XGBOOST MODEL")
print(f"Accuracy Score for each fold: {[round(score, 4) for score in scores]}")


accuracy2 = accuracy_score(y_test, y_pred)
print(f"Accuracy on test data: {accuracy2:.2f}")

report = classification_report(y_test, y_pred)
print(report)
print(f"Best Parameters: {grid_search.best_params_}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBOOST MODEL
Accuracy Score for each fold: [0.879, 0.878, 0.8735, 0.86, 0.878]
Accuracy on test data: 0.88
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      1014
           1       0.87      0.88      0.88       986

    accuracy                           0.88      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.88      0.88      0.88      2000

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 4, 'n_estimators': 300, 'subsample': 1.0}


In [2]:
#XGBOOST USING BAGGING TECHNIQUES


from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the base XGBoost model
base_model = XGBClassifier(random_state=42) 

# Create the BaggingClassifier with the XGBoost model as the base estimator
bagging_model = BaggingClassifier(
    base_estimator=base_model, 
    n_estimators=10,  # Number of base models to create
    random_state=42
)

# Fit the BaggingClassifier to the training data
bagging_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = bagging_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test data: {accuracy:.2f}")



Accuracy on test data: 0.89


IMPLEMTING CTR WITH DIFFERENT MODELS

In [None]:
#RANDOMM FOREST

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold

data = pd.read_csv(r"dataset.csv")

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Iterate through columns and encode object (string) type columns
for col in data.columns:
  if data[col].dtype == 'object':
    data[col] = label_encoder.fit_transform(data[col])

# Separate features (X) and target variable (y)
X = data.drop('Clicked on Ad', axis=1)
y = data['Clicked on Ad']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an XGBoost model
model_rf = RandomForestClassifier()

k=5
kf= KFold(n_splits=k, shuffle=True, random_state=42)
scores = cross_val_score(model_rf, X, y, cv=kf, scoring='accuracy')

# Train the model
model_rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model_rf.predict(X_test)


# performance metrics
average_accuracy = np.mean(scores)
print(f"Accuracy Score for each fold: {[round(score, 4) for score in scores]}")
print(f"Average accuracy across {k} folds: {average_accuracy:.2f}")

accuracy1 = accuracy_score(y_test, y_pred)
print(f"RANDOM FOREST-Accuracy: {accuracy1:.2f}")
report = classification_report(y_test, y_pred)
print(report)

Accuracy Score for each fold: [0.8405, 0.8405, 0.8295, 0.8355, 0.8365]
Average accuracy across 5 folds: 0.84
RANDOM FOREST-Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      1014
           1       0.86      0.83      0.85       986

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



In [None]:
#DECISION TREE

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

data = pd.read_csv(r"dataset.csv")

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Iterate through columns and encode object (string) type columns
for col in data.columns:
  if data[col].dtype == 'object':
    data[col] = label_encoder.fit_transform(data[col])

# Separate features (X) and target variable (y)
X = data.drop('Clicked on Ad', axis=1)
y = data['Clicked on Ad']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an XGBoost model
model = DecisionTreeClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"DECISION TREE-Accuracy: {accuracy:.2f}")
report = classification_report(y_test, y_pred)
print(report)

DECISION TREE-Accuracy: 0.78
              precision    recall  f1-score   support

           0       0.79      0.78      0.78      1014
           1       0.78      0.78      0.78       986

    accuracy                           0.78      2000
   macro avg       0.78      0.78      0.78      2000
weighted avg       0.78      0.78      0.78      2000



In [None]:
#LOGISTIC REGRESSION

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

data = pd.read_csv(r"dataset.csv")

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Iterate through columns and encode object (string) type columns
for col in data.columns:
  if data[col].dtype == 'object':
    data[col] = label_encoder.fit_transform(data[col])

# Separate features (X) and target variable (y)
X = data.drop('Clicked on Ad', axis=1)
y = data['Clicked on Ad']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an XGBoost model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"LOGISTIC REGRESSION-Accuracy: {accuracy:.2f}")
report = classification_report(y_test, y_pred)
print(report)

LOGISTIC REGRESSION-Accuracy: 0.64
              precision    recall  f1-score   support

           0       0.64      0.68      0.66      1014
           1       0.65      0.61      0.63       986

    accuracy                           0.64      2000
   macro avg       0.64      0.64      0.64      2000
weighted avg       0.64      0.64      0.64      2000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
