# 01. Load the Dataset

In [1]:
import pandas as pd

df = pd.read_csv("/kaggle/input/advertising/advertising.csv")
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


In [2]:
df.shape

(1000, 10)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.3+ KB


# 02. Check Class Distribution

In [4]:
df['Clicked on Ad'].value_counts()

Clicked on Ad
0    500
1    500
Name: count, dtype: int64

In [5]:
df['Clicked on Ad'].value_counts(normalize=True) * 100

Clicked on Ad
0    50.0
1    50.0
Name: proportion, dtype: float64

# 03. Train / Test Split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X = df[['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage', 'Male']]
y = df['Clicked on Ad']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
X_train.shape, X_test.shape

((800, 5), (200, 5))

# 04. Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [11]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
X_train_scaled[:5], X_test_scaled[:5]

(array([[-1.00326609, -0.47592788, -1.07451316, -1.3690084 , -0.97043679],
        [ 0.83846843,  0.21195343,  0.17604694,  1.63718894, -0.97043679],
        [-1.89571151,  0.44124719, -0.70589945, -0.66623723,  1.03046381],
        [ 0.38687715, -0.70522164,  1.06561864,  1.01461045, -0.97043679],
        [ 1.19910986,  1.35842227, -0.6435648 , -1.00985486,  1.03046381]]),
 array([[-1.48770037,  0.67054096, -0.24715065, -1.17503807,  1.03046381],
        [ 0.48919573, -0.36128099,  0.81723285,  1.37582021,  1.03046381],
        [ 1.49722326, -0.47592788, -0.86742743,  0.71874287,  1.03046381],
        [ 0.56877685, -0.47592788,  1.25338347,  0.68835647, -0.97043679],
        [-1.35253878,  0.21195343, -2.58588416, -0.22666255, -0.97043679]]))

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# 05. Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

In [16]:
yy_pred = logreg.predict(X_test_scaled)
y_proba = logreg.predict_proba(X_test_scaled)[:,1]

In [17]:
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print("Logistic Regression Metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

NameError: name 'y_pred' is not defined

In [None]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# 06. Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtree = DecisionTreeClassifier(class_weight='balanced', random_state=42)
dtree.fit(X_train_scaled, y_train)

In [None]:
y_pred = dtree.predict(X_test_scaled)
y_proba = dtree.predict_proba(X_test_scaled)[:,1]

In [None]:
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print("Decision Tree Metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

In [None]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# 07. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train_scaled, y_train)

In [None]:
y_pred = rf.predict(X_test_scaled)
y_proba = rf.predict_proba(X_test_scaled)[:,1]

In [None]:
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print("Random Forest Metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

In [None]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# 08. AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
ada.fit(X_train_scaled, y_train)

In [None]:
y_pred = ada.predict(X_test_scaled)
y_proba = ada.predict_proba(X_test_scaled)[:,1]

In [None]:
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print("AdaBoost Metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

In [None]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# 09. XGBoost

In [None]:
import xgboost as xgb

In [None]:
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)
xgb_model.fit(X_train_scaled, y_train)

In [None]:
y_pred = xgb_model.predict(X_test_scaled)
y_proba = xgb_model.predict_proba(X_test_scaled)[:,1]

In [None]:
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print("XGBoost Metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

In [None]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# 10. Random Forest Feature Importance

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
importances = rf.feature_importances_
feat_names = X_train.columns
feat_imp = pd.Series(importances, index=feat_names).sort_values(ascending=False)

In [None]:
plt.figure(figsize=(6,4))
sns.barplot(x=feat_imp.values, y=feat_imp.index)
plt.title("Top Features — Random Forest")
plt.tight_layout()
plt.show()

In [None]:
print("Top 5 features (Random Forest):")
print(feat_imp.head())

# 11. XGBoost Feature Importance

In [None]:
importances = xgb_model.feature_importances_
feat_names = X_train.columns
feat_imp = pd.Series(importances, index=feat_names).sort_values(ascending=False)

In [None]:
plt.figure(figsize=(6,4))
sns.barplot(x=feat_imp.values, y=feat_imp.index)
plt.title("Top Features — XGBoost")
plt.tight_layout()
plt.show()

In [None]:
print("Top 5 features (XGBoost):")
print(feat_imp.head())

# 12. Save Model & Scaler

In [None]:
import joblib

joblib.dump(rf, "rf_model.joblib")
joblib.dump(scaler, "scaler.joblib")