In [11]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC


In [13]:
df = pd.read_csv('../.csv/ad_click_dataset.csv')
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10000 non-null  int64  
 1   full_name         10000 non-null  object 
 2   age               5234 non-null   float64
 3   gender            5307 non-null   object 
 4   device_type       8000 non-null   object 
 5   ad_position       8000 non-null   object 
 6   browsing_history  5218 non-null   object 
 7   time_of_day       8000 non-null   object 
 8   click             10000 non-null  int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 703.3+ KB
None
     id full_name   age      gender device_type ad_position browsing_history  \
0   670   User670  22.0         NaN     Desktop         Top         Shopping   
1  3044  User3044   NaN        Male     Desktop         Top              NaN   
2  5912  User5912  41.0  Non-Binary         NaN        Side        

In [14]:
print(df.isnull().sum())

id                     0
full_name              0
age                 4766
gender              4693
device_type         2000
ad_position         2000
browsing_history    4782
time_of_day         2000
click                  0
dtype: int64


In [15]:
# Preencher idade faltante com a mediana
df['age'] = df['age'].fillna(df['age'].median())

# Preencher browsing_history e time_of_day com 'Unknown'
df['browsing_history'] = df['browsing_history'].fillna('Unknown')
df['time_of_day'] = df['time_of_day'].fillna('Unknown')

In [16]:


categorical_cols = ['gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)



In [17]:
X = df_encoded.drop(['id', 'full_name', 'click'], axis=1)
y = df_encoded['click']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:

# Encontrar os melhores parâmetros
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 0.1, 1, 10],
    'kernel': ['rbf']
}

grid = GridSearchCV(SVC(class_weight='balanced'), param_grid, cv=5)
grid.fit(X_train, y_train)

print("Melhores parâmetros:", grid.best_params_)

Melhores parâmetros: {'C': 10, 'gamma': 10, 'kernel': 'rbf'}


In [20]:
# Use o melhor modelo encontrado pelo GridSearchCV
best_svm = grid.best_estimator_
y_pred = best_svm.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[417 288]
 [322 973]]
              precision    recall  f1-score   support

           0       0.56      0.59      0.58       705
           1       0.77      0.75      0.76      1295

    accuracy                           0.69      2000
   macro avg       0.67      0.67      0.67      2000
weighted avg       0.70      0.69      0.70      2000

