In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Tải dữ liệu từ liên kết và chuẩn bị dữ liệu cho mô hình

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, KFold

# Tải dữ liệu từ liên kết
url = "/content/drive/MyDrive/SVM/online_shoppers_intention.csv"
data = pd.read_csv(url)

# Kiểm tra các giá trị null
print(data.isnull().sum())

# Loại bỏ các hàng trùng lặp
data.drop_duplicates(inplace=True)

# Loại bỏ hai cột 'OperatingSystems' và 'Browser'
data = data.drop(columns=['OperatingSystems', 'Browser'])

# Chia dữ liệu thành features và labels
X = data.drop('Revenue', axis=1)
y = data['Revenue']

# Chuyển đổi các biến phân loại thành biến số
X = pd.get_dummies(X)

# Xem thông tin dữ liệu
print(data.info())
data.head()

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 12205 entries, 0 to 12329
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12205 non-null  int64  
 1   Administrative_Duration  12205 non-null  float64
 2   Informational            12205 non-null  int64  
 3   Informational_Duration   12205 non-null  float64
 4   ProductRelated           12205 n

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,1,4,Returning_Visitor,True,False


**Tiền xử lý dữ liệu**


chuẩn bị dữ liệu trước khi đưa vào mô hình

In [3]:
from sklearn.preprocessing import LabelEncoder
#Chuyển về dữ liệu số
# Using LabelEncoder 'VisitorType' column
data['VisitorType'] = LabelEncoder().fit_transform(data['VisitorType'])

# One-Hot Encoding for 'Month' column
data = pd.get_dummies(data, columns=['Month'], drop_first=True)

# Convert 'Weekend' and 'Revenue' columns from bool to int
data['Weekend'] = data['Weekend'].astype(int)
data['Revenue'] = data['Revenue'].astype(int)

# Dataset information
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12205 entries, 0 to 12329
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12205 non-null  int64  
 1   Administrative_Duration  12205 non-null  float64
 2   Informational            12205 non-null  int64  
 3   Informational_Duration   12205 non-null  float64
 4   ProductRelated           12205 non-null  int64  
 5   ProductRelated_Duration  12205 non-null  float64
 6   BounceRates              12205 non-null  float64
 7   ExitRates                12205 non-null  float64
 8   PageValues               12205 non-null  float64
 9   SpecialDay               12205 non-null  float64
 10  Region                   12205 non-null  int64  
 11  TrafficType              12205 non-null  int64  
 12  VisitorType              12205 non-null  int64  
 13  Weekend                  12205 non-null  int64  
 14  Revenue                  12

In [4]:
bool_columns = ['Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June',
                'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep']

data[bool_columns] = data[bool_columns].astype(int)


# Dataset information
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12205 entries, 0 to 12329
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12205 non-null  int64  
 1   Administrative_Duration  12205 non-null  float64
 2   Informational            12205 non-null  int64  
 3   Informational_Duration   12205 non-null  float64
 4   ProductRelated           12205 non-null  int64  
 5   ProductRelated_Duration  12205 non-null  float64
 6   BounceRates              12205 non-null  float64
 7   ExitRates                12205 non-null  float64
 8   PageValues               12205 non-null  float64
 9   SpecialDay               12205 non-null  float64
 10  Region                   12205 non-null  int64  
 11  TrafficType              12205 non-null  int64  
 12  VisitorType              12205 non-null  int64  
 13  Weekend                  12205 non-null  int64  
 14  Revenue                  12

In [5]:
# Feature and label extraction
X = data.drop('Revenue', axis=1)
y = data['Revenue']

**Huấn luyện mô hình SVM và đánh giá bằng F1-score**


sử dụng cross-validation với KFold
'linear': Sử dụng siêu phẳng để phân chia dữ liệu.
'rbf' (Radial Basis Function): Sử dụng các hàm Gauss để tạo ra ranh giới phi tuyến.
'poly': Kernel đa thức.
'sigmoid': Sử dụng hàm sigmoid, thường tương tự như một mạng nơ-ron.

In [6]:
from sklearn.svm import SVC
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold


# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle = True)

#chia dữ liệu đào tạo thành train - val
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.3, random_state=42, shuffle = True)

# ## Khử nhiễu
# # Tính toán Q1 (25%) và Q3 (75%)
# Q1 = x_train.quantile(0.25)
# Q3 = x_train.quantile(0.75)

# # Tính toán IQR
# IQR = Q3 - Q1

# # Lọc các giá trị ngoại lệ dựa trên IQR
# x_train_filtered = x_train[~((df < (Q1 - 1.5 * IQR)) | (x_train > (Q3 + 1.5 * IQR))).any(axis=1)]

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# svm_model = SVC()
# # Tạo danh sách các siêu tham số để tìm kiếm
# # param_grid = {
# #     'C': [0.1, 1, 10, 100],
# #     'gamma': [1, 0.1, 0.01, 0.001],
# #     'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
# # }
# # kernel='linear', random_state=42
# param_grid = {
#     'C': [0.1, 1, 10],
#     'gamma': [1, 0.1, 0.01],
#     'kernel': ['linear']
# }
# # GridSearchCV để tìm siêu tham số tối ưu
# grid = GridSearchCV(svm_model, param_grid, refit=True, verbose=3, scoring=make_scorer(f1_score, average='weighted'))
# grid.fit(X_train, y_train)

# print("best param : ", grid.best_params_)

# Tạo model
# svm_model_best = SVC(**grid.best_params_)
# best param :  {'C': 1, 'gamma': 1, 'kernel': 'linear'}
svm_model_best = SVC(C=1, gamma=1, kernel='linear')
svm_model_best.fit(X_train, y_train)

# KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Using F1-score as a metric
f1_scorer = make_scorer(f1_score, average='weighted')

# Calculate F1-score via cross-validation
scores = cross_val_score(svm_model_best, X_train, y_train, cv=kf, scoring=f1_scorer)
# Print average F1-score
print(f'Average F1-score: {scores.mean()}')

Average F1-score: 0.8720103749193904


Đánh giá

In [7]:
X_train.shape , X_val.shape, X_test.shape

((5980, 23), (2563, 23), (3662, 23))

In [8]:
from sklearn.metrics import classification_report

y_pred_train = svm_model_best.predict(X_train)
print("Train: ", classification_report(y_train,y_pred_train, target_names = ['False', 'True']   ))

Train:                precision    recall  f1-score   support

       False       0.90      0.97      0.94      5044
        True       0.75      0.42      0.54       936

    accuracy                           0.89      5980
   macro avg       0.83      0.70      0.74      5980
weighted avg       0.88      0.89      0.87      5980



In [9]:
from sklearn.metrics import classification_report
y_pred_val = svm_model_best.predict(X_val)
print("Train: ", classification_report(y_val,y_pred_val, target_names = ['False', 'True']   ))



Train:                precision    recall  f1-score   support

       False       0.96      0.42      0.59      2137
        True       0.24      0.92      0.38       426

    accuracy                           0.51      2563
   macro avg       0.60      0.67      0.49      2563
weighted avg       0.84      0.51      0.56      2563



In [10]:
from sklearn.metrics import classification_report
y_pred_test = svm_model_best.predict(X_test)
print("Train: ", classification_report(y_test,y_pred_test, target_names = ['False', 'True']   ))

Train:                precision    recall  f1-score   support

       False       0.90      0.97      0.94      3116
        True       0.74      0.40      0.52       546

    accuracy                           0.89      3662
   macro avg       0.82      0.69      0.73      3662
weighted avg       0.88      0.89      0.88      3662



In [11]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# # Plot F1-score chart
# sns.barplot(x=[f"Fold {i+1}" for i in range(len(f1_scores))], y=f1_scores)
# plt.title('F1-score across K-Folds')
# plt.ylabel('F1-score')
# plt.ylim(0, 1)
# plt.show()

**Logistic Regression và Random Forest**

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Khởi tạo các mô hình
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# Đánh giá mô hình Logistic Regression
f1_scores_logistic = cross_val_score(logistic_model, X, y, cv=kf, scoring=make_scorer(f1_score))
print(f"Trung bình F1-score của mô hình Logistic Regression: {f1_scores_logistic.mean()}")

# Đánh giá mô hình Random Forest
f1_scores_rf = cross_val_score(rf_model, X, y, cv=kf, scoring=make_scorer(f1_score))
print(f"Trung bình F1-score của mô hình Random Forest: {f1_scores_rf.mean()}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Trung bình F1-score của mô hình Logistic Regression: 0.5082570236477351
Trung bình F1-score của mô hình Random Forest: 0.6408170883153186
