In [1]:
import scipy.sparse as ss
# Load the feature matrix (X) and the label matrix (Y) from SciPy sparse
# .npz files stored under ../data.
X, Y = (
    ss.load_npz("../data/X_1day.npz"),
    ss.load_npz("../data/Y_1day.npz"),
)

In [2]:

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import StackingClassifier
from scipy.stats import randint
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

In [3]:
# Split the dataset into training and test partitions.
#
# Y is loaded as a SciPy sparse matrix; flatten it into a 1-D label vector.
# The sparse check keeps this cell idempotent: the unconditional
# `Y.toarray()` raised AttributeError when the cell was executed a second
# time, because Y had already been rebound to a NumPy array.
Y = np.ravel(Y.toarray() if ss.issparse(Y) else Y)

# Hold out 10% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=2)

# Densify the feature matrices once here; the cells below fit every model
# on these dense arrays.
X_train = X_train.toarray()
X_test = X_test.toarray()

from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Define the K-fold cross-validation shared by every hyper-parameter search
# below: 5 stratified, shuffled folds with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [4]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Parameter grid for the Naive Bayes model. It is empty, so the search
# evaluates a single candidate: GaussianNB with its default settings.
nb_params = {}

# Build the Naive Bayes model.
nb_model = GaussianNB()

# Use GridSearchCV (with the shared `kfold` splitter) to find the best parameters.
grid_search_nb = GridSearchCV(nb_model, nb_params, cv=kfold, scoring='accuracy')

grid_search_nb.fit(X_train, y_train)

# Print the best parameters.
# NOTE: the printed labels spell "Native Bayes" (sic, Naive Bayes); they are
# left untouched so the text matches previously recorded output.
print("Best Native Bayes Parameters:", grid_search_nb.best_params_)

# GridSearchCV defaults to refit=True, so `best_estimator_` has already been
# retrained on the whole of (X_train, y_train) with the best parameters.
# The extra `.fit(X_train, y_train)` that used to follow only repeated that
# work and has been removed.
best_nb_model = grid_search_nb.best_estimator_

# Evaluate on the held-out test set.
y_pred_nb = best_nb_model.predict(X_test)
print("Native Bayes Accuracy with Best Parameters:", accuracy_score(y_test, y_pred_nb))
print("Native Bayes Classification Report with Best Parameters:\n", classification_report(y_test, y_pred_nb))

# Print the confusion matrix (rows: true labels, columns: predicted labels).
conf_matrix_nb = confusion_matrix(y_test, y_pred_nb)
print("Confusion Matrix:")
print(conf_matrix_nb)


Best Native Bayes Parameters: {}
Native Bayes Accuracy with Best Parameters: 0.6441717791411042
Native Bayes Classification Report with Best Parameters:
               precision    recall  f1-score   support

         0.0       0.88      0.33      0.48       244
         1.0       0.59      0.96      0.73       245

    accuracy                           0.64       489
   macro avg       0.73      0.64      0.61       489
weighted avg       0.73      0.64      0.61       489

Confusion Matrix:
[[ 81 163]
 [ 11 234]]


In [5]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Parameter grid for XGBoost: 3 x 3 x 3 x 3 = 81 candidate combinations.
xgb_params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
}

# Build the XGBoost model.
xgb_model = XGBClassifier()

# Use GridSearchCV (with the shared `kfold` splitter) to find the best parameters.
grid_search_xgb = GridSearchCV(xgb_model, xgb_params, cv=kfold, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train)

# Print the best parameters.
print("Best XGBoost Parameters:", grid_search_xgb.best_params_)

# GridSearchCV defaults to refit=True, so `best_estimator_` is already
# trained on the full training set with the winning parameters. The second
# `.fit(X_train, y_train)` that used to follow was a redundant, costly
# retrain and has been removed.
best_xgb_model = grid_search_xgb.best_estimator_

# Evaluate on the held-out test set.
y_pred_xgb = best_xgb_model.predict(X_test)
print("XGBoost Accuracy with Best Parameters:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Classification Report with Best Parameters:\n", classification_report(y_test, y_pred_xgb))

# Print the confusion matrix (rows: true labels, columns: predicted labels).
conf_matrix = confusion_matrix(y_test, y_pred_xgb)
print("Confusion Matrix:")
print(conf_matrix)


Best XGBoost Parameters: {'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 300, 'subsample': 1.0}
XGBoost Accuracy with Best Parameters: 0.6666666666666666
XGBoost Classification Report with Best Parameters:
               precision    recall  f1-score   support

         0.0       0.74      0.51      0.61       244
         1.0       0.63      0.82      0.71       245

    accuracy                           0.67       489
   macro avg       0.68      0.67      0.66       489
weighted avg       0.68      0.67      0.66       489

Confusion Matrix:
[[125 119]
 [ 44 201]]


In [6]:
# Parameter ranges for the random forest.
rf_params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use RandomizedSearchCV to find the best parameters (10 sampled candidates).
# The estimator is seeded as well: `random_state` on the search only fixes
# which candidates are sampled, while an unseeded forest would still make
# the CV scores and the final model differ from run to run.
random_search_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    n_iter=10,
    cv=kfold,
    scoring='accuracy',
    random_state=42,
)
random_search_rf.fit(X_train, y_train)

# Print the best parameters.
print("Best Random Forest Parameters:", random_search_rf.best_params_)

# RandomizedSearchCV defaults to refit=True, so `best_estimator_` is already
# trained on the full training set with the winning parameters. The extra
# `.fit(X_train, y_train)` that used to follow has been removed: it repeated
# the work and, with an unseeded forest, replaced the search's model with a
# differently randomised one.
best_rf_model = random_search_rf.best_estimator_

# Evaluate on the held-out test set.
y_pred_rf = best_rf_model.predict(X_test)
print("Random Forest Accuracy with Best Parameters:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report with Best Parameters:\n", classification_report(y_test, y_pred_rf))

# Print the confusion matrix (rows: true labels, columns: predicted labels).
conf_matrix = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:")
print(conf_matrix)

Best Random Forest Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None}
Random Forest Accuracy with Best Parameters: 0.6830265848670757
Random Forest Classification Report with Best Parameters:
               precision    recall  f1-score   support

         0.0       0.74      0.57      0.64       244
         1.0       0.65      0.80      0.72       245

    accuracy                           0.68       489
   macro avg       0.69      0.68      0.68       489
weighted avg       0.69      0.68      0.68       489

Confusion Matrix:
[[138 106]
 [ 49 196]]


In [7]:
# Parameter ranges for the SVM.
# NOTE(review): csr_matrix is not used in this cell; the import is kept in
# case code outside this view relies on it.
from scipy.sparse import csr_matrix

svm_params = {
    'C': [0.1],
    # Other kernels that could be searched: 'linear', 'rbf', 'poly'
    'kernel': ['rbf'],
    'gamma': ['scale'],
}

# Every entry in svm_params is a finite list, so the number of distinct
# candidates is the product of the list lengths. Capping n_iter at that
# count avoids scikit-learn's "total space of parameters is smaller than
# n_iter" warning while still trying every combination (currently just one).
# If a continuous distribution is ever added to svm_params, drop this cap.
n_svm_candidates = int(np.prod([len(values) for values in svm_params.values()]))

# Use RandomizedSearchCV (with the shared `kfold` splitter) to find the best parameters.
random_search_svm = RandomizedSearchCV(
    SVC(),
    svm_params,
    n_iter=min(10, n_svm_candidates),
    cv=kfold,
    scoring='accuracy',
    random_state=42,
)
random_search_svm.fit(X_train, y_train)

# Print the best parameters.
print("Best SVM Parameters:", random_search_svm.best_params_)

# RandomizedSearchCV defaults to refit=True, so `best_estimator_` is already
# trained on the full training set; the redundant second
# `.fit(X_train, y_train)` has been removed.
best_svm_model = random_search_svm.best_estimator_

# Evaluate on the held-out test set.
y_pred_svm = best_svm_model.predict(X_test)
print("SVM Accuracy with Best Parameters:", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report with Best Parameters:\n", classification_report(y_test, y_pred_svm))

# Print the confusion matrix (rows: true labels, columns: predicted labels).
conf_matrix = confusion_matrix(y_test, y_pred_svm)
print("Confusion Matrix:")
print(conf_matrix)



Best SVM Parameters: {'kernel': 'rbf', 'gamma': 'scale', 'C': 0.1}
SVM Accuracy with Best Parameters: 0.5296523517382413
SVM Classification Report with Best Parameters:
               precision    recall  f1-score   support

         0.0       0.66      0.12      0.20       244
         1.0       0.52      0.94      0.67       245

    accuracy                           0.53       489
   macro avg       0.59      0.53      0.43       489
weighted avg       0.59      0.53      0.43       489

Confusion Matrix:
[[ 29 215]
 [ 15 230]]


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

# Base learners for the stack: the tuned estimators produced by the four
# search cells above, each paired with the name it carries inside the ensemble.
base_estimators = [
    ('xgboost', best_xgb_model),
    ('svm', best_svm_model),
    ('random_forest', best_rf_model),
    ('naive_bayes', best_nb_model),
]

# Define the stacking model; a logistic regression combines the base
# learners' outputs into the final prediction.
stacked_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(),
)

# Train the stacking model on the training partition.
stacked_model.fit(X_train, y_train)

# Predict on the held-out test set and report accuracy plus per-class metrics.
stacked_pred = stacked_model.predict(X_test)
stacked_accuracy = accuracy_score(y_test, stacked_pred)
stacked_report = classification_report(y_test, stacked_pred)
print("Stacked Model Accuracy:", stacked_accuracy)
print("Stacked Model Classification Report:\n", stacked_report)




Stacked Model Accuracy: 0.7075664621676891
Stacked Model Classification Report:
               precision    recall  f1-score   support

         0.0       0.80      0.55      0.65       244
         1.0       0.66      0.86      0.75       245

    accuracy                           0.71       489
   macro avg       0.73      0.71      0.70       489
weighted avg       0.73      0.71      0.70       489



In [9]:
# Print the confusion matrix of the stacked model on the test set
# (rows: true labels, columns: predicted labels).
conf_matrix = confusion_matrix(y_test, stacked_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

Confusion Matrix:
[[135 109]
 [ 34 211]]


In [10]:
import pickle

# Save the fitted stacking model to disk as a pickle file.
file_name = 'model.pkl'
with open(file_name, mode='wb') as model_file:
    pickle.dump(stacked_model, model_file)

In [11]:
# Usage example: read the pickled stacking model back from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source (here: the file written just above).
with open(file_name, mode='rb') as model_file:
    model = pickle.load(model_file)