In [35]:
import scipy.sparse as ss
# Load the feature matrix and the label matrix, both stored as SciPy sparse
# .npz archives (the "chi2" in the file name presumably refers to chi-square
# feature selection done upstream -- confirm against the preprocessing notebook).
X, Y = (ss.load_npz(npz_path) for npz_path in ("../data/X_chi2.npz", "../data/Y.npz"))

In [36]:

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import StackingClassifier
from scipy.stats import randint
import numpy as np
import pandas as pd

In [37]:
# Split the dataset: 90% training / 10% held-out test, with a fixed seed.
#
# Y is loaded as a sparse matrix and flattened to a 1-D label vector here.
# The issparse guard keeps this cell safe to re-run: after the first execution
# Y is already a dense ndarray, and calling .toarray() on it would raise.
Y = np.ravel(Y.toarray()) if ss.issparse(Y) else np.ravel(Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

# Densify the feature matrices; GaussianNB (fitted later in this notebook)
# does not accept sparse input.
X_train = X_train.toarray()
X_test = X_test.toarray()

In [40]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report

# Hyper-parameter grid for Gaussian naive Bayes.
# The grid is empty, so the search below evaluates only the default
# configuration, i.e. it amounts to a plain 5-fold cross-validation run.
nb_params = {}

# Build the naive Bayes model
nb_model = GaussianNB()

# Stratified 5-fold splitter, shuffled with a fixed seed so folds are reproducible
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Use GridSearchCV to look for the best parameters, scored by accuracy
grid_search_nb = GridSearchCV(nb_model, nb_params, cv=kfold, scoring='accuracy')

grid_search_nb.fit(X_train, y_train)

# Print the best parameters
print("Best Native Bayes Parameters:", grid_search_nb.best_params_)

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True), so best_estimator_ is already trained; fitting it a second
# time would only repeat that work.
best_nb_model = grid_search_nb.best_estimator_

# Evaluate on the held-out test set
y_pred_nb = best_nb_model.predict(X_test)
print("Native Bayes Accuracy with Best Parameters:", accuracy_score(y_test, y_pred_nb))
print("Native Bayes Classification Report with Best Parameters:\n", classification_report(y_test, y_pred_nb))

# Print the confusion matrix
conf_matrix_nb = confusion_matrix(y_test, y_pred_nb)
print("Confusion Matrix:")
print(conf_matrix_nb)


Best Native Bayes Parameters: {}
Native Bayes Accuracy with Best Parameters: 0.6714801444043321
Native Bayes Classification Report with Best Parameters:
               precision    recall  f1-score   support

         0.0       0.60      0.95      0.73       263
         1.0       0.90      0.42      0.57       291

    accuracy                           0.67       554
   macro avg       0.75      0.68      0.65       554
weighted avg       0.76      0.67      0.65       554

Confusion Matrix:
[[250  13]
 [169 122]]


In [8]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Hyper-parameter grid for XGBoost: 3 x 3 x 3 x 3 = 81 candidate settings
xgb_params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
}

# Build the XGBoost model
xgb_model = XGBClassifier()

# Stratified 5-fold cross-validation, shuffled with a fixed seed
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive grid search scored by accuracy (81 candidates x 5 folds = 405 fits)
grid_search_xgb = GridSearchCV(xgb_model, xgb_params, cv=kfold, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train)

# Print the best parameters
print("Best XGBoost Parameters:", grid_search_xgb.best_params_)

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True), so best_estimator_ is already trained. The former extra
# .fit() call repeated a full training run for no benefit and was removed.
best_xgb_model = grid_search_xgb.best_estimator_

# Evaluate on the held-out test set
y_pred_xgb = best_xgb_model.predict(X_test)
print("XGBoost Accuracy with Best Parameters:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Classification Report with Best Parameters:\n", classification_report(y_test, y_pred_xgb))

# Print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_xgb)
print("Confusion Matrix:")
print(conf_matrix)


Best XGBoost Parameters: {'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 300, 'subsample': 1.0}
XGBoost Accuracy with Best Parameters: 0.6624548736462094
XGBoost Classification Report with Best Parameters:
               precision    recall  f1-score   support

         0.0       0.61      0.82      0.70       263
         1.0       0.76      0.52      0.62       291

    accuracy                           0.66       554
   macro avg       0.68      0.67      0.66       554
weighted avg       0.69      0.66      0.66       554

Confusion Matrix:
[[215  48]
 [139 152]]


In [43]:
# Random forest hyper-parameter space (3 x 4 x 3 x 3 = 108 combinations)
rf_params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Randomised search: samples 10 of the combinations above and scores each by
# accuracy on the `kfold` splitter defined in an earlier cell.
# The forest itself is seeded as well; without random_state on the estimator
# the selected parameters and the reported scores change from run to run
# (the recorded output below came from an unseeded run and may differ slightly).
random_search_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    n_iter=10,
    cv=kfold,
    scoring='accuracy',
    random_state=42,
)
random_search_rf.fit(X_train, y_train)

# Print the best parameters
print("Best Random Forest Parameters:", random_search_rf.best_params_)

# RandomizedSearchCV refits the winning configuration on all of X_train by
# default (refit=True), so best_estimator_ is already trained and is used
# as-is rather than being fitted again.
best_rf_model = random_search_rf.best_estimator_

# Evaluate on the held-out test set
y_pred_rf = best_rf_model.predict(X_test)
print("Random Forest Accuracy with Best Parameters:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report with Best Parameters:\n", classification_report(y_test, y_pred_rf))

# Print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:")
print(conf_matrix)

Best Random Forest Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}
Random Forest Accuracy with Best Parameters: 0.6552346570397112
Random Forest Classification Report with Best Parameters:
               precision    recall  f1-score   support

         0.0       0.63      0.68      0.65       263
         1.0       0.69      0.63      0.66       291

    accuracy                           0.66       554
   macro avg       0.66      0.66      0.66       554
weighted avg       0.66      0.66      0.66       554

Confusion Matrix:
[[179  84]
 [107 184]]


In [42]:
# SVM hyper-parameter space (3 x 3 x 2 = 18 combinations)
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
}

# Randomised search: samples 10 of the 18 combinations and scores each by
# accuracy on the `kfold` splitter defined in an earlier cell.
random_search_svm = RandomizedSearchCV(SVC(), svm_params, n_iter=10, cv=kfold, scoring='accuracy', random_state=42)
random_search_svm.fit(X_train, y_train)

# Print the best parameters
print("Best SVM Parameters:", random_search_svm.best_params_)

# RandomizedSearchCV refits the winning configuration on all of X_train by
# default (refit=True), so best_estimator_ is already trained. The former
# extra .fit() call repeated a full SVC training run and was removed.
best_svm_model = random_search_svm.best_estimator_

# Evaluate on the held-out test set
y_pred_svm = best_svm_model.predict(X_test)
print("SVM Accuracy with Best Parameters:", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report with Best Parameters:\n", classification_report(y_test, y_pred_svm))

# Print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_svm)
print("Confusion Matrix:")
print(conf_matrix)

Best SVM Parameters: {'kernel': 'rbf', 'gamma': 'scale', 'C': 10}
SVM Accuracy with Best Parameters: 0.6805054151624549
SVM Classification Report with Best Parameters:
               precision    recall  f1-score   support

         0.0       0.62      0.84      0.71       263
         1.0       0.79      0.54      0.64       291

    accuracy                           0.68       554
   macro avg       0.70      0.69      0.68       554
weighted avg       0.71      0.68      0.67       554

Confusion Matrix:
[[220  43]
 [134 157]]


In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

# Base learners of the stack: the four tuned estimators selected by the
# searches in the earlier cells, each paired with the name it gets in the stack.
base_estimators = [
    ('xgboost', best_xgb_model),
    ('svm', best_svm_model),
    ('random_forest', best_rf_model),
    ('naive_bayes', best_nb_model),
]

# Define the stacking model; a default logistic regression is the final estimator
stacked_model = StackingClassifier(estimators=base_estimators, final_estimator=LogisticRegression())

# Train the stacking model
stacked_model.fit(X_train, y_train)

# Predict on the held-out test set and report the metrics
stacked_pred = stacked_model.predict(X_test)
stacked_accuracy = accuracy_score(y_test, stacked_pred)
stacked_report = classification_report(y_test, stacked_pred)
print("Stacked Model Accuracy:", stacked_accuracy)
print("Stacked Model Classification Report:\n", stacked_report)




Stacked Model Accuracy: 0.6805054151624549
Stacked Model Classification Report:
               precision    recall  f1-score   support

         0.0       0.62      0.82      0.71       263
         1.0       0.77      0.56      0.65       291

    accuracy                           0.68       554
   macro avg       0.70      0.69      0.68       554
weighted avg       0.70      0.68      0.68       554



In [45]:
# Print the confusion matrix of the stacked model on the test set
conf_matrix = confusion_matrix(y_test, stacked_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

Confusion Matrix:
[[215  48]
 [129 162]]


In [48]:
import pickle

# Save the fitted stacking model to disk as a pickle file
file_name = 'model.pkl'
with open(file_name, mode='wb') as model_file:
    pickle.dump(stacked_model, model_file)

In [49]:
# Usage example: read the pickled model back from disk.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# load pickle files that come from a trusted source (here: the file written by
# the previous cell).
with open(file_name, mode='rb') as model_file:
    model = pickle.load(model_file)