1. RF 及 XGBoost 的 feature importance、重要比重及繪圖 2. RF、AdaBoost 及 XGBoost 建立模型、混淆矩陣、Accuracy、Precision、Recall、F1、AUC、ROC 繳交程式與 Excel

In [None]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import os
# Switch the working directory to the dataset folder so the relative CSV path
# used when loading the data resolves against it.
os.chdir("/content/drive/MyDrive/大三/下學期/機器學習應用/Datasets/ Bank Marketing")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Feature Engineering import
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.preprocessing import LabelEncoder ,OrdinalEncoder
from imblearn.over_sampling import SMOTE

# Model import
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier ,AdaBoostClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance

# Evaluate import
from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_auc_score ,roc_curve ,classification_report, confusion_matrix ,precision_score, recall_score, f1_score

# sns.set(style="whitegrid")
# pandas display options: treat ambiguous/East-Asian characters as wide,
# show every column, and print floats with two digits of precision.
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option("display.precision", 2)
plt.rcParams['axes.unicode_minus'] = False # render minus signs correctly

# EDA
讀取資料，做資料探索

In [None]:
# Load the semicolon-separated CSV from the current working directory.
df = pd.read_csv("./bank-full.csv" ,sep =';')

# df # recorded shape: 45211 rows × 17 columns

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Summary statistics of the object (string) columns
df.describe(include=['object'])

In [None]:
# Plot one histogram (with a KDE overlay) per numeric column.
numerical_columns = df.select_dtypes(include='number').columns

# Derive the number of grid rows from the column count instead of hard-coding
# a 2x4 grid, so frames with more than eight numeric columns do not overflow
# the subplot grid. The layout stays 2x4 whenever eight or fewer columns exist.
n_cols = 4
n_rows = max(2, -(-len(numerical_columns) // n_cols))  # ceiling division

plt.figure(figsize=(14, 8))
for i, column in enumerate(numerical_columns, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.histplot(df[column], bins=20, kde=True)
    plt.title(f'Histogram of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Class balance of the target `y` on the raw data:
# bar chart of counts on the left, pie chart of shares on the right.
profile_counts = df['y'].value_counts()

plt.figure(figsize=(10, 6))

plt.subplot(1, 2, 1)
sns.countplot(x='y', data=df)
plt.title('y Count')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
plt.pie(profile_counts, autopct='%1.1f%%', labels=profile_counts.index)
plt.title('y Percentage')

plt.tight_layout()
plt.show()

# Recorded output: no 39922 (88.3%), yes 5289 (11.7%)
print(profile_counts)

由圖表和輸出內容得知，No: 39922 (88.3%)，Yes: 5289 (11.7%)

# Processing Data
刪除特定欄位,做異常值處理

In [None]:
# Remove the 'poutcome' feature, then remove the row whose index label is 4.
# NOTE(review): the reason for dropping row 4 is not visible here — confirm.
delete_columns = ['poutcome']
df = df.drop(columns=delete_columns)

df = df.drop(index=4)

In [None]:
# Remove outliers column by column with a 3-standard-deviation rule.
# df is reassigned inside the loop, so each column's mean/std are computed on
# the rows that survived the filters of the columns processed before it.
numerical_columns = df.select_dtypes(include='number').columns

for column in numerical_columns:
    means, stds = df[column].mean(), df[column].std()
    lower_bound, upper_bound = means - 3 * stds, means + 3 * stds

    # Keep rows whose value lies inside [lower_bound, upper_bound], inclusive.
    df = df[df[column].between(lower_bound, upper_bound)]

In [None]:
df # display the cleaned frame; recorded shape: 45211 rows × 17 columns ---> 40339 rows × 16 columns

In [None]:
# Class balance of the target `y` after column removal and outlier filtering:
# bar chart of counts on the left, pie chart of shares on the right.
profile_counts = df['y'].value_counts()

plt.figure(figsize=(10, 6))

plt.subplot(1, 2, 1)
sns.countplot(x='y', data=df)
plt.title('y Count')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
plt.pie(profile_counts, autopct='%1.1f%%', labels=profile_counts.index)
plt.title('y Percentage')

plt.tight_layout()
plt.show()

# Recorded output: no 36273 (89.9%), yes 4066 (10.1%)
print(profile_counts)

# Feature Engineering
將特徵和標籤做特徵工程轉換

In [None]:
# Label-encode the target y
lb_encoder = LabelEncoder()
df['y'] = lb_encoder.fit_transform(df['y'])
# Original values for column 'y': ['no' 'yes']
# Encoded values for column 'y': [0 1]

# Ordinal-encode the categorical feature columns.
# y has already been replaced by its encoded values above, so the object-dtype
# selection below is made on the frame after that replacement.
# NOTE(review): the encoder is fitted on the full frame before the train/test
# split — confirm this is acceptable for the assignment.
cat_columns = df.select_dtypes(include='object').columns
or_encoder = OrdinalEncoder()
df[cat_columns] = or_encoder.fit_transform(df[cat_columns])

# Train Test Split
資料分割，將資料集拆分為訓練集和測試集

In [None]:
# Separate features from the target and hold out 30% of the rows for testing.
X = df.drop(columns=['y'])
y = df['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print('X_train:', X_train.shape)
print('X_test:', X_test.shape)
# Report the label shapes themselves; the feature shapes used to be printed
# a second time under the y_train/y_test captions.
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)
print('='*30)
# Class proportions in the full data set
print(pd.Series(y).value_counts(normalize=True))
# Class proportions in the training labels
print(pd.Series(y_train).value_counts(normalize=True))
# Class proportions in the test labels
print(pd.Series(y_test).value_counts(normalize=True))

In [None]:
from collections import Counter

# Absolute number of samples per class in each split.
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

Training target statistics: Counter({0: 25392, 1: 2845})


Testing target statistics: Counter({0: 10881, 1: 1221})

# SMOTE
SMOTE處理資料不平衡

In [None]:
# Oversample with SMOTE, using the training split only as input.
smote = SMOTE(random_state=42)
X_resample ,y_resample = smote.fit_resample(X_train, y_train)

In [None]:
# Re-attach the resampled target as column 'y' next to the resampled features.
resampled_data = pd.concat([X_resample, pd.Series(y_resample, name='y')], axis=1)

resampled_data # recorded shape: 50784 rows × 16 columns

In [None]:
# Class balance of the target `y` in the SMOTE-resampled frame:
# bar chart of counts on the left, pie chart of shares on the right.
profile_counts = resampled_data['y'].value_counts()

plt.figure(figsize=(10, 6))

plt.subplot(1, 2, 1)
sns.countplot(x='y', data=resampled_data)
plt.title('y Count')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
plt.pie(profile_counts, autopct='%1.1f%%', labels=profile_counts.index)
plt.title('y Percentage')

plt.tight_layout()
plt.show()

# Recorded output: no 25392 (50%), yes 25392 (50%)
print(profile_counts)

In [None]:
# Split the SMOTE-resampled frame again into train and test parts.
# NOTE(review): resampled_data was produced by SMOTE from the earlier training
# split, so the test part created here contains synthetic rows, and the
# original hold-out X_test/y_test are overwritten. Metrics computed on this
# split are therefore not measured on untouched real data — confirm intent.
X_resample = resampled_data.drop('y' ,axis =1)
y_resample = resampled_data['y']

X_train, X_test, y_train, y_test = train_test_split(X_resample, y_resample, test_size=0.3, random_state=42)

print('X_train:', X_train.shape)
print('X_test:', X_test.shape)
# Report the label shapes themselves; the feature shapes used to be printed
# a second time under the y_train/y_test captions.
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)
print('='*30)
# Class proportions in the training labels
print(pd.Series(y_train).value_counts(normalize=True))
# Class proportions in the test labels
print(pd.Series(y_test).value_counts(normalize=True))

In [None]:
# Absolute number of samples per class in each split of the resampled data.
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

Training target statistics: Counter({1: 17781, 0: 17767})

Testing target statistics: Counter({0: 7625, 1: 7611})

# Feature_Importances
找出重要特徵

In [None]:
# Fit a 100-tree random forest and rank the features by `feature_importances_`.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

importances = rf_model.feature_importances_
feature_scores = pd.Series(importances, index=X_train.columns)
feature_scores = feature_scores.sort_values(ascending=False)
print(feature_scores)

# Bar chart of the ranked importances, highest first.
plt.figure(figsize=(10, 6))
feature_scores.plot(kind='bar', color='skyblue')
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.xticks(rotation=60)
plt.tight_layout()
plt.show()

In [None]:
# Fit an XGBoost classifier on the current training split, print its raw
# importance array and draw xgboost's built-in importance plot.
xgb_model = XGBClassifier(random_state=42)

xgb_model.fit(X_train, y_train)

print(xgb_model.feature_importances_)
plot_importance(xgb_model)
plt.show()

In [None]:
# SMOTE_before
# RandomForestClassifier, AdaBoostClassifier and XGBClassifier share the same
# top-50% features, so a single split serves all three models.
selected_features = ['duration' ,'balance' ,'age' ,'month' ,'day' ,'pdays' ,'job']
X = df[selected_features]
y = df['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print('X_train:', X_train.shape)
print('X_test:', X_test.shape)
# Report the label shapes themselves; the feature shapes used to be printed
# a second time under the y_train/y_test captions.
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)
print('='*30)
# Class proportions in the training labels
print(pd.Series(y_train).value_counts(normalize=True))
# Class proportions in the test labels
print(pd.Series(y_test).value_counts(normalize=True))

In [None]:
# SMOTE_after
# RandomForestClassifier and AdaBoostClassifier: split on their selected features.
selected_features = ['duration' ,'housing' ,'contact' ,'education' ,'marital' ,'month' ,'job']
X = resampled_data[selected_features]
y = resampled_data['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print('X_train:', X_train.shape)
print('X_test:', X_test.shape)
# Report the label shapes themselves; the feature shapes used to be printed
# a second time under the y_train/y_test captions.
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)
print('='*30)
# Class proportions in the training labels
print(pd.Series(y_train).value_counts(normalize=True))
# Class proportions in the test labels
print(pd.Series(y_test).value_counts(normalize=True))

In [None]:
# SMOTE_after
# XGBClassifier: separate split built from the feature subset chosen for XGBoost.
selected_features = ['duration' ,'balance' ,'age' ,'month' ,'day' ,'pdays' ,'job']
X = resampled_data[selected_features]
y = resampled_data['y']

# Stored under *_xgb names so X_train/X_test/y_train/y_test are not overwritten.
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(X, y, test_size=0.3, random_state=42)

# RandomForestClassifier

In [None]:
# Train a 100-tree random forest on the current training split.
rf_model = RandomForestClassifier(n_estimators=100 ,random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
def _print_split_report(title, y_true, y_pred):
    """Print accuracy, confusion matrix and classification report for one split."""
    accuracy = accuracy_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred)

    print(title)
    print("Accuracy:", accuracy)
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(report)


def _positive_class_scores(model, X):
    """Return continuous scores for class 1, falling back to hard predictions."""
    if hasattr(model, "predict_proba"):
        # Column 1 holds the probability of the class encoded as 1.
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X)
    # Last resort for models exposing neither method: AUC of the hard labels.
    return model.predict(X)


def evaluate_model_performance(model, X_train, X_test, y_train, y_test):
    """Print classification metrics of a fitted binary classifier.

    For the training split, prints accuracy, the confusion matrix and the
    classification report. For the test split, prints the same three items
    followed by the ROC AUC.

    The AUC is computed from continuous scores (`predict_proba`, else
    `decision_function`) rather than from hard 0/1 predictions; scoring the
    hard labels evaluates a single threshold only and is inconsistent with
    ROC curves drawn from `predict_proba`.

    Args:
        model: Fitted classifier exposing `predict`.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. All results are written to standard output.
    """
    # Model scores on the training split
    _print_split_report("Training Set Evaluation:", y_train, model.predict(X_train))
    print("\n")

    # Model scores on the test split
    _print_split_report("Testing Set Evaluation:", y_test, model.predict(X_test))
    test_auc = roc_auc_score(y_test, _positive_class_scores(model, X_test))
    print("AUC:")
    print(test_auc)
# Report train/test metrics of the random forest on the current split.
evaluate_model_performance(rf_model, X_train, X_test, y_train, y_test)

# AdaBoostClassifier

In [None]:
# Use a decision tree as the base estimator
base_model = DecisionTreeClassifier(random_state=42)

# Boost up to 500 copies of the base tree on the current training split.
ada_model = AdaBoostClassifier(estimator=base_model ,random_state=42 ,n_estimators=500)
ada_model.fit(X_train, y_train)

In [None]:
# Report train/test metrics of the AdaBoost model on the current split.
evaluate_model_performance(ada_model, X_train, X_test, y_train, y_test)

# XGBoostClassifier

In [None]:
# Train an XGBoost classifier on the current training split.
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

In [None]:
# Report train/test metrics of the XGBoost model on the current split.
evaluate_model_performance(xgb_model, X_train, X_test, y_train, y_test)

In [None]:
# Fit the same xgb_model object again, this time on the XGBoost-specific
# feature split, and report its metrics on that split.
# NOTE(review): after this cell xgb_model presumably expects the *_xgb feature
# columns rather than those of X_train/X_test — verify before reusing it.
xgb_model.fit(X_train_xgb, y_train_xgb)
evaluate_model_performance(xgb_model, X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb)

# ROC AUC

In [None]:
# SMOTE_before
# ROC curves of the three classifiers, all scored on the shared X_test/y_test.

# Scores: column 1 of predict_proba for each fitted model.
xgb_y_score = xgb_model.predict_proba(X_test)[:, 1]
rf_y_score = rf_model.predict_proba(X_test)[:, 1]
ada_y_score = ada_model.predict_proba(X_test)[:, 1]

# ROC points and area under the curve per model.
roc_auc_xgb = roc_auc_score(y_test, xgb_y_score)
roc_auc_rf = roc_auc_score(y_test, rf_y_score)
roc_auc_ada = roc_auc_score(y_test, ada_y_score)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, xgb_y_score)
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_y_score)
fpr_ada, tpr_ada, _ = roc_curve(y_test, ada_y_score)

# Draw the curves in a fixed order so the legend order stays stable.
plt.figure(figsize=(10, 8))
plt.plot(fpr_xgb, tpr_xgb, color='blue', lw=2, label=f'XGBoost (AUC = {roc_auc_xgb:.2f})')
plt.plot(fpr_rf, tpr_rf, color='purple', lw=2, label=f'Random Forest (AUC = {roc_auc_rf:.2f})')
plt.plot(fpr_ada, tpr_ada, color='black', lw=2, label=f'AdaBoost (AUC = {roc_auc_ada:.2f})')

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')

plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()

In [None]:
# SMOTE_after
# ROC curves of the three classifiers. XGBoost is scored on its own split
# (X_test_xgb/y_test_xgb); random forest and AdaBoost on X_test/y_test.

# Scores: column 1 of predict_proba for each fitted model.
xgb_y_score = xgb_model.predict_proba(X_test_xgb)[:, 1]
rf_y_score = rf_model.predict_proba(X_test)[:, 1]
ada_y_score = ada_model.predict_proba(X_test)[:, 1]

# ROC points and area under the curve per model.
roc_auc_xgb = roc_auc_score(y_test_xgb, xgb_y_score)
roc_auc_rf = roc_auc_score(y_test, rf_y_score)
roc_auc_ada = roc_auc_score(y_test, ada_y_score)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test_xgb, xgb_y_score)
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_y_score)
fpr_ada, tpr_ada, _ = roc_curve(y_test, ada_y_score)

# Draw the curves in a fixed order so the legend order stays stable.
plt.figure(figsize=(10, 8))
plt.plot(fpr_xgb, tpr_xgb, color='blue', lw=2, label=f'XGBoost (AUC = {roc_auc_xgb:.2f})')
plt.plot(fpr_rf, tpr_rf, color='purple', lw=2, label=f'Random Forest (AUC = {roc_auc_rf:.2f})')
plt.plot(fpr_ada, tpr_ada, color='black', lw=2, label=f'AdaBoost (AUC = {roc_auc_ada:.2f})')

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')

plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()