In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
# 查看features
feature_names = tfidf_vectorizer.get_feature_names_out()
feature_names[:50]


In [None]:
from sklearn.model_selection import train_test_split
# 拆分 Validation & Train
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=8)

## Logistic regression

In [40]:
# 設定 grid
import numpy as np

grid = np.linspace(0.0001,1000,20)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
largest_F1 = -1
bestc = -1

# 找到最好的c
for i in grid:
    clf = LogisticRegression(random_state=0, C=i, max_iter=1000).fit(x_valid,np.ravel(y_valid))
    y_pred = clf.predict(x_test)
    F1 = f1_score(y_test, y_pred)
    
    # 找到最好的F1
    if F1 > largest_F1:
        bestc = i
        largest_F1 = F1

In [41]:
# 建立 Logistic regression 模型
clf = LogisticRegression(random_state=0, C=bestc, max_iter=1000).fit(tfidf_train_array,y_train)
y_pred = clf.predict(x_test)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
from sklearn.metrics import precision_score
precicision = precision_score(y_test, y_pred)
from sklearn.metrics import recall_score
recall = recall_score(y_test, y_pred)

# 輸出各項資訊
print("bestc: ", bestc)
print("F1: ", largest_F1)
print("accuracy: ", accuracy)
print("recall: ", recall)
print("precicision: ", precicision)
coef = np.ravel(clf.coef_)
abs_coef = abs(np.ravel(clf.coef_))
coef_df = pd.DataFrame({"coef":coef, "feature":feature_names, "abs_coef":abs_coef})
coef_df = coef_df.sort_values(by='abs_coef', ascending=False)
print("\n前五十個權重：")
print(coef_df.head(50))

bestc:  157.89482105263158
F1:  0.8965517241379309
accuracy:  0.98125
recall:  0.961038961038961
precicision:  1.0

前五十個權重：
            coef feature   abs_coef
33414  12.410475      然而  12.410475
3818   12.180297      他們  12.180297
23352   9.617113     投資者   9.617113
41515   9.495834      能夠   9.495834
48757   8.752155      這一   8.752155
35839   8.482829      發展   8.482829
52262   8.123055      關注   8.123055
34      7.881541      一些   7.881541
19528   7.761840      帶來   7.761840
50      7.648649      一個   7.648649
19282   7.358026      市場   7.358026
24369   7.032255      提供   7.032255
20472   7.015514      引起   7.015514
49284   6.953579      通過   6.953579
3743   -6.475958      今年   6.475958
30672  -6.424496      每日   6.424496
18602  -6.387761      就是   6.387761
49197  -6.334506      通知   6.334506
27247   6.117511      更加   6.117511
31943   5.886122     消費者   5.886122
48526   5.747559      近日   5.747559
14564  -5.724426      報導   5.724426
11968   5.643634      同時   5.643634
51173   5.63

## RandomForestClassifier

In [42]:
grid = np.linspace(5,1000,10)

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
largest_F1 = -1
best_n = -1
for i in grid:
    clf = RandomForestClassifier(n_estimators=int(i), random_state=0)
    clf.fit(x_valid,np.ravel(y_valid))
    y_pred = clf.predict(x_test)
    F1 = f1_score(y_test, y_pred)
    if F1 > largest_F1:
        best_n = i
        largest_F1 = F1

In [44]:
# 執行 Random Forest
clf = RandomForestClassifier(n_estimators= int(best_n), random_state=0)
clf.fit(x_train,np.ravel(y_train))
y_pred = clf.predict(x_test)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
from sklearn.metrics import precision_score
precicision = precision_score(y_test, y_pred)
from sklearn.metrics import recall_score
recall = recall_score(y_test, y_pred)
print("n_estimators: ", best_n)
print("F1: ", largest_F1)
print("accuracy: ", accuracy)
print("recall: ", recall)
print("precicision: ", precicision)
important = np.ravel(clf.feature_importances_)
coef_df = pd.DataFrame({"importance":important, "feature":feature_names})
coef_df = coef_df.sort_values(by='importance', ascending=False)
print("\n前二十個權重：")
print(coef_df.head(20))

n_estimators:  115.55555555555556
F1:  0.9387755102040817
accuracy:  0.9625
recall:  0.974025974025974
precicision:  0.9493670886075949

前二十個權重：
       importance feature
41515    0.010833      能夠
3818     0.010700      他們
33414    0.010316      然而
3743     0.009078      今年
52262    0.008443      關注
30672    0.007562      每日
30433    0.007542      此外
43600    0.007492      行業
49284    0.007094      通過
23352    0.006921     投資者
51173    0.006823      重要
49197    0.006568      通知
50       0.006518      一個
19528    0.006111      帶來
48877    0.006050      這些
14564    0.005937      報導
11968    0.005855      同時
20322    0.005432      廣泛
329      0.005218      一直
22717    0.004850      所以


## GradientBoosting Classifier

In [45]:
# 設定參數
Learning_rate_array = [0.1, 0.5, 1]
n_estimater = 1500

In [46]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score

best_lr = -1
best_stg_final = -1
best_F1_final = -1

# 尋找最佳的Learning rate
for i in Learning_rate_array:
    
    # 跑模型
    clf = GradientBoostingClassifier(n_estimators=1500, learning_rate=i).fit(x_valid, np.ravel(y_valid))
    y_stage_pred = clf.staged_predict(x_test)
    m = 0
    largest_F1 = -1
    best_nstg = -1
    F1_list = []
    for y_pred in y_stage_pred:
        F1 = f1_score(y_test,y_pred)
        F1_list.append(F1)
        if F1 > largest_F1:
            best_nstg = m
            largest_F1 = F1
        m += 1
    
    # 更新各項參數
    if largest_F1 > best_F1_final:
        best_F1_final = largest_F1
        best_lr = i
        best_stg_final = best_nstg

# 輸出預測能力
print("best_F1: ", best_F1_final)
print("best_lr: ",best_lr)
print("best__stage: ",best_stg_final)

best_F1:  0.9241379310344828
best_lr:  0.1
best__stage:  212


In [47]:
# 執行GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=best_nstg, learning_rate=best_lr).fit(x_train, np.ravel(y_train))
y_pred = clf.predict(x_test)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
from sklearn.metrics import precision_score
precicision = precision_score(y_test, y_pred)
from sklearn.metrics import recall_score
recall = recall_score(y_test, y_pred)

# 輸出各項資訊
print("F1: ", largest_F1)
print("accuracy: ", accuracy)
print("recall: ", recall)
print("precicision: ", precicision)
important = np.ravel(clf.feature_importances_)
coef_df = pd.DataFrame({"importance":important, "feature":feature_names})
coef_df = coef_df.sort_values(by='importance', ascending=False)
print("\n前二十個權重：")
print(coef_df.head(20))

F1:  0.9090909090909091
accuracy:  0.9625
recall:  0.948051948051948
precicision:  0.9733333333333334

前二十個權重：
       importance feature
33414    0.338456      然而
30672    0.064730      每日
52262    0.064460      關注
41515    0.054526      能夠
19528    0.053569      帶來
3818     0.049983      他們
49197    0.034050      通知
49284    0.030958      通過
3743     0.028416      今年
48757    0.027188      這一
48526    0.021771      近日
54909    0.018625      首先
14564    0.016893      報導
35839    0.016738      發展
30433    0.012247      此外
17890    0.012171     將繼續
34826    0.012111      甚至
934      0.011322      下午
20322    0.010848      廣泛
48877    0.008944      這些


## Stacking

In [48]:
# Level 0 Classifier build and conduct
clf_Lo = LogisticRegression(random_state=0, C=bestc,max_iter=1000).fit(x_train,np.ravel(y_train))
y_pred_logistic = clf_Lo.predict_proba(x_valid)

clf_RF = RandomForestClassifier(n_estimators= int(best_n), random_state=0)
clf_RF.fit(x_train,np.ravel(y_train))
y_pred_RF = clf_RF.predict_proba(x_valid)

clf_GB = GradientBoostingClassifier(n_estimators=best_nstg, learning_rate=best_lr).fit(x_train, np.ravel(y_train))
y_pred_GBC = clf_GB.predict_proba(x_valid)

In [49]:
# 合併不同的模型產出
import pandas as pd
y_pred_df = pd.DataFrame({"logistic":y_pred_logistic[:,1], "Random_Forest":y_pred_RF[:,1], "Gradient_Boosting":y_pred_GBC[:,1]})
print(y_pred_df)

# Level 1 Classifier
clf_Level_1 = LogisticRegression(random_state=0, penalty=None).fit(y_pred_df,np.ravel(y_valid))

     logistic  Random_Forest  Gradient_Boosting
0    0.051446       0.469565           0.195610
1    0.468895       0.486957           0.814278
2    0.965070       0.843478           0.949794
3    0.015741       0.408696           0.084419
4    0.039434       0.330435           0.453020
..        ...            ...                ...
139  0.001700       0.121739           0.022639
140  0.991596       0.869565           0.958049
141  0.010940       0.095652           0.045092
142  0.951066       0.773913           0.913031
143  0.014851       0.278261           0.179889

[144 rows x 3 columns]


In [50]:
# 測試模型成效
a = x_test
y_pred_logistic = clf_Lo.predict_proba(a)
y_pred_RF = clf_RF.predict_proba(a)
y_pred_GBC = clf_GB.predict_proba(a)
y_pred_df = pd.DataFrame({"logistic":y_pred_logistic[:,1], "Random_Forest":y_pred_RF[:,1], "Gradient_Boosting":y_pred_GBC[:,1]})
y_pred = clf_Level_1.predict(y_pred_df)

In [51]:
# 輸出accuracy recall precision等
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
F1_s = f1_score(y_pred,y_test)

print("")
print("bestc: ", bestc)
print("F1: ", F1_s)
print("accuracy: ", accuracy)
print("recall: ", recall)
print("precision: ", precision)
coef = np.ravel(clf_Level_1.coef_)
print("coeificient of stacking: ", coef)


bestc:  157.89482105263158
F1:  0.9934640522875817
accuracy:  0.99375
recall:  0.987012987012987
precision:  1.0
coeificient of stacking:  [ 3.37007952 14.61288773  1.60352663]
