## 情緒分數相關性分析

### 載入套件

In [30]:
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from time import time
from sklearn import metrics
import scikitplot as skplt
import matplotlib.pyplot as plt

### 讀檔

In [31]:
## cvaw + stock sign
filePath = "./preprocess/tokens+stockSign/"
df_cvaw_covid = pd.read_csv(filePath + "cvaw_covid_stockSign.csv")[["Valence_Sum","Arousal_Sum", "words_Num"]]
df_cvaw_stock = pd.read_csv(filePath + "cvaw_stock_stockSign.csv")[["Valence_Sum","Arousal_Sum", "words_Num"]]
df_stockSign_covid = pd.read_csv(filePath + "cvaw_covid_stockSign.csv")[["date", "stockRise_mask", "stockRise_testKits", "stockRise_vaccine"]]
df_stockSign_stock = pd.read_csv(filePath + "cvaw_stock_stockSign.csv")[["date", "stockRise_mask", "stockRise_testKits", "stockRise_vaccine"]]

In [32]:
df_cvaw_covid["weighted_score"] = np.where(df_cvaw_covid["words_Num"] == 0, 0, 
                                           (df_cvaw_covid["Valence_Sum"] * (df_cvaw_covid["Arousal_Sum"] + df_cvaw_covid["words_Num"] * 5) / df_cvaw_covid["words_Num"]))

In [33]:
df_cvaw_covid

Unnamed: 0,Valence_Sum,Arousal_Sum,words_Num,weighted_score
0,-16.6,-18.8,34,-73.821176
1,23.3,-6.2,69,114.406377
2,-76.0,-50.7,125,-349.174400
3,11.2,-10.0,29,52.137931
4,-13.3,-21.2,41,-59.622927
...,...,...,...,...
7509,2.0,-7.6,8,8.100000
7510,-32.8,-0.8,47,-163.441702
7511,0.6,-1.4,4,2.790000
7512,-10.7,-18.2,29,-46.784828


In [34]:
df_cvaw_stock["weighted_score"] = np.where(df_cvaw_stock["words_Num"] == 0, 0, 
                                           (df_cvaw_stock["Valence_Sum"] * (df_cvaw_stock["Arousal_Sum"] + df_cvaw_stock["words_Num"] * 5) / df_cvaw_stock["words_Num"]))

In [35]:
df_cvaw_stock

Unnamed: 0,Valence_Sum,Arousal_Sum,words_Num,weighted_score
0,14.0,-2.4,21,68.400000
1,15.4,-10.2,22,69.860000
2,33.0,-21.3,51,151.217647
3,-5.2,-6.8,8,-21.580000
4,-15.1,-13.9,32,-68.940937
...,...,...,...,...
9724,-0.5,-6.1,14,-2.282143
9725,-2.2,0.0,4,-11.000000
9726,0.0,-3.4,4,0.000000
9727,5.4,-3.2,7,24.531429


### covid 板 testing & training split

In [36]:
x_train, x_test, y_train, y_test = train_test_split(df_cvaw_covid, df_stockSign_covid, test_size=0.20, random_state=404)

In [37]:
x_train

Unnamed: 0,Valence_Sum,Arousal_Sum,words_Num,weighted_score
7271,-8.4,-5.6,8,-36.120000
317,-4.4,-13.2,28,-19.925714
1925,-0.6,-3.4,3,-2.320000
3188,11.8,-10.2,17,51.920000
7289,-17.8,-2.6,20,-86.686000
...,...,...,...,...
6031,-1.2,-40.0,56,-5.142857
5108,-5.8,-3.0,54,-28.677778
5302,8.6,-5.5,13,39.361538
5994,-9.3,-5.1,12,-42.547500


In [38]:
# 檢查切出來的 x_train y_train index 是否相同
for i in range(len(x_train.index)):
    if x_train.index[i] != y_train.index[i]:
        print(x_train.index[i])

In [39]:
x_train = x_train.values.tolist()
x_test = x_test.values.tolist()
y_train_mask = y_train["stockRise_mask"].tolist()    # 單括號取list，雙括號取dataframe
y_train_testKits = y_train["stockRise_testKits"].tolist()
y_train_vaccine = y_train["stockRise_vaccine"].tolist()
y_test_mask = y_test["stockRise_mask"].tolist()
y_test_testKits = y_test["stockRise_testKits"].tolist()
y_test_vaccine = y_test["stockRise_vaccine"].tolist()

### stock 板 testing & training split

In [45]:
x_train, x_test, y_train, y_test = train_test_split(df_cvaw_stock, df_stockSign_stock, test_size=0.20, random_state=404)

In [46]:
# 檢查切出來的 x_train y_train index 是否相同
for i in range(len(x_train.index)):
    if x_train.index[i] != y_train.index[i]:
        print(x_train.index[i])

In [47]:
x_train = x_train.values.tolist()
x_test = x_test.values.tolist()
y_train_mask = y_train["stockRise_mask"].tolist()    # 單括號取list，雙括號取dataframe
y_train_testKits = y_train["stockRise_testKits"].tolist()
y_train_vaccine = y_train["stockRise_vaccine"].tolist()
y_test_mask = y_test["stockRise_mask"].tolist()
y_test_testKits = y_test["stockRise_testKits"].tolist()
y_test_vaccine = y_test["stockRise_vaccine"].tolist()

## 相關性分析（pearson）

In [14]:
covid_seti_stock = pd.concat([ x_train, y_train.iloc[:, 1:4] ], axis = 1)

In [15]:
covid_seti_stock

Unnamed: 0,Valence_Sum,Arousal_Sum,words_Num,weighted_score,stockRise_mask,stockRise_testKits,stockRise_vaccine
7271,-8.4,-5.6,8,-36.120000,0,0,1
317,-4.4,-13.2,28,-19.925714,-1,0,0
1925,-0.6,-3.4,3,-2.320000,1,0,0
3188,11.8,-10.2,17,51.920000,1,1,-1
7289,-17.8,-2.6,20,-86.686000,-1,1,1
...,...,...,...,...,...,...,...
6031,-1.2,-40.0,56,-5.142857,-1,1,1
5108,-5.8,-3.0,54,-28.677778,1,0,0
5302,8.6,-5.5,13,39.361538,0,-1,0
5994,-9.3,-5.1,12,-42.547500,0,0,-1


In [18]:
covid_seti_stock.corr()

Unnamed: 0,Valence_Sum,Arousal_Sum,words_Num,weighted_score,stockRise_mask,stockRise_testKits,stockRise_vaccine
Valence_Sum,1.0,-0.102465,-0.217906,0.997555,-0.00809,7.4e-05,0.016332
Arousal_Sum,-0.102465,1.0,-0.62084,-0.122668,-0.005795,-0.024196,0.011413
words_Num,-0.217906,-0.62084,1.0,-0.215588,0.006119,0.016564,-0.017786
weighted_score,0.997555,-0.122668,-0.215588,1.0,-0.007992,-0.000137,0.016067
stockRise_mask,-0.00809,-0.005795,0.006119,-0.007992,1.0,0.04463,0.021319
stockRise_testKits,7.4e-05,-0.024196,0.016564,-0.000137,0.04463,1.0,0.146321
stockRise_vaccine,0.016332,0.011413,-0.017786,0.016067,0.021319,0.146321,1.0


In [19]:
stock_seti_stock = pd.concat([ x_train, y_train.iloc[:, 1:4] ], axis = 1)

In [20]:
stock_seti_stock.corr()

Unnamed: 0,Valence_Sum,Arousal_Sum,words_Num,weighted_score,stockRise_mask,stockRise_testKits,stockRise_vaccine
Valence_Sum,1.0,0.506141,-0.507822,0.998632,-0.003337,0.024339,0.014364
Arousal_Sum,0.506141,1.0,-0.899368,0.485494,-0.007522,0.010416,0.010222
words_Num,-0.507822,-0.899368,1.0,-0.491561,0.010624,-0.01036,-0.00334
weighted_score,0.998632,0.485494,-0.491561,1.0,-0.003085,0.025024,0.014537
stockRise_mask,-0.003337,-0.007522,0.010624,-0.003085,1.0,0.105857,0.115206
stockRise_testKits,0.024339,0.010416,-0.01036,0.025024,0.105857,1.0,0.159451
stockRise_vaccine,0.014364,0.010222,-0.00334,0.014537,0.115206,0.159451,1.0


### 設定 SVM 參數

In [40]:
SVM_model = SVC(kernel = "linear", C = 1.0, probability=True)

### SVM Model 函式

In [41]:
def tokens_predict_stocks(x_train, y_train, x_test, y_test, tokens_name, predict_stocks_name, embedding):
    
    t0 = time()
    SVM_model.fit(x_train, y_train)
    print("done in %0.3fs." % (time() - t0))
    
    predicted_results = []
    excepted_results = []

    excepted_results.extend(y_test)
    predicted_results.extend(SVM_model.predict(x_test))
    
    print(tokens_name + " predict " + predict_stocks_name + "(SVM Linear + " + embedding + ")")
    print(metrics.classification_report(excepted_results,predicted_results))

### sentiment score predict stocks

### covid 板

#### 預測口罩類股漲跌效果

In [42]:
tokens_predict_stocks(x_train, y_train_mask, x_test, y_test_mask, "Covid sentiment score", "mask stock", "cvaw")

done in 11185.278s.
Covid sentiment score predict mask stock(SVM Linear + cvaw)
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       557
           0       0.00      0.00      0.00       324
           1       0.41      1.00      0.59       622

    accuracy                           0.41      1503
   macro avg       0.14      0.33      0.20      1503
weighted avg       0.17      0.41      0.24      1503



  _warn_prf(average, modifier, msg_start, len(result))


#### 預測檢測試劑類股漲跌效果

In [43]:
tokens_predict_stocks(x_train, y_train_testKits, x_test, y_test_testKits, "Covid sentiment score", "testKits stock", "cvaw")

done in 9324.887s.
Covid sentiment score predict testKits stock(SVM Linear + cvaw)
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       416
           0       0.42      1.00      0.59       636
           1       0.00      0.00      0.00       451

    accuracy                           0.42      1503
   macro avg       0.14      0.33      0.20      1503
weighted avg       0.18      0.42      0.25      1503



  _warn_prf(average, modifier, msg_start, len(result))


#### 預測疫苗類股漲跌效果

In [44]:
tokens_predict_stocks(x_train, y_train_vaccine, x_test, y_test_vaccine, "Covid sentiment score", "vaccine stock", "cvaw")

done in 18145.245s.
Covid sentiment score predict vaccine stock(SVM Linear + cvaw)
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       323
           0       0.52      1.00      0.69       787
           1       0.00      0.00      0.00       393

    accuracy                           0.52      1503
   macro avg       0.17      0.33      0.23      1503
weighted avg       0.27      0.52      0.36      1503



  _warn_prf(average, modifier, msg_start, len(result))


### scock 板

#### 預測口罩類股漲跌效果

In [48]:
tokens_predict_stocks(x_train, y_train_mask, x_test, y_test_mask, "Stock sentiment score", "mask stock", "cvaw")

done in 17531.810s.
Stock sentiment score predict mask stock(SVM Linear + cvaw)
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       695
           0       0.00      0.00      0.00       458
           1       0.41      1.00      0.58       793

    accuracy                           0.41      1946
   macro avg       0.14      0.33      0.19      1946
weighted avg       0.17      0.41      0.24      1946



  _warn_prf(average, modifier, msg_start, len(result))


#### 預測檢測試劑類股漲跌效果

In [49]:
tokens_predict_stocks(x_train, y_train_testKits, x_test, y_test_testKits, "Stock sentiment score", "testKits stock", "cvaw")

done in 17738.621s.
Stock sentiment score predict testKits stock(SVM Linear + cvaw)
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       525
           0       0.44      1.00      0.61       855
           1       0.00      0.00      0.00       566

    accuracy                           0.44      1946
   macro avg       0.15      0.33      0.20      1946
weighted avg       0.19      0.44      0.27      1946



  _warn_prf(average, modifier, msg_start, len(result))


#### 預測疫苗類股漲跌效果

In [50]:
tokens_predict_stocks(x_train, y_train_testKits, x_test, y_test_testKits, "Stock sentiment score", "testKits stock", "cvaw")

done in 18713.662s.
Stock sentiment score predict testKits stock(SVM Linear + cvaw)
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       525
           0       0.44      1.00      0.61       855
           1       0.00      0.00      0.00       566

    accuracy                           0.44      1946
   macro avg       0.15      0.33      0.20      1946
weighted avg       0.19      0.44      0.27      1946



  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
print("well done! YUO MADE IT!")

well done! YUO MADE IT!
