In [1]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
import pandas as pd
stock_price = pd.read_csv("Stock Market Prediction Analysis/DJIA_table(train).csv")[['Date', 'Close']]
stock_price["Date"] = pd.to_datetime(stock_price["Date"], format='%d-%m-%Y')
stock_price.sort_values(by='Date', inplace=True)
stock_price.set_index('Date', inplace=True)
stock_price.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2008-08-08,11734.32031
2008-08-11,11782.34961
2008-08-12,11642.46973
2008-08-13,11532.95996
2008-08-14,11615.92969


In [3]:
# 每日漲跌狀況，收盤價上升或維持不變時，為「1」；當道瓊工業平均收盤價下降時，為「0」
stock_price['up_down'] = (stock_price['Close'].shift(1) < stock_price['Close']).astype(int)
stock_price.dropna(inplace=True)
stock_price.head()

Unnamed: 0_level_0,Close,up_down
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2008-08-08,11734.32031,0
2008-08-11,11782.34961,1
2008-08-12,11642.46973,0
2008-08-13,11532.95996,0
2008-08-14,11615.92969,1


In [4]:
# 準備 文本訓練資料 
text_data = pd.read_csv("Stock Market Prediction Analysis/RedditNews(train).csv")
text_data.head()

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host
2,2016-07-01,"The president of France says if Brexit won, so..."
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...


In [5]:
from transformers import pipeline, BertTokenizer, BertForSequenceClassification

# 載入 week10 訓練好的文本情緒模型
model = BertForSequenceClassification.from_pretrained("./my_finance_model")
tokenizer = BertTokenizer.from_pretrained("./my_finance_model")

classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, device=0)

input_texts = text_data['News'].tolist()

predictions = classifier(input_texts)


  from .autonotebook import tqdm as notebook_tqdm


In [66]:
result_df = pd.DataFrame(predictions)
# 當日rank越高的新聞，設定越高的權重
result_df['text'] = input_texts
result_df['Date'] = text_data['Date']

# 只保留每日前 n 則新聞
n=22
result_df = result_df.groupby('Date').head(n)
print(len(result_df))


result_df['weight'] = [i for i in range(n, 0, -1)]*((len(result_df)//n))
result_df['weight'] = result_df['weight']/sum(range(1,n+1))
result_df['Date'] = text_data['Date']
# -1: 負面, 1: 正面
result_df['sentiment'] = result_df['label'].map({'LABEL_0': 0, 'LABEL_1': 100})

result_df = result_df[['Date', 'text', 'sentiment', 'weight']]

# 計算每日情緒分數, 單 row 的 sentiment 乘以 weight
result_df["Weighted_sentiment"] = result_df['sentiment'] * result_df['weight']
result_df["Weighted_sentiment_by_day"] = result_df.groupby('Date')['Weighted_sentiment'].transform(lambda x: x.sum())

result_df.head()

64746


Unnamed: 0,Date,text,sentiment,weight,Weighted_sentiment,Weighted_sentiment_by_day
0,2016-07-01,A 117-year-old woman in Mexico City finally re...,100,0.086957,8.695652,22.924901
1,2016-07-01,IMF chief backs Athens as permanent Olympic host,100,0.083004,8.300395,22.924901
2,2016-07-01,"The president of France says if Brexit won, so...",0,0.079051,0.0,22.924901
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...,0,0.075099,0.0,22.924901
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...,0,0.071146,0.0,22.924901


In [70]:
result_df = result_df[['Date', 'Weighted_sentiment_by_day']].drop_duplicates()
result_df.sort_values(by='Date', inplace=True)
result_df.head()


Unnamed: 0,Date,Weighted_sentiment_by_day
73583,2008-06-08,89.328063
73558,2008-06-09,97.628458
73533,2008-06-10,97.628458
73508,2008-06-11,100.0
73483,2008-06-12,96.837945


In [73]:
# 存下每日情緒分數
result_df.to_csv(f"week11/daily_sentiment_score_{result_df['Date'].iloc[0]}-{result_df['Date'].iloc[-1]}.csv", index=False)
