In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from bs4 import BeautifulSoup
from transformers import BertTokenizer, BertForSequenceClassification
import torch

In [2]:
data = pd.read_csv("data/NEWS_YAHOO_stock_prediction.csv")[['Date', 'title', 'Volume', 'Close']]

In [3]:
data.head()

Unnamed: 0,Date,title,Volume,Close
0,2020-01-27,Apple Set To Beat Q1 Earnings Estimates Tech ...,161940000,77.237503
1,2020-01-27,Tech Daily Intel Results Netflix Surge Appl...,161940000,77.237503
2,2020-01-27,7 Monster Stock Market Predictions For The Wee...,161940000,77.237503
3,2020-01-27,Apple Earnings Preview 5G Launch Expanding S...,161940000,77.237503
4,2020-01-27,Buy Surging Apple Microsoft Stock Before Qua...,161940000,77.237503


In [4]:
data = data.drop_duplicates(subset=['Date', 'Volume', 'Close'])

In [5]:
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d').dt.date

In [6]:
data.head()

Unnamed: 0,Date,title,Volume,Close
0,2020-01-27,Apple Set To Beat Q1 Earnings Estimates Tech ...,161940000,77.237503
8,2020-01-24,What To Do If A Stock On Your Buy List Has A...,146537600,79.577499
24,2020-01-23,Will GPU Adoption EPYC Deal Wins Aid AMD s Q...,104472000,79.807503
48,2020-01-22,Zacks Market Edge Highlights VEGN Apple Mic...,101832400,79.425003
78,2020-01-21,Apple s AAPL Low Cost IPhones Likely To Debu...,110843200,79.142502


In [7]:
data = data.set_index("Date")

In [8]:
data = data.asfreq('D').fillna(method='ffill')
data

Unnamed: 0_level_0,title,Volume,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-07-23,Summer Heat Scorches Europe And U S,487975600.0,21.565357
2012-07-24,Market Bait And Switch,565132400.0,21.461430
2012-07-25,Market Bait And Switch,565132400.0,21.461430
2012-07-26,Market Bait And Switch,565132400.0,21.461430
2012-07-27,Will AAPL Fall From The Tree,403936400.0,20.898571
...,...,...,...
2020-01-23,Will GPU Adoption EPYC Deal Wins Aid AMD s Q...,104472000.0,79.807503
2020-01-24,What To Do If A Stock On Your Buy List Has A...,146537600.0,79.577499
2020-01-25,What To Do If A Stock On Your Buy List Has A...,146537600.0,79.577499
2020-01-26,What To Do If A Stock On Your Buy List Has A...,146537600.0,79.577499


In [9]:
data['title'].sort_index()

Date
2012-07-23                 Summer Heat Scorches Europe And U S 
2012-07-24                               Market Bait And Switch
2012-07-25                               Market Bait And Switch
2012-07-26                               Market Bait And Switch
2012-07-27                       Will AAPL Fall From The Tree  
                                    ...                        
2020-01-23    Will GPU Adoption   EPYC Deal Wins Aid AMD s Q...
2020-01-24    What To Do If A Stock On Your  Buy List  Has A...
2020-01-25    What To Do If A Stock On Your  Buy List  Has A...
2020-01-26    What To Do If A Stock On Your  Buy List  Has A...
2020-01-27    Apple Set To Beat Q1 Earnings Estimates  Tech ...
Freq: D, Name: title, Length: 2745, dtype: object

#### Separating news data from the original data set for sentiment analysis.

In [10]:
# %%timeit
# Load FinBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone', do_lower_case=False)
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3) # 3 classes: positive, negative, neutral

# Function to perform sentiment analysis
def analyze_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    sentiment_scores = probabilities[:, 0] - probabilities[:, 1]  # positive - negative
    return sentiment_scores.item()  # return a single score for the text

# Example DataFrame with news text column
# data = {'news_text': ['Some positive news about the stock.', 'Negative sentiment in the market.', 'Neutral news for the company.']}
df = pd.DataFrame()
df['title'] = data['title']

In [11]:
data['title'].value_counts()

Stock Breakouts  Breakdowns And In Betweens                                    14
U S  stocks higher at close of trade  Dow Jones Industrial Average up 0 27     13
What s Driving China s Real Estate Rally   Part 3                              10
Apple Computer True Bullish Impulse Leg                                        10
Fed Effect Appears To Be Tapering Off                                           9
                                                                               ..
Japan Inc may win Sharp battle  but lose the LCD war                            1
UberEATS standalone app to come to U S  in coming weeks                         1
Apple makes progress on gender  racial diversity                                1
Samsung Electronics to produce new Qualcomm chips in blow to TSMC               1
Apple Set To Beat Q1 Earnings Estimates  Tech ETFs To Buy                       1
Name: title, Length: 1651, dtype: int64

In [12]:
# Apply sentiment analysis function to the news_text column
df['sentiment_score'] = df['title'].apply(analyze_sentiment)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [13]:
df.sort_index()

Unnamed: 0_level_0,title,sentiment_score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-07-23,Summer Heat Scorches Europe And U S,0.999247
2012-07-24,Market Bait And Switch,0.978765
2012-07-25,Market Bait And Switch,0.978765
2012-07-26,Market Bait And Switch,0.978765
2012-07-27,Will AAPL Fall From The Tree,0.967384
...,...,...
2020-01-23,Will GPU Adoption EPYC Deal Wins Aid AMD s Q...,0.992677
2020-01-24,What To Do If A Stock On Your Buy List Has A...,0.997706
2020-01-25,What To Do If A Stock On Your Buy List Has A...,0.997706
2020-01-26,What To Do If A Stock On Your Buy List Has A...,0.997706


In [14]:
df

Unnamed: 0_level_0,title,sentiment_score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-07-23,Summer Heat Scorches Europe And U S,0.999247
2012-07-24,Market Bait And Switch,0.978765
2012-07-25,Market Bait And Switch,0.978765
2012-07-26,Market Bait And Switch,0.978765
2012-07-27,Will AAPL Fall From The Tree,0.967384
...,...,...
2020-01-23,Will GPU Adoption EPYC Deal Wins Aid AMD s Q...,0.992677
2020-01-24,What To Do If A Stock On Your Buy List Has A...,0.997706
2020-01-25,What To Do If A Stock On Your Buy List Has A...,0.997706
2020-01-26,What To Do If A Stock On Your Buy List Has A...,0.997706


In [15]:
input_data = pd.merge(left= data, right=df, how = 'inner', on='Date')
input_data = input_data.rename(columns={'title_x':'title'}).drop(columns=['title_y'], axis=1)
input_data['Close'] = round(input_data['Close'], 2)

In [16]:
input_data

Unnamed: 0_level_0,title,Volume,Close,sentiment_score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-07-23,Summer Heat Scorches Europe And U S,487975600.0,21.57,0.999247
2012-07-24,Market Bait And Switch,565132400.0,21.46,0.978765
2012-07-25,Market Bait And Switch,565132400.0,21.46,0.978765
2012-07-26,Market Bait And Switch,565132400.0,21.46,0.978765
2012-07-27,Will AAPL Fall From The Tree,403936400.0,20.90,0.967384
...,...,...,...,...
2020-01-23,Will GPU Adoption EPYC Deal Wins Aid AMD s Q...,104472000.0,79.81,0.992677
2020-01-24,What To Do If A Stock On Your Buy List Has A...,146537600.0,79.58,0.997706
2020-01-25,What To Do If A Stock On Your Buy List Has A...,146537600.0,79.58,0.997706
2020-01-26,What To Do If A Stock On Your Buy List Has A...,146537600.0,79.58,0.997706


In [17]:
# plt.plot(input_data['Close'])

In [21]:
input_data = input_data.reset_index()
input_data.to_csv("stock_data_with_sentiment_scores.csv", index=False)

In [None]:
input_data.plot(x='Date', y='Close', kind='line')

#### Analyzing the TSA features of the data

In [3]:
input_data = pd.read_csv("stock_data_with_sentiment_scores.csv")
input_data

Unnamed: 0,Date,title,Volume,Close,sentiment_score
0,2012-07-23,Summer Heat Scorches Europe And U S,487975600.0,21.57,0.999247
1,2012-07-24,Market Bait And Switch,565132400.0,21.46,0.978765
2,2012-07-25,Market Bait And Switch,565132400.0,21.46,0.978765
3,2012-07-26,Market Bait And Switch,565132400.0,21.46,0.978765
4,2012-07-27,Will AAPL Fall From The Tree,403936400.0,20.90,0.967384
...,...,...,...,...,...
2740,2020-01-23,Will GPU Adoption EPYC Deal Wins Aid AMD s Q...,104472000.0,79.81,0.992677
2741,2020-01-24,What To Do If A Stock On Your Buy List Has A...,146537600.0,79.58,0.997706
2742,2020-01-25,What To Do If A Stock On Your Buy List Has A...,146537600.0,79.58,0.997706
2743,2020-01-26,What To Do If A Stock On Your Buy List Has A...,146537600.0,79.58,0.997706
