In [69]:
import pandas as pd
import yfinance as yf
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.express as px

In [48]:
# Fetch the VADER lexicon only when it is not already available locally, so
# that re-running the notebook does not contact the NLTK server every time.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/abrham/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [49]:
# Symbol and date window for the price history requested from yfinance.
ticker = 'AAPL'
start_date, end_date = '2010-01-01', '2020-01-01'

# Download the prices, move the date index into a regular column and rename
# it to lowercase 'date'.
stock_data = (
    yf.download(ticker, start=start_date, end=end_date)
    .reset_index()
    .rename(columns={'Date': 'date'})
)
print(stock_data.head(10))


[*********************100%***********************]  1 of 1 completed

        date      Open      High       Low     Close  Adj Close     Volume
0 2010-01-04  7.622500  7.660714  7.585000  7.643214   6.454505  493729600
1 2010-01-05  7.664286  7.699643  7.616071  7.656429   6.465666  601904800
2 2010-01-06  7.656429  7.686786  7.526786  7.534643   6.362821  552160000
3 2010-01-07  7.562500  7.571429  7.466071  7.520714   6.351058  477131200
4 2010-01-08  7.510714  7.571429  7.466429  7.570714   6.393281  447610800
5 2010-01-11  7.600000  7.607143  7.444643  7.503929   6.336883  462229600
6 2010-01-12  7.471071  7.491786  7.372143  7.418571   6.264801  594459600
7 2010-01-13  7.423929  7.533214  7.289286  7.523214   6.353168  605892000
8 2010-01-14  7.503929  7.516429  7.465000  7.479643   6.316372  432894000
9 2010-01-15  7.533214  7.557143  7.352500  7.354643   6.210814  594067600





In [50]:
# Load the headline file and reduce each timestamp to a calendar date.
# Timestamps are parsed as ISO 8601 and converted to UTC before the date is
# taken; values that cannot be parsed become NaT instead of raising.
local_data = pd.read_csv('../data/raw_analyst_ratings.csv').assign(
    date=lambda frame: pd.to_datetime(
        frame['date'], format='ISO8601', utc=True, errors='coerce'
    ).dt.date
)

print(local_data.head(10))


   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2                      71 Biggest Movers From Friday   
3           3       46 Stocks Moving In Friday's Mid-Day Session   
4           4  B of A Securities Maintains Neutral on Agilent...   
5           5  CFRA Maintains Hold on Agilent Technologies, L...   
6           6  UBS Maintains Neutral on Agilent Technologies,...   
7           7  Agilent Technologies shares are trading higher...   
8           8  Wells Fargo Maintains Overweight on Agilent Te...   
9           9         10 Biggest Price Target Changes For Friday   

                                                 url                publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...        Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...        Benzinga Insights   
2  https

In [51]:
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the VADER 'compound' polarity score for one headline."""
    return sia.polarity_scores(text)['compound']


# Score every headline and store the result alongside it.
# NOTE(review): every row in the file is scored and averaged, whatever company
# the headline is about (the preview above shows Agilent headlines). If the
# goal is sentiment for `ticker` only, filter the rows first -- TODO confirm
# which column identifies the company.
local_data['sentiment'] = local_data['headline'].map(_compound_score)

# Average the scores per calendar date, returning 'date' as a regular column.
sentiment_by_date = local_data.groupby('date')['sentiment']
daily_sentiment = sentiment_by_date.mean().reset_index()


In [52]:
# Preview the first ten per-day average sentiment scores.
print(daily_sentiment.head(n=10))

         date  sentiment
0  2009-02-14    0.22630
1  2009-04-27    0.00000
2  2009-04-29    0.00000
3  2009-05-22    0.00000
4  2009-05-27    0.75105
5  2009-05-29    0.00000
6  2009-05-30    0.00000
7  2009-06-01    0.28680
8  2009-06-02    0.00000
9  2009-06-05    0.00000


In [62]:
# Give both frames a datetime 'date' column so the join keys compare equal.
for frame in (stock_data, daily_sentiment):
    frame['date'] = pd.to_datetime(frame['date'])

# Inner-join closing prices with the daily sentiment on 'date' (only dates
# present in both frames survive), then order the result chronologically.
merged_data = (
    stock_data[['Close', 'date']]
    .merge(daily_sentiment, how='inner', on='date')
    .sort_values('date')
)


In [64]:
# Preview the first ten rows of the joined price/sentiment table.
print(merged_data.head(n=10))

      Close       date  sentiment
0  7.643214 2010-01-04   0.083270
1  7.656429 2010-01-05   0.174566
2  7.534643 2010-01-06   0.114301
3  7.520714 2010-01-07   0.098274
4  7.570714 2010-01-08   0.090283
5  7.503929 2010-01-11   0.135564
6  7.418571 2010-01-12   0.069658
7  7.523214 2010-01-13   0.094011
8  7.479643 2010-01-14   0.090039
9  7.354643 2010-01-15   0.069114


In [66]:
# Compute returns from the complete price history rather than from the merged
# frame. The merge is an inner join, so trading days without any headline are
# missing from merged_data; calling pct_change() there would measure the
# change across the gap and label it a one-day return.
full_returns = (
    stock_data.assign(date=pd.to_datetime(stock_data['date']))
    .sort_values('date')
    .set_index('date')['Close']
    .pct_change()
)

# Look up each merged row's return by its date. This gives the same result
# when the cell is re-run, whereas recomputing pct_change() on the already
# trimmed frame would drop one more leading row on every run.
merged_data['daily_return'] = merged_data['date'].map(full_returns)

# The first trading day has no previous close, so its return is NaN.
merged_data.dropna(inplace=True)

In [72]:
# Pearson correlation between each day's return and its mean headline
# sentiment.
returns = merged_data['daily_return']
scores = merged_data['sentiment']
corellation = returns.corr(scores, method='pearson')
print(corellation)

0.14355305489212528


In [73]:
# Scatter of daily return against sentiment with an OLS trend line overlaid.
plot_title = f'{ticker} Stock Sentiment vs. Daily Return'
fig = px.scatter(
    merged_data,
    x='sentiment',
    y='daily_return',
    trendline='ols',
    title=plot_title,
)
fig.show()