In [13]:
import requests
import pandas
import datetime
import json
import time

In [4]:
# VGT Holdings List
vgt_ticker_list = ["MSFT","AAPL","NVDA","AVGO","AMD","CRM","ADBE","ACN","CSCO","INTU","ORCL","INTC","QCOM","AMAT","IBM","NOW","TXN","LRCX","MU","ADI","KLAC","PANW","SNPS","CDNS","CRWD",
                    "ANET","APH","NXPI","MRVL","WDAY","ROP","ADSK","MSI","SNOW","SMCI","FTNT","PLTR","MCHP","TEL","CTSH","IT","DDOG","MPWR","ON","CDW","TEAM","MDB","FICO","HUBS","NET"]
# Fetch news articles from polygon
with open("news_articles.json", "w") as f:
    for ticker in vgt_ticker_list:
        try:
            news_articles = requests.get("https://api.polygon.io/v2/reference/news?ticker=" + ticker + "&order=asc&limit=1000&sort=published_utc&apiKey=BsGfmvFKJ_9kIzo8sxfnLo4NfvaPZ5Gm").json()
            results = news_articles["results"]
            f.write(json.dumps(results))
            print("Wrote " + ticker + " articles")
        except KeyError:
            print("No results found for " + ticker)
        
        time.sleep(13) # Basic API plan allows for 5 calls per minute we can only do 1 ticker at once
        

Wrote MSFT articles
Wrote AAPL articles
Wrote NVDA articles
Wrote AVGO articles
Wrote AMD articles
Wrote CRM articles
Wrote ADBE articles
Wrote ACN articles
Wrote CSCO articles
Wrote INTU articles
Wrote ORCL articles
Wrote INTC articles
Wrote QCOM articles
Wrote AMAT articles
Wrote IBM articles
Wrote NOW articles
Wrote TXN articles
Wrote LRCX articles
Wrote MU articles
Wrote ADI articles
Wrote KLAC articles
Wrote PANW articles
Wrote SNPS articles
Wrote CDNS articles
Wrote CRWD articles
Wrote ANET articles
Wrote APH articles
Wrote NXPI articles
Wrote MRVL articles
Wrote WDAY articles
Wrote ROP articles
Wrote ADSK articles
Wrote MSI articles
Wrote SNOW articles
Wrote SMCI articles
Wrote FTNT articles
Wrote PLTR articles
Wrote MCHP articles
Wrote TEL articles
Wrote CTSH articles
Wrote IT articles
Wrote DDOG articles
Wrote MPWR articles
Wrote ON articles
Wrote CDW articles
Wrote TEAM articles
Wrote MDB articles
Wrote FICO articles
Wrote HUBS articles
Wrote NET articles


In [19]:
with open('news_articles.json', 'r') as news_articles:
    contents = json.loads(news_articles.read())

In [20]:
    # Here we use the links from the news articles from polygon to extract the actual full article from the link since we don't have enough info to gather sentiment yet
    # This takes a VERY long time
    with open('extract-news-api.json', 'w') as extract_news_file:
        for article in contents:
            # Extract more article text by link since polygon only gives us article description which is not enough
            article_details = requests.get("http://127.0.0.1:5000/v0/article?url=" + article["article_url"])
            extract_news_file.write(json.dumps(article_details.json()) + ",\n")
            time.sleep(.01)

In [2]:
import nltk
import multiprocessing

In [3]:
nltk.download(["stopwords",
...     "state_union",
...     "twitter_samples",
...     "movie_reviews",
...     "averaged_perceptron_tagger",
...     "vader_lexicon",
...     "punkt",
        "names"])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   

True

In [4]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [5]:
from statistics import mean

def is_positive(text: str) -> bool:
    """True if the average of all sentence compound scores is positive."""
    scores = [
        sia.polarity_scores(sentence)["compound"]
        for sentence in nltk.sent_tokenize(text)
    ]
    return mean(scores) > 0

In [6]:
stopwords = nltk.corpus.stopwords.words("english")

positive_words = ['bull', 'bullish', 'rally', 'grow', 'long', 'up', 'growth', 'rose', 'rise', 'turnaround', '+', 'grew']

negative_words = ['bear', 'bearish', 'fall', 'short', 'down', 'shrink', 'shrunk', '-']

In [17]:
with open('extract-news-api.json', 'r') as extracted_news_file:
    extracted_news = []

    for line in extracted_news_file:
        extracted_news.append(json.loads(line[:-2]))
    
    def extract_nth_news(nth_article):
        
        if nth_article["status"] == "ok":

            fd = nltk.FreqDist(nltk.word_tokenize(nth_article['article']['text'].lower()))
            
            positive_score = 0
            for word, freq in fd.items():
                if word in positive_words:
                    positive_score += freq

            negative_score = 0
            for word, freq in fd.items():
                if word in negative_words:
                    negative_score += freq
            
            
            vader_polarity = sia.polarity_scores(nth_article["article"]["text"])

            # I calculate my own score that weighs the frequency distributions of certain important words but takes into acount vader polarity too
            # VADER is trained on twitter messages so using it on articles is less accurate which is why it has a lower weighting
            # I use VADER to try and account for things in the articles that might mention "is no longer bullish" for example
            # If I did this again, I would implement colocation techniques to differentiate this better
            polarity_scores = {'positive-score': positive_score * vader_polarity['pos'], 'negative-score': negative_score  * vader_polarity['neg']}

            polarity_scores["published"] = nth_article["article"]["published"]
            return polarity_scores
        
        return {}
    
    with open('news-sentiment-data.json', 'w') as sentiment_file:
        sentiment_file.write('[')
        for news_article in extracted_news:
            sentiment_file.write(json.dumps(extract_nth_news(news_article)) + ",")
        sentiment_file.write(']')

In [11]:
from dateutil import parser
import pandas
import yfinance as yf
from pandas_datareader import data as pdr

In [19]:
vgt = yf.Ticker("VGT")
print(vgt.history())

                                 Open        High         Low       Close   
Date                                                                        
2024-06-27 00:00:00-04:00  575.489704  578.965094  574.860531  576.797974  \
2024-06-28 00:00:00-04:00  580.299988  586.109985  576.000000  576.590027   
2024-07-01 00:00:00-04:00  578.030029  583.090027  570.919983  582.250000   
2024-07-02 00:00:00-04:00  578.929993  586.119995  578.510010  586.119995   
2024-07-03 00:00:00-04:00  585.599976  593.010010  585.450012  592.890015   
2024-07-05 00:00:00-04:00  593.450012  597.440002  591.969971  595.909973   
2024-07-08 00:00:00-04:00  597.979980  600.520020  596.500000  599.909973   
2024-07-09 00:00:00-04:00  602.289978  603.049988  595.729980  598.630005   
2024-07-10 00:00:00-04:00  602.179993  607.130005  599.929993  606.849976   
2024-07-11 00:00:00-04:00  608.539978  608.539978  592.539978  595.159973   
2024-07-12 00:00:00-04:00  596.549988  606.020020  595.000000  600.700012   

In [17]:
data = []

def percent_change(old, new):
    pc = round((new - old) / abs(old) * 100, 2)
    print(f"from {old} to {new}   -> {pc}% change")

with open('news-sentiment-data.json', 'r') as sentiment_file:
    
    with open('market-data.json', 'w') as market_data_file:

        market_data_file.write('[')
        for news_report in json.load(sentiment_file):    
            if news_report["published"]:
                date_published = parser.parse(news_report["published"])
                next_day = date_published + datetime.timedelta(days=1)
                
                try:
                    test = [
                                news_report['published']
                                 , percent_change( vgt.Open[date_published.date().strftime('%Y-%m-%d')], vgt.Open[next_day.date().strftime('%Y-%m-%d')] )
                                 , percent_change( vgt.High[date_published.date().strftime('%Y-%m-%d')], vgt.High[next_day.date().strftime('%Y-%m-%d')] )
                                 , percent_change( vgt.Low[date_published.date().strftime('%Y-%m-%d')], vgt.Low[next_day.date().strftime('%Y-%m-%d')] )
                                ]
                    data.append(test)

                    market_data_file.write('{"open" : ' + str(vgt.Open[date_published.date().strftime('%Y-%m-%d')])+",")
                    market_data_file.write('"next-day-open" : ' + str(vgt.Open[next_day.date().strftime('%Y-%m-%d')]) + ",\n")
                    market_data_file.write('"high" : ' + str(vgt.High[date_published.date().strftime('%Y-%m-%d')])+",")
                    market_data_file.write('"next-day-high" : ' +str(vgt.High[next_day.date().strftime('%Y-%m-%d')]) + ",\n")
                    market_data_file.write('"low" : ' + str(vgt.Low[date_published.date().strftime('%Y-%m-%d')])+",")
                    market_data_file.write('"next-day-low" : ' + str(vgt.Low[next_day.date().strftime('%Y-%m-%d')]) + "},\n")
                except:
                    market_data_file.write('},{"":""},\n')

                
        market_data_file.write(']')

JSONDecodeError: Expecting value: line 1 column 2955792 (char 2955791)