In [1]:
# Initial imports
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np
import requests
import datetime
from datetime import datetime, timedelta, date
from dateutil.parser import parse
import matplotlib.pyplot as plt

#Alpaca API imports
import alpaca_trade_api as tradeapi

#News API imports
from newsapi import NewsApiClient


# NLP & Sentiment imports
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download/update the VADER lexicon that SentimentIntensityAnalyzer relies on
# (the recorded output shows the package was already up to date)
nltk.download('vader_lexicon')

# Load settings from a .env file (presumably found in the working directory);
# the API keys used later in this notebook are read from environment variables:
# NEWS_API_KEY, ALPACA_API_KEY and ALPACA_SECRET_KEY
from dotenv import load_dotenv
load_dotenv()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Kris/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
############################################################
"""
    Creates the News API and Alpaca API clients
    from credentials stored in environment variables
"""
############################################################  
    
# Create the News API client used by get_articles().
# os.environ[...] raises KeyError right here if NEWS_API_KEY is not set.
newsapi = NewsApiClient(api_key=os.environ["NEWS_API_KEY"])

# Read the Alpaca key and secret.
# Unlike os.environ[...] above, os.getenv returns None when a variable is
# missing, so a missing Alpaca credential is not detected on these two lines.
alpaca_api_key = os.getenv("ALPACA_API_KEY")
alpaca_secret_key = os.getenv("ALPACA_SECRET_KEY")
# REST client used by daily_returns() to request price bars (v2 API)
alpaca_api = tradeapi.REST(alpaca_api_key, alpaca_secret_key, api_version='v2')

In [12]:
############################################################
"""
    Function that pulls stock data from a given ticker and timeframe.
"""
############################################################

def daily_returns(ticker, timeframe, start="2021-01-01 00:00", end=None):
    """
    Fetch historical price bars for a ticker from the Alpaca API.

    Parameters
    ----------
    ticker
        Symbol forwarded unchanged to ``alpaca_api.get_barset``.
    timeframe
        Bar size forwarded unchanged to ``get_barset`` (this notebook
        passes ``"1D"``).
    start : str, optional
        Start of the requested window. Parsed with ``pd.Timestamp`` and
        interpreted in the America/New_York time zone. The default is the
        fixed date 2021-01-01 00:00 -- it is NOT a rolling "one month ago"
        window, so pass an explicit value for other periods.
    end : str, optional
        End of the requested window, same convention as ``start``.
        Defaults to the current time in America/New_York.

    Returns
    -------
    The ``.df`` attribute of the barset returned by ``get_barset``
    (presumably a DataFrame whose columns have the ticker as an outer
    level -- get_ticker_data() drops column level 0; confirm against the
    Alpaca client documentation).
    """
    market_tz = "America/New_York"

    # The API call receives both window bounds as ISO-8601 strings
    window_start = pd.Timestamp(start, tz=market_tz).isoformat()
    if end is None:
        window_end = pd.Timestamp.now(tz=market_tz).isoformat()
    else:
        window_end = pd.Timestamp(end, tz=market_tz).isoformat()

    barset = alpaca_api.get_barset(
        ticker,
        timeframe,
        limit=None,
        start=window_start,
        end=window_end,
        after=None,
        until=None,
    )

    return barset.df


############################################################
"""
    get_ticker_data(ticker):
    Takes 1 argument, the ticker symbol to pull price data for.
    
    Fetches daily bars for the ticker from the Alpaca API.
    Keeps only the closing price.
    Calculates the daily percent change of the closing price.
    Returns a daily Dataframe with Columns: close, percent_change
"""
############################################################

def get_ticker_data(ticker):
    """
    Build a daily close / percent-change table for one ticker.

    Parameters
    ----------
    ticker
        Symbol passed to daily_returns() with a "1D" timeframe.

    Returns
    -------
    pandas.DataFrame
        Indexed like the bars returned by daily_returns(), with the
        columns ``close`` and ``percent_change``. The first row's
        ``percent_change`` is NaN because it has no previous close;
        the row is kept (callers that join and drop NaNs remove it).
    """
    # Daily bars are requested from the Alpaca API
    timeframe = "1D"
    bars = daily_returns(ticker, timeframe)

    # Drop the outer column level, then keep only the closing price
    prices = bars.droplevel(axis=1, level=0)
    prices = prices.drop(columns=["open", "high", "low", "volume"])

    # Percent change of the close relative to the previous row
    prices["percent_change"] = prices["close"].pct_change()

    return prices

############################################################
"""
    News API: Pull Articles 
    Uses the newsapi client to fetch the first page of English articles
    matching a keyword, sorted by publication time
"""
############################################################

def get_articles(keyword, from_date="2020-12-18T00:00:00"):
    """
    Query the News API for English articles matching a keyword.

    Parameters
    ----------
    keyword : str
        Search expression passed as ``q`` to ``newsapi.get_everything``.
    from_date : str, optional
        Oldest publication time to include. Parsed with ``pd.Timestamp``
        and interpreted in the America/New_York time zone. The default is
        the fixed date 2020-12-18 -- it is NOT a rolling "one month ago"
        window, so pass an explicit value for other periods.

    Returns
    -------
    Whatever ``newsapi.get_everything`` returns. The output recorded in
    this notebook is a dict with ``status``, ``totalResults`` and
    ``articles`` keys.
    """
    news_tz = "America/New_York"

    # Search window: from_date up to the current moment, as ISO-8601 strings
    window_end = pd.Timestamp.now(tz=news_tz).isoformat()
    window_start = pd.Timestamp(from_date, tz=news_tz).isoformat()

    # Only the first page of results is requested
    return newsapi.get_everything(
        q=keyword,
        from_param=window_start,
        to=window_end,
        language='en',
        sort_by='publishedAt',
        page=1,
    )


############################################################
"""
    News API: Analyze Article Sentiment
"""
############################################################

def news_sentiment(news_df):
    """
    Score every article of a News API response with VADER.

    Parameters
    ----------
    news_df : mapping
        Despite the name, this is indexed like the response returned by
        get_articles(): ``news_df["articles"]`` is iterated, and each
        article is read through its ``"content"`` and ``"publishedAt"``
        keys.

    Returns
    -------
    pandas.DataFrame
        One row per scored article with the columns ``date`` (ISO-8601
        string in America/New_York time), ``text`` and ``compound``
        (the VADER compound score). When no article can be scored the
        frame is empty but still has these three columns.
    """
    # Initialize the VADER sentiment analyzer
    analyzer = SentimentIntensityAnalyzer()

    columns = ["date", "text", "compound"]
    rows = []

    for article in news_df["articles"]:
        body = article["content"]

        # Articles without text content (e.g. None) cannot be scored:
        # skip them explicitly instead of swallowing the resulting error.
        if not isinstance(body, str):
            continue

        published = pd.Timestamp(article["publishedAt"], tz="America/New_York").isoformat()
        scores = analyzer.polarity_scores(body)

        rows.append({
            "text": body,
            "date": published,
            "compound": scores["compound"],
        })

    # Passing `columns` fixes the column order and keeps the frame
    # well-formed (instead of raising KeyError) when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)



############################################################
"""
    get_avg_sentiment(search_words):
    Takes 1 argument, word(s) to search articles for.
    
    Fetches articles for the given search words 
    Calculates compound sentiment with VADER sentiment analyzer on each article
    Calculates average compound sentiment score for each 1 hour
    Normalizes average hourly VADER compound score to -1, 0 or 1
    Returns Average Hourly Sentiment Dataframe with Columns: compound, normalized
"""
############################################################

def get_avg_sentiment(search_words, freq="h"):
    """
    Average the VADER compound score of news articles per time bucket.

    Parameters
    ----------
    search_words : str
        Query passed to get_articles().
    freq : str, optional
        pandas offset alias for the bucket size. Defaults to hourly
        buckets ("h").

    Returns
    -------
    pandas.DataFrame
        Indexed by bucket start (``date``) with the columns ``compound``
        (mean score of the articles in the bucket) and ``normalized``
        (that mean mapped to -1, 0 or 1 by get_normalized()). Buckets
        without any article are dropped.
    """
    # Fetch the articles and score each one
    articles = get_articles(search_words)
    scored = news_sentiment(articles)

    # The scored frame carries dates as ISO strings; convert to datetimes
    scored["date"] = pd.to_datetime(scored["date"])

    # Average only the numeric `compound` column: the frame also holds the
    # article text, and averaging a string column raises on pandas >= 2.0.
    bucketed = (
        scored.groupby(pd.Grouper(key="date", freq=freq))[["compound"]]
        .mean()
        .dropna()
    )

    # Map each bucket's mean score to -1, 0 or 1
    bucketed["normalized"] = bucketed["compound"].apply(get_normalized)

    return bucketed


############################################################
"""
    Sentiment calculation based on compound score
"""
############################################################

def get_normalized(score):
    """
    Map a compound sentiment score to a discrete label.

    Returns 1 for scores of at least 0.04 (positive), -1 for scores of
    at most -0.04 (negative), and 0 (neutral) for everything else.
    """
    if score >= 0.04:
        return 1  # Positive
    if score <= -0.04:
        return -1  # Negative
    return 0  # Neutral


############################################################
"""
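    combine_sentiment_with_close(avg_hourly_sentiment, avg_hourly_returns):
    Takes 2 arguments: the sentiment dataframe, then the returns dataframe.
    Joins the sentiment columns onto the returns dataframe by index
    and drops every row that has a missing value.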
"""
############################################################

def combine_sentiment_with_close(avg_hourly_sentiment, avg_hourly_returns):
    """
    Attach the sentiment columns to the returns table.

    The returns frame is the left side of an index-on-index join, so
    only index labels present in ``avg_hourly_returns`` are kept; rows
    with a missing value in any column are then removed.

    NOTE(review): the join matches index labels exactly, and the output
    recorded in this notebook has no rows -- presumably the returns and
    sentiment timestamps did not coincide; confirm.
    """
    joined = avg_hourly_returns.join(avg_hourly_sentiment, how="left")
    complete_rows = joined.dropna(axis=0, how="any")
    return complete_rows


############################################################
"""
    Machine Learning Model
    Input the Sentiment data and return predicted stock returns
"""
############################################################

def get_predictions(sentiment_df):
    """
    Placeholder for the machine-learning step.

    Intended to take the sentiment data and return predicted stock
    returns, but no model is implemented yet.

    Raises
    ------
    NotImplementedError
        Always, so that calling the stub fails with a clear message
        instead of a NameError from an undefined result variable.
    """
    raise NotImplementedError(
        "get_predictions is a placeholder: no prediction model is implemented yet"
    )



############################################################
"""
    main_function:
    Defines the main function.
    2 arguments: the search word(s) for news articles and the stock ticker.
"""
############################################################

def main_function(news_search_word, ticker):
    """
    Run the whole pipeline for one search phrase and one ticker.

    Computes the averaged news sentiment for ``news_search_word``, then
    the price data for ``ticker``, and returns the two combined by
    combine_sentiment_with_close().
    """
    # Sentiment is fetched first, price data second
    sentiment_frame = get_avg_sentiment(news_search_word)
    returns_frame = get_ticker_data(ticker)

    return combine_sentiment_with_close(sentiment_frame, returns_frame)

In [13]:
# Query string sent to the News API as the `q` search expression
# (presumably AND requires both terms to appear -- confirm against the News API query syntax)
news_search_word = "microsoft AND MSFT"
# Stock symbol requested from the Alpaca API
ticker = "MSFT"

# Run the full pipeline and display the joined sentiment / price table.
# NOTE(review): the output recorded for this cell contains no rows.
combined_df = main_function(news_search_word, ticker)
combined_df

Unnamed: 0_level_0,close,percent_change,compound,normalized
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [1]:
# import the sentiment ML model


# feed the news sentiment data into the model


# output predicted returns to df


# return df 


In [16]:
# Fetch the raw News API response for a single keyword to inspect its structure.
# The bare `news` expression displays the entire response, which is long.
news = get_articles("microsoft")
news

{'status': 'ok',
 'totalResults': 15695,
 'articles': [{'source': {'id': None, 'name': 'CNA'},
   'author': 'CNA',
   'title': 'US lawmakers aim to curtail face recognition even as the technology identifies Capitol attackers',
   'description': 'OAKLAND:\xa0US\xa0lawmakers are moving ahead with efforts to ban facial recognition software even as the technology helps identify supporters of President Donald Trump who stormed the Capitol this month.\n\nResearchers and law enforcement have been running photograph…',
   'url': 'https://www.channelnewsasia.com/news/world/face-recognition-capitol-identity-law-13990796',
   'urlToImage': 'https://cna-sg-res.cloudinary.com/image/upload/q_auto,f_auto/image/13990792/16x9/991/557/203191f8df567c1150d93ead6f0a5f5a/sT/file-photo--supporters-of-u-s--president-donald-trump-storm-the-u-s--capitol-1.jpg',
   'publishedAt': '2021-01-19T00:46:01Z',
   'content': 'OAKLAND:\xa0US\xa0lawmakers are moving ahead with efforts to ban facial recognition software ev

In [14]:
# Averaged sentiment for a single keyword.
# The result is stored under its own name: binding it to `news_sentiment`
# would overwrite the news_sentiment() function defined earlier and break
# every later call to get_avg_sentiment() / main_function().
avg_news_sentiment = get_avg_sentiment("microsoft")
avg_news_sentiment


Unnamed: 0_level_0,compound,normalized
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-18 18:00:00-05:00,-0.094644,-1
2021-01-18 19:00:00-05:00,0.111127,1
