In [1]:
# Initial imports
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np
import requests
import datetime
from datetime import datetime, timedelta, date
from dateutil.parser import parse
import matplotlib.pyplot as plt

#Alpaca API imports
import alpaca_trade_api as tradeapi

#News API imports
from newsapi import NewsApiClient


# NLP & Sentiment imports
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download/Update the VADER Lexicon
# (the lexicon data used by the VADER sentiment analyzer imported above;
# this touches the local nltk_data directory every time the cell runs)
nltk.download('vader_lexicon')

# Load environment variables from a local .env file, if one exists, so that
# os.getenv("NEWS_API_KEY") below can find the key without it being hard-coded.
from dotenv import load_dotenv
load_dotenv()


Bad key "text.kerning_factor" on line 4 in
/Users/albertkong/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/albertkong/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Build the News API client from the key held in the NEWS_API_KEY
# environment variable (never hard-code the key in the notebook).
api_key = os.environ.get("NEWS_API_KEY")
newsapi = NewsApiClient(api_key=api_key)

In [3]:
############################################################
"""
    News API: Pull Articles
    Use the newsapi client to fetch English-language articles that match
    a search query, sorted by publication date (newest first)
"""
############################################################

def get_articles(search_word, from_date=None):
    """
    Fetch English-language articles matching `search_word` from News API.

    `search_word`: Query string passed to News API as `q`.
    `from_date`: Optional start of the search window (anything accepted by
        `pd.Timestamp`). Naive values are interpreted as America/New_York
        time. Defaults to midnight (New York time) one calendar month
        before now, so the window moves with the current date instead of
        being pinned to a fixed day.

    Returns the response of `newsapi.get_everything` unchanged.
    """
    # Both ends of the window are expressed in New York time, ISO 8601.
    now = pd.Timestamp.now(tz="America/New_York")

    if from_date is None:
        # One month back, snapped to the start of that day.
        past_date = (now - pd.DateOffset(months=1)).normalize()
    else:
        past_date = pd.Timestamp(from_date)
        if past_date.tzinfo is None:
            past_date = past_date.tz_localize("America/New_York")

    articles = newsapi.get_everything(
        q=search_word,
        from_param=past_date.isoformat(),
        to=now.isoformat(),
        language='en',
        sort_by='publishedAt',
        )

    return articles


############################################################
"""
    News API: Analyze Article Sentiment
"""
############################################################

def news_sentiment(news_df):
    """
    Score each article's content with the VADER sentiment analyzer.

    `news_df`: Mapping with an "articles" entry holding an iterable of
        article dicts; each article is read through its "content" and
        "publishedAt" keys.

    Returns a DataFrame with the columns "date", "text" and "compound",
    one row per article that has string content. Articles whose content is
    missing or not a string are skipped. When nothing can be scored the
    result is an empty DataFrame that still carries the three columns.
    """
    # Initialize the VADER sentiment analyzer
    analyzer = SentimentIntensityAnalyzer()

    cols = ["date", "text", "compound"]
    sentiments = []

    for article in news_df["articles"]:
        text = article.get("content")

        # Articles can arrive without usable content; skip them explicitly
        # instead of relying on an exception raised inside the analyzer.
        if not isinstance(text, str):
            continue

        published = pd.to_datetime(article["publishedAt"])
        compound = analyzer.polarity_scores(text)["compound"]

        sentiments.append({
            "text": text,
            "date": published,
            "compound": compound,
        })

    # Passing `columns` fixes the column order and keeps the schema intact
    # even when `sentiments` is empty.
    return pd.DataFrame(sentiments, columns=cols)


############################################################
"""
    get_avg_news_sentiment(ticker, search_word):
    Takes 2 arguments: the stock ticker and the word(s) to search articles for.
    
    Fetches articles that match both the search word(s) and the ticker
    Calculates compound sentiment with VADER sentiment analyzer on each article
    Calculates the average compound sentiment score for each 1-hour bucket
    Normalizing the hourly score to -1, 0, 1 is currently disabled
    Returns Average Hourly Sentiment DataFrame indexed by hour, with Column: compound
"""
############################################################

def get_avg_news_sentiment(ticker, search_word):
    """
    Average the VADER compound score of matching news articles per hour.

    `ticker`: Stock ticker, AND-ed with `search_word` in the article query.
    `search_word`: Word(s) to search articles for.

    Returns a DataFrame indexed by the hourly "date" bucket with a single
    "compound" column holding the mean score of the articles published in
    that hour. Hours without any article are dropped. If no article could
    be scored, an empty DataFrame with the "compound" column is returned.
    The -1/0/1 normalization provided by `get_normalized` is not applied.
    """
    # Fetch the articles that mention both the search word and the ticker
    news = get_articles(search_word + " AND " + ticker)

    # Score every article; one row per article
    news_sentiment_df = news_sentiment(news)

    # Nothing to average: grouping an empty frame by time would fail because
    # its "date" column has no datetime dtype.
    if news_sentiment_df.empty:
        return pd.DataFrame(columns=["compound"])

    # Group the articles by hour. An offset object is used instead of the
    # 'H' string alias, which newer pandas versions deprecate.
    hourly = pd.Grouper(key="date", freq=pd.offsets.Hour())

    # Average only the numeric score: the free-text column cannot be
    # averaged and newer pandas versions raise instead of dropping it.
    avg_hourly_sentiment = (
        news_sentiment_df.groupby(hourly)[["compound"]].mean().dropna()
    )

    return avg_hourly_sentiment


############################################################
"""
    Sentiment calculation based on compound score
"""
############################################################

def get_normalized(score):
    """
    Map a compound score onto a discrete sentiment label.

    Returns 1 for scores of 0.04 and above, -1 for scores of -0.04 and
    below, and 0 (neutral) for everything in between.
    """
    if score >= 0.04:  # Positive
        return 1
    if score <= -0.04:  # Negative
        return -1
    return 0  # Neutral


############################################################
"""
    window_data:
    Chunks one feature column into rolling windows
"""
############################################################

def window_data(df, window, feature_col_number):
    """
    Chunk one column of `df` into overlapping windows of `window` rows.

    Window `i` holds rows `i` .. `i + window - 1`, i.e. the values that
    precede row `i + window` (Xt - window used to predict Xt). Because a
    window is only built when such a following row exists, the final row
    of `df` never appears in any window.

    `df`: The original DataFrame with the time series data.
    `window`: The window size, in rows, of previous values per sample.
    `feature_col_number`: The column number from the original DataFrame
        where the features are located.

    Returns a numpy array of X with one row per window. When `df` has no
    more than `window` rows the result is an empty one-dimensional array.
    """
    X = [
        df.iloc[start : (start + window), feature_col_number].to_numpy()
        for start in range(len(df) - window)
    ]
    return np.array(X)

############################################################
"""
    Machine Learning Model
    Input the Sentiment data and return predicted stock returns
"""
############################################################

def get_news_predictions(combined_df, model_dir="ml_model"):
    """
    Run the saved sentiment model on windowed sentiment data.

    `combined_df`: DataFrame whose first column (position 0) holds the
        feature values fed to the model.
    `model_dir`: Directory containing `sentiment_model.json` (architecture)
        and `sentiment_model.h5` (weights). Defaults to "ml_model".

    Returns a DataFrame with a single "Predicted" column, one row per
    window produced by `window_data`.

    Raises ValueError when `combined_df` has too few rows to form a window.

    NOTE(review): the MinMaxScaler is fit on the feature windows at
    prediction time and then reused to inverse-transform the model output.
    That only maps predictions back to a meaningful scale if the model was
    trained with targets scaled the same way - confirm against the
    training code.
    """
    window_size = 1

    # window_data builds len(df) - window_size windows; fail early with a
    # clear message instead of an opaque shape error from the scaler.
    if len(combined_df) <= window_size:
        raise ValueError(
            "Need more than %d row(s) of sentiment data to build a "
            "prediction window, got %d." % (window_size, len(combined_df))
        )

    # Heavy ML dependencies are imported only once there is work to do.
    from sklearn.preprocessing import MinMaxScaler
    from tensorflow.keras.models import model_from_json

    X = window_data(combined_df, window_size, 0)

    # Use the MinMaxScaler to scale data between 0 and 1.
    scaler = MinMaxScaler()
    scaler.fit(X)
    X = scaler.transform(X)

    # Reshape the features for the model: (samples, window, 1)
    X = X.reshape((X.shape[0], X.shape[1], 1))

    # Load the model architecture from JSON, then its weights
    model_dir = Path(model_dir)
    with open(model_dir / "sentiment_model.json", "r") as json_file:
        sentiment_model_json = json_file.read()
    sentiment_loaded_model = model_from_json(sentiment_model_json)
    sentiment_loaded_model.load_weights(str(model_dir / "sentiment_model.h5"))

    # Make predictions with the loaded model
    predicted = sentiment_loaded_model.predict(X)

    # Map the model output back through the scaler (see NOTE in docstring)
    predicted_prices = scaler.inverse_transform(predicted)

    # Create a DataFrame of Predicted values
    predicted_returns = pd.DataFrame({
        "Predicted": predicted_prices.ravel()
        })

    return predicted_returns


############################################################
"""
    news_sentiment_main_function:
    Defines the main function.
    2 arguments: the stock ticker and the search word used to query news articles.
    Returns the first 5 predicted values.
"""
############################################################

def news_sentiment_main_function(ticker, search_word):
    """
    Run the whole pipeline for one ticker and search word.

    Computes the average hourly news sentiment, feeds it to the prediction
    model and returns the first 5 rows of the predictions.
    """
    hourly_sentiment = get_avg_news_sentiment(ticker, search_word)
    all_predictions = get_news_predictions(hourly_sentiment)
    return all_predictions.head(5)

In [4]:
# search_word = "microsoft" #search_word
# ticker = "MSFT" #ticker

# predicted_df = news_sentiment_main_function(ticker, search_word)
# predicted_df



Unnamed: 0,Predicted
0,0.135716
1,0.135716
2,0.135531
3,0.135622
4,0.135716
