# Imports & Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import streamlit as st
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import requests
from io import StringIO
from dotenv import load_dotenv
from urllib.parse import urlencode

# Uncomment when ready for machine learning:
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Dropout
# from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# Pulling up our .env file:
# alpha_multicall (defined below) reads ALPHAVANTAGE_API_KEY through os.getenv, so the
# .env values need to be loaded into the environment before that lookup happens.
load_dotenv()

True

# Data Preprocessing

## Declarations and Functions

In [4]:
# Sets and Parameters

# Functions requested by the default multicall; alpha_multicall returns its DataFrames
# in this same order, so callers unpack positionally against this list.
function_set = [
    'TIME_SERIES_DAILY',
    'RSI',
    'NEWS_SENTIMENT',
    'REAL_GDP',
]

# NOTE(review): not referenced in the visible cells — presumably topic filters for
# sentiment analysis calls; confirm against later usage.
sa_function_parameters = [
    'technology',
    'retail_wholesale',
]

# NOTE(review): not referenced in the visible cells and has no template in `parameters`
# below — presumably functions that need a premium API plan; confirm.
premium_function_set = [
    'MACD',
]

# Query-string templates keyed by function name. Entries left as None are filled in by
# alpha_multicall at call time. Key order is the order the fields appear in the request URL.
parameters = {
    'TIME_SERIES_DAILY': {
        'function': 'TIME_SERIES_DAILY',
        'symbol': None,
        'outputsize': 'full',
        'datatype': None,
        'apikey': None,
    },
    'NEWS_SENTIMENT': {
        'function': 'NEWS_SENTIMENT',
        'tickers': None,
        'limit': 1000,
        'apikey': None,
    },
    'RSI': {
        'function': 'RSI',
        'symbol': None,
        'interval': 'daily',
        'time_period': 14,
        'series_type': 'close',
        'datatype': None,
        'apikey': None,
    },
    'REAL_GDP': {
        'function': 'REAL_GDP',
        'interval': 'quarterly',
        'datatype': None,
        'apikey': None,
    },
}

In [5]:
# Sentiment Extraction Function
#------
# Applied to the resulting DataFrame from the "NEWS_SENTIMENT" function to extract useful data.
#------

def extract_sentiment(df, symbol: str):
    """Pull the per-ticker sentiment fields for ``symbol`` out of a "NEWS_SENTIMENT" DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``feed`` column holds one article dict per row. Each article is read
        for its ``time_published`` value and its ``ticker_sentiment`` list of per-ticker dicts.
    symbol : str
        Ticker to keep; compared for equality against each entry's ``ticker`` value.

    Returns
    -------
    pandas.DataFrame
        One row per matching ticker entry, in feed order, with the columns
        ``time_published``, ``relevance_score``, ``ticker_sentiment_score`` and
        ``ticker_sentiment_label``. Articles that do not mention ``symbol`` contribute
        no rows. Values are passed through exactly as found in the feed.

    Raises
    ------
    TypeError
        If ``symbol`` is not a string.
    """

    # Conditional to verify that "symbol" is a string:
    if not isinstance(symbol, str):
        raise TypeError(f'The "symbol" parameter must be a string; it\'s currently {str(type(symbol)).upper()}.')

    columns = ['time_published', 'relevance_score', 'ticker_sentiment_score', 'ticker_sentiment_label']

    # Iterate the column positionally so the function does not depend on the frame having a
    # default 0..n-1 index; an empty frame without a "feed" column simply yields no rows.
    articles = df['feed'] if 'feed' in df.columns else []

    # Collect plain records and build the DataFrame once, instead of concatenating a
    # one-row frame per article (which is quadratic in the number of articles).
    records = []
    for article in articles:
        # An article without a "ticker_sentiment" list is treated as mentioning no tickers.
        for sentiment in article.get('ticker_sentiment') or []:
            if sentiment['ticker'] == symbol:
                records.append({
                    'time_published': article['time_published'],
                    'relevance_score': sentiment['relevance_score'],
                    'ticker_sentiment_score': sentiment['ticker_sentiment_score'],
                    'ticker_sentiment_label': sentiment['ticker_sentiment_label'],
                })

    # Passing "columns" keeps the schema stable even when nothing matched.
    return pd.DataFrame(records, columns=columns)

In [6]:
# TimeSeries Indexer
#------
# Applied to every DataFrame produced by an API call.
#------

def set_time_index(df):
    """Re-index ``df`` in place on its time column and return the same object.

    Any column whose name starts or ends with ``time`` is parsed with ``pd.to_datetime``,
    reduced to calendar dates, installed as the index (named ``'time'``) and then dropped
    from the columns. If several columns match, each is applied in turn, so the last one
    ends up as the index. A frame with no matching column is returned untouched.
    """
    # Snapshot the matching names first; the frame's columns change as we drop them.
    time_columns = [
        name for name in df.columns
        if str(name).startswith('time') or str(name).endswith('time')
    ]

    for name in time_columns:
        dates = pd.to_datetime(df[name]).dt.date
        df.set_index(dates, inplace=True)
        df.drop(columns=name, inplace=True)
        df.index.name = 'time'

    return df

In [7]:
# Data Import Function
# ------
# Can create a tuple of DataFrames indexed on a TimeSeries, ready to be processed and merged into a train/test split.
# ------

def _build_query(function, symbol, datatype, apikey):
    """Return a filled-in copy of the ``parameters`` template for one Alpha Vantage function."""
    # Copy so the module-level template (and the API key) never leaks into shared state.
    query = dict(parameters[function])
    query['apikey'] = apikey

    # Only fill the fields this function's template actually declares.
    if 'symbol' in query:
        query['symbol'] = symbol
    if 'datatype' in query:
        query['datatype'] = datatype
    # "NEWS_SENTIMENT" identifies the stock through "tickers" rather than "symbol".
    if function == 'NEWS_SENTIMENT':
        query['tickers'] = symbol

    # Unfilled placeholders would otherwise be sent as the literal text "None".
    return {key: value for key, value in query.items() if value is not None}


def alpha_multicall(function_set, symbol: str, datatype: str, base_query='https://www.alphavantage.co/query?', apikey=None):
    """Call several Alpha Vantage functions for one symbol and return one DataFrame per call.

    Parameters
    ----------
    function_set : iterable of str
        Function names to request; each must be a key of the module-level ``parameters`` dict.
    symbol : str
        Stock symbol sent as ``symbol`` (or as ``tickers`` for "NEWS_SENTIMENT").
    datatype : str
        Value sent as ``datatype`` to functions whose template declares it. Every response
        other than "NEWS_SENTIMENT" is parsed with ``pd.read_csv``, so this should be ``'csv'``.
    base_query : str
        URL prefix that the encoded query string is appended to.
    apikey : str, optional
        API key. When omitted, ``ALPHAVANTAGE_API_KEY`` is read from the environment at call
        time (not at definition time, so a key loaded later is still picked up).

    Returns
    -------
    tuple of pandas.DataFrame
        One frame per entry of ``function_set``, in the same order, each passed through
        ``set_time_index``. "NEWS_SENTIMENT" responses are first reduced by ``extract_sentiment``.

    Raises
    ------
    TypeError
        If ``symbol`` or ``datatype`` is not a string.
    ValueError
        If no API key is available, or a response does not have the expected shape
        (JSON body where CSV was expected, or a news payload without ``feed``).
    KeyError
        If a function name has no template in ``parameters``.
    requests.HTTPError
        If a request returns an HTTP error status.
    """

    # Conditional to verify that "symbol" and "datatype" are strings:
    if not isinstance(symbol, str) or not isinstance(datatype, str):
        raise TypeError(f'Both the "symbol" and "datatype" parameters must be strings.\nSYMBOL: {str(type(symbol)).upper()}\nDATATYPE: {str(type(datatype)).upper()}')

    if apikey is None:
        apikey = os.getenv('ALPHAVANTAGE_API_KEY')
    if not apikey:
        raise ValueError('No API key: pass "apikey" or set the ALPHAVANTAGE_API_KEY environment variable.')

    # Build every query before the first request so an unknown function name fails fast.
    queries = [(function, _build_query(function, symbol, datatype, apikey)) for function in function_set]

    # DataFrame list to be converted to a tuple before being returned to the user:
    dataframes = []

    for function, query in queries:
        response = requests.get(base_query + urlencode(query), timeout=30)
        response.raise_for_status()

        if function == 'NEWS_SENTIMENT':
            # The "NEWS_SENTIMENT" function only returns JSON with a lot of data that isn't
            # relevant to a prediction model, so it goes through the Sentiment Extraction Function.
            payload = response.json()
            if not isinstance(payload, dict) or 'feed' not in payload:
                # Report only the keys: the body of an error payload may echo request details.
                found = sorted(payload) if isinstance(payload, dict) else type(payload).__name__
                raise ValueError(f'Unexpected NEWS_SENTIMENT response (no "feed" entry); got: {found}')
            df = extract_sentiment(pd.DataFrame(payload), symbol)
        else:
            body = response.text
            # A JSON object where CSV was requested is not tabular data; refuse to parse it.
            if body.lstrip().startswith('{'):
                raise ValueError(f'Expected CSV from {function} but received a JSON response.')
            df = pd.read_csv(StringIO(body))

        dataframes.append(set_time_index(df))

    # Returns a tuple so we can unpack all the returned DataFrames into separate objects:
    return tuple(dataframes)

## First Set

In [8]:
# The returned tuple follows the order of `function_set`
# (TIME_SERIES_DAILY, RSI, NEWS_SENTIMENT, REAL_GDP), so these names are bound positionally
# and must be kept in step with that list.
aapl_tsd, aapl_rsi, aapl_ns, gdp = alpha_multicall(function_set, 'AAPL', 'csv')

In [10]:
aapl_tsd.head()

Unnamed: 0_level_0,open,high,low,close,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-03-15,171.17,172.62,170.285,172.62,121752699
2024-03-14,172.91,174.3078,172.05,173.0,72571635
2024-03-13,172.77,173.185,170.76,171.13,51948951
2024-03-12,173.15,174.03,171.01,173.23,59544927
2024-03-11,172.94,174.38,172.05,172.75,58929918


In [None]:
aapl_rsi.shape

(6118, 1)

In [None]:
aapl_ns.shape

(689, 3)

In [None]:
# Inner-join daily prices with RSI on the date index, then outer-join the news sentiment rows.
# NOTE(review): aapl_ns appears to hold one row per article, so a date with several articles
# will presumably repeat its price/RSI row — confirm this is intended before modelling.
aapl = (
    aapl_tsd
    .merge(aapl_rsi, left_index=True, right_index=True)
    .merge(aapl_ns, how='outer', left_index=True, right_index=True)
)

## Second Set