# Imports & Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import streamlit as st
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import requests
from io import StringIO
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
from urllib.parse import urlencode

# Uncomment when ready for machine learning:
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Dropout
# from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# Pulling up our .env file:
load_dotenv()

True

# Data Preprocessing

## Declarations and Functions

In [4]:
# Sets and Parameters

# Functions requested, in this order, when `function_set` is handed to the data import function.
function_set = [
    'TIME_SERIES_DAILY',
    'RSI',
    'NEWS_SENTIMENT',
    'MACD',
]

# Additional groupings; not referenced by the visible cells of this notebook.
single_function = ['NEWS_SENTIMENT']
sa_function_parameters = ['technology', 'retail_wholesale']
premium_function_set = ['MACD']

# Query-string templates keyed by function name.
# Values left as None are placeholders that the data import function supplies per call.
# Key order is the order in which the parameters are URL-encoded.
parameters = {
    'TIME_SERIES_DAILY': {
        'function': 'TIME_SERIES_DAILY',
        'symbol': None,
        'outputsize': 'full',
        'datatype': None,
        'apikey': None,
    },
    'NEWS_SENTIMENT': {
        'function': 'NEWS_SENTIMENT',
        'tickers': None,
        'limit': 1000,
        'apikey': None,
    },
    'RSI': {
        'function': 'RSI',
        'symbol': None,
        'interval': 'daily',
        'time_period': 14,
        'series_type': 'close',
        'datatype': None,
        'apikey': None,
    },
    'REAL_GDP': {
        'function': 'REAL_GDP',
        'interval': 'quarterly',
        'datatype': None,
        'apikey': None,
    },
    'MACD': {
        'function': 'MACD',
        'symbol': None,
        'interval': 'daily',
        'series_type': 'close',
        'datatype': None,
        'apikey': None,
    },
}

In [5]:
# Export CSV Function
#------
# Exports a given DataFrame as a CSV to a "data" folder within the working directory, prefixing the file name with the current date.
#------

def export_csv(df, file_name: str):
    """Write ``df`` as a CSV into a ``data`` folder under the current working directory.

    The file is named ``<YYYY-MM-DD>_<file_name>.csv`` using today's date, and
    the ``data`` folder is created when it does not exist yet. A confirmation
    line with the destination path is printed.

    Parameters
    ----------
    df : object exposing ``to_csv`` (a pandas DataFrame in this notebook)
    file_name : str
        Base name placed after the date prefix.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``file_name`` is not a string.
    """
    if not isinstance(file_name, str):
        raise TypeError(f'The "file_name" parameter must be a string; it\'s currently {str(type(file_name)).upper()}.')

    # Destination folder: <cwd>/data
    data_dir = Path(os.getcwd()) / 'data'
    data_dir.mkdir(parents=True, exist_ok=True)

    date_prefix = datetime.now().strftime('%Y-%m-%d')
    extract_to = f'{data_dir}/{date_prefix}_{file_name}.csv'

    df.to_csv(extract_to)

    print(f"Exported your DataFrame to '{extract_to}'!")
    return None

In [34]:
# Generate Features Function
#------
# Generates different lagged close, rolling mean/std and relative change features for the daily time series data.
#------

def generate_features(df):
    """Add close-derived feature columns to ``df`` in place and return it.

    Five columns are inserted directly after ``close``, in this order:
    ``lag_1``, ``lag_2`` (close shifted by 1 and 2 rows), ``rolling_mean_7``,
    ``rolling_std_7`` (7-row rolling mean and standard deviation of close) and
    ``daily_return`` (row-over-row percent change of close, times 100).
    Afterwards every row containing a null in any column is dropped.

    Note: the passed DataFrame itself is modified; the return value is that
    same object.
    """
    # Position right after "close", where the first new column goes.
    first_slot = df.columns.get_loc('close') + 1
    close = df['close']

    # (column name, values) pairs in insertion order.
    derived = [
        ('lag_1', close.shift(1)),
        ('lag_2', close.shift(2)),
        ('rolling_mean_7', close.rolling(window=7).mean()),
        ('rolling_std_7', close.rolling(window=7).std()),
        ('daily_return', close.pct_change() * 100),
    ]
    for offset, (name, values) in enumerate(derived):
        df.insert(first_slot + offset, name, values)

    # The shifts and rolling windows leave nulls in the leading rows; remove them.
    df.dropna(inplace=True)

    return df

In [6]:
# Sentiment Extraction Function
#------
# Applied to the resulting DataFrame from the "NEWS_SENTIMENT" function to extract useful data.
#------

def extract_sentiment(df, symbol: str):
    """Flatten a "NEWS_SENTIMENT" response frame into one row per matching ticker entry.

    Each value in ``df['feed']`` is expected to be a mapping holding a
    ``'time_published'`` value and a ``'ticker_sentiment'`` list of mappings
    with the keys ``'ticker'``, ``'relevance_score'``,
    ``'ticker_sentiment_score'`` and ``'ticker_sentiment_label'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'feed'`` column, one article per row.
    symbol : str
        Ticker to keep; compared for exact equality against each entry's
        ``'ticker'`` value.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_published``, ``relevance_score``,
        ``ticker_sentiment_score`` and ``ticker_sentiment_label``. Articles
        without an entry for ``symbol`` produce no row. Rows follow article
        order, and the frame is empty (same columns) when nothing matches.

    Raises
    ------
    TypeError
        If ``symbol`` is not a string.
    """
    if not isinstance(symbol, str):
        raise TypeError(f'The "symbol" parameter must be a string; it\'s currently {str(type(symbol)).upper()}.')

    output_columns = ['time_published', 'relevance_score', 'ticker_sentiment_score', 'ticker_sentiment_label']

    # A frame with no rows yields an empty result even if it has no "feed" column.
    articles = df['feed'] if df.shape[0] else ()

    # Collect plain records and build the frame once; growing a DataFrame with
    # pd.concat inside the loop copies all previous rows on every iteration.
    records = []
    for article in articles:
        time_published = article['time_published']
        for sentiment in article['ticker_sentiment']:
            if sentiment['ticker'] == symbol:
                records.append({
                    'time_published': time_published,
                    'relevance_score': sentiment['relevance_score'],
                    'ticker_sentiment_score': sentiment['ticker_sentiment_score'],
                    'ticker_sentiment_label': sentiment['ticker_sentiment_label'],
                })

    # Passing the columns explicitly keeps the schema stable when there are no records.
    return pd.DataFrame(records, columns=output_columns)

In [7]:
# TimeSeries Indexer
#------
# Applied to every DataFrame produced by an API call.
#------

def set_time_index(df):
    """Index ``df`` by calendar date using its time-like column(s), in place.

    A column is treated as time-like when its name, converted to a string,
    starts or ends with ``'time'``. For each such column the values are parsed
    with ``pd.to_datetime``, reduced to dates and installed as the index; the
    frame is then sorted by that index, the source column is dropped and the
    index is named ``'time'``. Frames without a matching column are returned
    unchanged.

    Note: the passed DataFrame itself is modified; the return value is that
    same object.
    """
    # Decide up front which columns qualify; the frame is mutated in the loop below.
    time_columns = []
    for column in df.columns:
        label = str(column)
        if label.startswith('time') or label.endswith('time'):
            time_columns.append(column)

    for column in time_columns:
        dates = pd.to_datetime(df[column]).dt.date
        df.set_index(dates, inplace=True)
        df.sort_index(inplace=True)
        df.drop(columns=column, inplace=True)
        df.index.name = 'time'

    return df

In [8]:
# Data Import Function
# ------
# Can create a tuple of DataFrames indexed on a TimeSeries, ready to be processed and merged into a train/test split.
# ------

def alpha_multicall(function_set, symbol: str, datatype: str, base_query='https://www.alphavantage.co/query?', apikey=os.getenv('ALPHAVANTAGE_API_KEY')):
    """Call each requested API function for one symbol and return one DataFrame per function.

    For every name in ``function_set`` the matching template in the
    module-level ``parameters`` dictionary is copied, filled in and URL-encoded
    onto ``base_query``. "NEWS_SENTIMENT" responses are read as JSON and passed
    through ``extract_sentiment``; every other response is read as CSV. Each
    resulting frame is passed through ``set_time_index``.

    Parameters
    ----------
    function_set : iterable of str
        Keys of ``parameters`` to request, in the order results are returned.
    symbol : str
        Ticker symbol; upper-cased before use.
    datatype : str
        Value sent as the ``datatype`` query parameter for templates that
        declare one. Non-sentiment responses are always parsed as CSV.
    base_query : str
        URL prefix the encoded query string is appended to.
    apikey : str or None
        API key. When None, the ``ALPHAVANTAGE_API_KEY`` environment variable
        is read at call time.

    Returns
    -------
    tuple of pandas.DataFrame
        One frame per entry of ``function_set``, in the same order.

    Raises
    ------
    TypeError
        If ``symbol`` or ``datatype`` is not a string.
    KeyError
        If a name in ``function_set`` has no template in ``parameters``.
    requests.HTTPError
        If a response carries an HTTP error status.
    """
    # Conditional to verify that "symbol" and "datatype" are strings:
    if not isinstance(symbol, str) or not isinstance(datatype, str):
        raise TypeError(f'Both the "symbol" and "datatype" parameters must be strings.\nSYMBOL: {str(type(symbol)).upper()}\nDATATYPE: {str(type(datatype)).upper()}')

    # Allow the user to enter a symbol in lowercase without breaking the call:
    symbol = symbol.upper()

    # The default in the signature is evaluated once, when the function is defined.
    # If the environment had not been loaded at that point it is None, so look again.
    if apikey is None:
        apikey = os.getenv('ALPHAVANTAGE_API_KEY')

    # Build every query before any request is sent, so an unknown function name
    # fails before the network is touched.
    queries = []
    for function in function_set:
        # Copy the template: the shared dictionary stays untouched and never holds the API key.
        query = dict(parameters[function])
        query['apikey'] = apikey
        # Fill only the placeholders this template declares. The previous check,
        # `'symbol' and 'datatype' in keys`, tested 'datatype' alone and added a
        # stray "symbol" entry to templates that do not have one.
        if 'symbol' in query:
            query['symbol'] = symbol
        if 'datatype' in query:
            query['datatype'] = datatype
        # "NEWS_SENTIMENT" identifies the stock through "tickers" instead of "symbol".
        if function == 'NEWS_SENTIMENT':
            query['tickers'] = symbol
        queries.append((function, query))

    # DataFrame list to be converted to a tuple before being returned to the user:
    dataframes = []

    for function, query in queries:
        # A timeout keeps a stalled connection from hanging the notebook indefinitely.
        response = requests.get(base_query + urlencode(query), timeout=30)
        response.raise_for_status()

        if function == 'NEWS_SENTIMENT':
            # The JSON payload carries far more than a prediction model needs;
            # keep only the per-ticker sentiment fields.
            df = extract_sentiment(pd.DataFrame(response.json()), symbol)
        else:
            df = pd.read_csv(StringIO(response.text))

        dataframes.append(set_time_index(df))

    # Returns a tuple of DataFrames (one per requested function) that the caller can unpack:
    return tuple(dataframes)

## First Set

In [9]:
aapl_tsd, aapl_rsi, aapl_ns, aapl_macd = alpha_multicall(function_set, 'AAPL', 'csv')

In [10]:
aapl_tsd.tail(10)

Unnamed: 0_level_0,open,high,low,close,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-03-13,172.77,173.185,170.76,171.13,51948951
2024-03-14,172.91,174.3078,172.05,173.0,72571635
2024-03-15,171.17,172.62,170.285,172.62,121752699
2024-03-18,175.57,177.71,173.52,173.72,75604184
2024-03-19,174.34,176.605,173.03,176.08,55215244
2024-03-20,175.72,178.67,175.09,178.67,53423102
2024-03-21,177.05,177.49,170.84,171.37,106181270
2024-03-22,171.76,173.05,170.06,172.28,71160138
2024-03-25,170.565,171.94,169.45,170.85,54288328
2024-03-26,170.0,171.42,169.6,169.71,57273178


In [11]:
aapl_tsd.shape

(6139, 5)

In [12]:
aapl_rsi.head(10)

Unnamed: 0_level_0,RSI
time,Unnamed: 1_level_1
1999-11-19,69.9838
1999-11-22,66.4703
1999-11-23,68.5184
1999-11-24,70.2012
1999-11-26,70.535
1999-11-29,69.4036
1999-11-30,72.5435
1999-12-01,76.5988
1999-12-02,80.7953
1999-12-03,83.0089


In [13]:
aapl_rsi.shape

(6125, 1)

In [14]:
aapl_macd.tail()

Unnamed: 0_level_0,MACD,MACD_Hist,MACD_Signal
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-03-20,-2.4823,0.8488,-3.3311
2024-03-21,-2.5804,0.6005,-3.181
2024-03-22,-2.5553,0.5005,-3.0558
2024-03-25,-2.6206,0.3482,-2.9688
2024-03-26,-2.7328,0.1888,-2.9216


In [15]:
aapl_ns.shape

(683, 3)

In [35]:
# Combine daily prices with RSI and MACD on the shared date index (default inner join).
aapl = (
    aapl_tsd
    .merge(aapl_rsi, left_index=True, right_index=True)
    .merge(aapl_macd, left_index=True, right_index=True)
)

In [39]:
aapl.shape

(6100, 14)

In [36]:
aapl = generate_features(aapl)

In [38]:
export_csv(aapl, 'aapl')

Exported your DataFrame to '/Users/annandvirk/code/avirklol/projects_pub/stock_predictor/data/2024-03-26_aapl.csv'!


## Second Set

In [40]:
msft_tsd, msft_rsi, msft_ns, msft_macd = alpha_multicall(function_set, 'msft', 'csv')

In [45]:
msft_tsd.head()

Unnamed: 0_level_0,open,high,low,close,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1999-11-01,93.25,94.19,92.12,92.37,26630600
1999-11-02,92.75,94.5,91.94,92.56,23174500
1999-11-03,92.94,93.5,91.5,92.0,22258500
1999-11-04,92.31,92.75,90.31,91.75,27119700
1999-11-05,91.81,92.87,90.5,91.56,35083700


In [42]:
# Combine daily prices with RSI and MACD on the shared date index (default inner join).
msft = (
    msft_tsd
    .merge(msft_rsi, left_index=True, right_index=True)
    .merge(msft_macd, left_index=True, right_index=True)
)

In [43]:
msft = generate_features(msft)

In [49]:
msft.head()

Unnamed: 0_level_0,open,high,low,close,lag_1,lag_2,rolling_mean_7,rolling_std_7,daily_return,volume,RSI,MACD,MACD_Hist,MACD_Signal
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1999-12-28,118.75,118.81,117.06,117.5,119.12,117.44,116.498571,2.077398,-1.359973,12295500,75.2622,2.3195,0.5222,1.7973
1999-12-29,116.94,118.37,116.81,117.94,117.5,119.12,116.882857,2.056637,0.374468,8724600,75.6687,2.3114,0.4113,1.9001
1999-12-30,117.87,119.94,117.12,117.62,117.94,117.5,117.578571,0.953352,-0.271324,11180000,74.7072,2.2707,0.2965,1.9742
1999-12-31,117.5,117.75,116.25,116.75,117.62,117.94,117.704286,0.71995,-0.73967,6258800,72.0276,2.1915,0.1738,2.0177
2000-01-03,117.37,118.62,112.0,116.56,116.75,117.62,117.561429,0.842188,-0.162741,26614200,71.425,2.0997,0.0657,2.0341


In [50]:
msft.shape

(6100, 14)

In [47]:
export_csv(msft, 'msft')

Exported your DataFrame to '/Users/annandvirk/code/avirklol/projects_pub/stock_predictor/data/2024-03-26_msft.csv'!
