# Import packages and API Setup

## Import packages

In [1]:
# Data manipulation
import datetime as dt
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None) # Remove column display limit
import pandas_datareader.data as web

# Stats
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Fetch
import requests
import yfinance as yf


## API setup and configuration 

### Alpha Vantage

[Alpha Vantage API Documentation](https://www.alphavantage.co/documentation/#)

In [2]:
# Import dotenv_values and load the API key from .env file
from dotenv import dotenv_values 
secrets = dotenv_values("secrets/.env")
alpha_key = secrets["alphavantage_key"]

#### Search Endpoint

[Documentation](https://www.alphavantage.co/documentation/#symbolsearch)  

The Search Endpoint returns the best-matching symbols and market information based on keywords of your choice. The search results also contain match scores that provide you with the full flexibility to develop your own search and filtering logic.  

In [3]:
# Function to search for symbols based on a keyword
def av_search(keyword):
    # Base URL
    base_url = "https://www.alphavantage.co/query?function=SYMBOL_SEARCH&keywords={keyword}&apikey={apikey}"

    # API key for authentication
    api_key = alpha_key

    # Construct the full URL
    url = base_url.format(keyword=keyword, apikey=api_key)

    # Send HTTP GET request to fetch data
    response = requests.get(url)
    
    # Parse the response as JSON
    response_json = response.json()
    
    # Create a DataFrame from the 'bestMatches' section of the JSON response
    df_results = pd.DataFrame(response_json.get('bestMatches', []))

    return df_results

#### Time Series Daily

[Documentation](https://www.alphavantage.co/documentation/#daily)  

This API returns raw (as-traded) daily time series (date, daily open, daily high, daily low, daily close, daily volume) of the global equity specified, covering 20+ years of historical data. The OHLCV data is sometimes called "candles" in finance literature. 

In [4]:
# Function to fetch daily stock data
def av_daily(symbol, output_size):
    # Base URL
    base_url = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={symbol}&outputsize={output_size}&apikey={apikey}"

    # API key for authentication
    api_key = alpha_key

    # Construct the full URL
    url = base_url.format(symbol=symbol, output_size=output_size, apikey=api_key)

    # Send HTTP GET request to fetch data
    response = requests.get(url)

    # Parse the response as JSON
    data = response.json()

    # Extract the daily time series data, with a default to an empty dictionary
    time_series = data.get("Time Series (Daily)", {})

    # Create a DataFrame from the time series data
    ts_df = pd.DataFrame.from_dict(time_series, orient='index')

    # Convert the index to datetime for easier manipulation and analysis
    ts_df.index = pd.to_datetime(ts_df.index)

    # Return the resulting DataFrame
    return ts_df

#### Market News & Sentiment

[Documentation](https://www.alphavantage.co/documentation/#news-sentiment)

This API returns live and historical market news & sentiment data from a large & growing selection of premier news outlets around the world, covering stocks, cryptocurrencies, forex, and a wide range of topics such as fiscal policy, mergers & acquisitions, IPOs, etc.


The stock/crypto/forex symbols of your choice. For example: tickers=IBM will filter for articles that mention the IBM ticker; tickers=COIN,CRYPTO:BTC,FOREX:USD will filter for articles that simultaneously mention Coinbase (COIN), Bitcoin (CRYPTO:BTC), and US Dollar (FOREX:USD) in their content.

In [5]:
# Function to fetch news sentiment data
def av_news(tickers):
    # Base URL 
    base_url = "https://www.alphavantage.co/query?function=NEWS_SENTIMENT&tickers={tickers}&apikey={apikey}"

    # API key for authentication
    api_key = alpha_key

    # Construct the full URL
    url = base_url.format(tickers=tickers, apikey=api_key)

    # Send HTTP GET request to fetch data
    response = requests.get(url)

    # Parse the response as JSON
    data = response.json()

    # Normalize the 'feed' data into a DataFrame
    feed_df = pd.json_normalize(data['feed'])

    # Return the DataFrame with the news sentiment data
    return feed_df

# Fetching data

Because of the 25-query daily limit with Alpha Vantage's free API key, I'll save the results to a CSV file and comment out the code that fetches the data.

## Standard and Poor's 500 (S&P 500)
**Ticker = "SPY"**

In [23]:
# # Fetch S&P 500 Data from Alpha Vantage API
# df_spy = av_daily('spy', 'full')

# df_spy.head()

Unnamed: 0,1. open,2. high,3. low,4. close,5. volume
2024-04-25,499.18,504.27,497.49,503.49,69122368
2024-04-24,506.56,507.37,503.13,505.41,55928076
2024-04-23,501.78,506.09,499.5328,505.65,64633620
2024-04-22,497.83,502.38,495.43,499.72,67961048
2024-04-19,499.44,500.455,493.86,495.16,102212587


In [27]:
# Save fetched data to csv
#df_spy.to_csv("data/df_spy.csv")

In [33]:
# Load saved data
df_spy = pd.read_csv("data/df_spy.csv", index_col=0)

# Rename columns 
df_spy.columns = [col[3:] for col in df_spy.columns]

df_spy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6160 entries, 2024-04-25 to 1999-11-01
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    6160 non-null   float64
 1   high    6160 non-null   float64
 2   low     6160 non-null   float64
 3   close   6160 non-null   float64
 4   volume  6160 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 288.8+ KB


# Building unsupervised learning trading strategy

Calculate the features and technical indicators of each stock
* Garman-Klass Volatility
* RSI
* Bollinger Bands
* ATR
* MACD
* Dollar Volume

## Garman-Klass Volatility

The Garman-Klass volatility formula estimates the annualized volatility of a financial asset using high, low, open, and close prices. It is considered an improvement over simple close-to-close volatility estimation because it incorporates additional information from the intraday price range.

The formula to calculate the Garman-Klass volatility is:

$$
\sigma_{GK} = \sqrt{ \frac{1}{n} \sum_{i=1}^n \left( \frac{1}{2} \ln\left( \frac{H_i}{L_i} \right)^2 - (2 \ln(2) - 1) \ln\left( \frac{C_i}{O_i} \right)^2 \right)}
$$


where:

- $n$ is the number of periods (e.g., trading days),
- $O_i$ represents the opening price for period $i$,
- $C_i$ represents the closing price for period $i$,
- $H_i$ represents the highest price during period $i$,
- $L_i$ represents the lowest price during period $i$.


### Explanation

- **Logarithmic Returns**: The formula uses logarithmic returns to estimate the volatility derived from the high-low price range and the open-close price difference.

- **High-Low Component**: The term $ \frac{1}{2} \ln\left( \frac{H_i}{L_i} \right)^2 $ captures the volatility due to the price range within a single period.

- **Close-Open Component**: The term $ (2 \ln(2) - 1) \ln\left( \frac{C_i}{O_i} \right)^2 $ accounts for the volatility derived from the difference between the closing and opening prices.

In [40]:
# Define the Garman-Klass function
def garman_klass(data):
    # Calculate the high-low component
    high_low = 0.5 * np.square(np.log(data['high'] / data['low']))
    # Calculate the open-close component
    open_close = (2 * np.log(2) - 1) * np.square(np.log(data['close'] / data['open']))
    # Calculate Garman-Klass volatility
    gk_volatility = np.sqrt(high_low - open_close)
    
    # Add the calculated volatility as a new column to the DataFrame
    data['gk_volatility'] = gk_volatility
    
    # Return the modified DataFrame with the new column
    return data

In [38]:
df_spy = garman_klass(df_spy)

In [39]:
df_spy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6160 entries, 2024-04-25 to 1999-11-01
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   open           6160 non-null   float64
 1   high           6160 non-null   float64
 2   low            6160 non-null   float64
 3   close          6160 non-null   float64
 4   volume         6160 non-null   int64  
 5   gk_volatility  6160 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 336.9+ KB
