In [1]:
# Import required libraries
import numpy as np
import yfinance as yf
import pandas as pd
import os

In [2]:
# Instruments to download: five large-cap stocks, two ETFs and one cryptocurrency
assets = [
  'AAPL',     # Apple Inc.
  'MSFT',     # Microsoft Corporation
  'GOOGL',    # Alphabet Inc. (Google)
  'AMZN',     # Amazon.com Inc.
  'TSLA',     # Tesla Inc.
  'SPY',      # SPDR S&P 500 ETF Trust (an ETF that tracks the S&P 500)
  'GLD',      # SPDR Gold Shares (an ETF that tracks the price of gold)
  'BTC-USD',  # Bitcoin in USD (cryptocurrency)
]

# Start and end of the requested history, as 'YYYY-MM-DD' strings
start_date, end_date = '2000-01-01', '2023-01-01'

In [3]:
# Make sure the CSV output folder is present; exist_ok=True turns the call
# into a no-op when the folder already exists, so the cell can be re-run safely
os.makedirs(name='../data', exist_ok=True)

1. **Daily Returns**:
   $R_t = \frac{P_t - P_{t-1}}{P_{t-1}}$

   where $R_t$ is the daily return, $P_t$ is the adjusted closing price at time $t$, and $P_{t-1}$ is the adjusted closing price at time $t-1$.

2. **Moving Averages**:
   - 20-day Moving Average (MA20):
     $MA_{20} = \frac{1}{20} \sum_{i=0}^{19} P_{t-i}$

   - 50-day Moving Average (MA50):
     $MA_{50} = \frac{1}{50} \sum_{i=0}^{49} P_{t-i}$

3. **Volatility**:
   - 20-day Volatility (standard deviation of daily returns):
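     $\sigma_{20} = \sqrt{\frac{1}{19} \sum_{i=0}^{19} (R_{t-i} - \mu)^2}$

     where $\sigma_{20}$ is the 20-day volatility, $R_{t-i}$ is the daily return at time $t-i$, and $\mu$ is the mean daily return over the past 20 days. The divisor is $19 = 20 - 1$ because the code uses the sample standard deviation (the pandas default, `ddof=1`).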


In [4]:
def add_features(df):
  """
  Adds technical-indicator features to a stock price DataFrame.

  The input is left untouched: all work is done on a copy, so the caller's
  frame keeps its original columns and rows.

  Parameters --> df (DataFrame) of price data with 'Adj Close' and 'Close'
                 columns. Rolling and shift features follow row order, so
                 rows are assumed to be sorted oldest to newest (TODO confirm).
  Returns --> new DataFrame with the original columns plus 'Daily Return',
              'MA20', 'MA50', 'EMA20', 'EMA50', 'RSI', 'MACD', 'Signal Line',
              'Volatility', 'Upper Band', 'Lower Band', 'Lag_1', 'Lag_2',
              'Lag_3'. Rows containing a NaN in any column are removed.
  Raises --> KeyError if 'Adj Close' or 'Close' is missing.
  """
  # Fail early with a readable message instead of a bare KeyError mid-computation
  missing = [col for col in ('Adj Close', 'Close') if col not in df.columns]
  if missing:
    raise KeyError(f"add_features needs column(s) {missing}; found {list(df.columns)}")

  # Work on a copy so the caller's DataFrame is never modified
  df = df.copy()
  adj_close = df['Adj Close']
  close = df['Close']

  # Daily return: percentage change of the adjusted close price
  df['Daily Return'] = adj_close.pct_change()

  # 20-day and 50-day simple moving averages of the adjusted close price
  df['MA20'] = adj_close.rolling(window=20).mean()
  df['MA50'] = adj_close.rolling(window=50).mean()

  # Exponential moving averages
  # NOTE(review): these (and RSI below) use the unadjusted 'Close', while the
  # other features use 'Adj Close' -- confirm the mix is intended.
  df['EMA20'] = close.ewm(span=20, adjust=False).mean()
  df['EMA50'] = close.ewm(span=50, adjust=False).mean()

  # Relative Strength Index (RSI) from 14-day simple averages of gains and losses
  delta = close.diff(1)
  gain = delta.where(delta > 0, 0)
  loss = -delta.where(delta < 0, 0)
  avg_gain = gain.rolling(window=14).mean()
  avg_loss = loss.rolling(window=14).mean()
  # A window without losses gives rs = inf and therefore RSI = 100; a window
  # with no price change at all gives 0/0 = NaN and the row is dropped below
  rs = avg_gain / avg_loss
  df['RSI'] = 100 - (100 / (1 + rs))

  # Moving Average Convergence Divergence (MACD) and its 9-period signal line
  # NOTE(review): computed as EMA20 - EMA50 rather than the conventional
  # 12/26 spans -- confirm this is intended.
  df['MACD'] = df['EMA20'] - df['EMA50']
  df['Signal Line'] = df['MACD'].ewm(span=9, adjust=False).mean()

  # Volatility: standard deviation of daily returns over a 20-day window
  df['Volatility'] = df['Daily Return'].rolling(window=20).std()

  # Bollinger Bands: MA20 +/- 2 rolling standard deviations (computed once)
  band_width = adj_close.rolling(window=20).std() * 2
  df['Upper Band'] = df['MA20'] + band_width
  df['Lower Band'] = df['MA20'] - band_width

  # Lagged features: adjusted close of the previous 1, 2 and 3 rows
  df['Lag_1'] = adj_close.shift(1)
  df['Lag_2'] = adj_close.shift(2)
  df['Lag_3'] = adj_close.shift(3)

  # Drop the warm-up rows (and any other row) that still contain NaN values
  return df.dropna()

In [5]:
# Download, enrich and save the historical data of every asset in the list
for asset in assets:
  # auto_adjust=False keeps the separate 'Adj Close' column that add_features
  # reads; newer yfinance releases default to auto_adjust=True, which drops it
  stock_data = yf.download(asset, start=start_date, end=end_date, auto_adjust=False)

  # Newer yfinance releases return (field, ticker) MultiIndex columns even for
  # a single ticker; keep only the field level so columns are plain names
  if isinstance(stock_data.columns, pd.MultiIndex):
    stock_data.columns = stock_data.columns.get_level_values(0)

  # A failed download typically comes back as an empty frame instead of an
  # exception; skip it rather than writing empty CSV files
  if stock_data.empty:
    print(f"No data downloaded for {asset}; skipping.")
    continue

  # Save the raw data to a CSV file
  stock_data.to_csv(f'../data/{asset}_historical_data.csv')

  # Add features
  stock_data = add_features(stock_data)

  # Save the enhanced data to a new CSV file
  stock_data.to_csv(f'../data/{asset}_enhanced_data.csv')

  # Display the first few rows of the enhanced data
  print(f"Enhanced data for {asset}:")
  print(stock_data.head())
  print("\n")


[*********************100%%**********************]  1 of 1 completed


Enhanced data for AAPL:
                Open      High       Low     Close  Adj Close     Volume  \
Date                                                                       
2000-03-14  1.082310  1.109375  1.017857  1.020089   0.862437  428579200   
2000-03-15  1.032366  1.073661  1.018973  1.037946   0.877534  443609600   
2000-03-16  1.047433  1.089286  1.022321  1.085379   0.917636  378100800   
2000-03-17  1.072545  1.116071  1.068080  1.116071   0.943586  305043200   
2000-03-20  1.102679  1.127232  1.092634  1.098214   0.928488  204489600   

            Daily Return      MA20      MA50     EMA20     EMA50        RSI  \
Date                                                                          
2000-03-14     -0.058217  0.897238  0.835547  1.059902  1.021796  48.370683   
2000-03-15      0.017505  0.896200  0.836199  1.057811  1.022429  50.839800   
2000-03-16      0.045699  0.899007  0.839076  1.060436  1.024898  58.905469   
2000-03-17      0.028278  0.902829  0.842247  1.

[*********************100%%**********************]  1 of 1 completed


Enhanced data for MSFT:
                Open      High       Low    Close  Adj Close    Volume  \
Date                                                                     
2000-03-14  49.31250  49.62500  47.56250  47.5625  29.379612  73489200   
2000-03-15  47.28125  48.31250  46.84375  47.6875  29.456816  53208000   
2000-03-16  47.96875  48.34375  46.62500  47.6875  29.456816  77300800   
2000-03-17  47.62500  49.75000  47.25000  49.6875  30.692230  81161600   
2000-03-20  49.37500  49.87500  48.25000  48.6875  30.074514  47773000   

            Daily Return       MA20       MA50      EMA20      EMA50  \
Date                                                                   
2000-03-14     -0.029337  29.332311  31.390622  48.299270  50.541916   
2000-03-15      0.002628  29.283088  31.259745  48.241006  50.429978   
2000-03-16      0.000000  29.248342  31.153191  48.188291  50.322430   
2000-03-17      0.041940  29.244481  31.064010  48.331073  50.297531   
2000-03-20     -0.020126 

[*********************100%%**********************]  1 of 1 completed


Enhanced data for GOOGL:
                Open      High       Low     Close  Adj Close     Volume  \
Date                                                                       
2004-10-28  4.671672  4.864615  4.644645  4.837337   4.831791  593278128   
2004-10-29  4.977227  5.003754  4.769770  4.770771   4.765302  845653500   
2004-11-01  4.843594  4.946697  4.786537  4.905656   4.900032  488507004   
2004-11-02  4.974474  4.986236  4.838338  4.876627   4.871037  453398148   
2004-11-03  4.959459  5.045045  4.773524  4.796547   4.791049  554992452   

            Daily Return      MA20      MA50     EMA20     EMA50        RSI  \
Date                                                                          
2004-10-28      0.039415  3.783458  3.175705  3.894157  3.374084  81.201562   
2004-10-29     -0.013761  3.856022  3.220848  3.977644  3.428856  81.028702   
2004-11-01      0.028273  3.932224  3.264702  4.066026  3.486770  81.695338   
2004-11-02     -0.005917  4.002838  3.307430  4

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

Enhanced data for AMZN:
                Open      High      Low     Close  Adj Close     Volume  \
Date                                                                      
2000-03-14  3.303125  3.346875  3.16875  3.281250   3.281250   96106000   
2000-03-15  3.237500  3.256250  3.10000  3.187500   3.187500  121654000   
2000-03-16  3.112500  3.318750  3.00000  3.312500   3.312500  194062000   
2000-03-17  3.303125  3.318750  3.22500  3.240625   3.240625  105970000   
2000-03-20  3.175000  3.225000  3.11875  3.209375   3.209375  116270000   

            Daily Return      MA20      MA50     EMA20     EMA50        RSI  \
Date                                                                          
2000-03-14      0.004785  3.332656  3.458875  3.352678  3.555962  40.941180   
2000-03-15     -0.028571  3.307500  3.433250  3.336947  3.541513  41.134752   
2000-03-16      0.039216  3.296406  3.417562  3.334619  3.532532  44.911507   
2000-03-17     -0.021698  3.285937  3.412625  3.325667 




Enhanced data for TSLA:
                Open      High       Low     Close  Adj Close   Volume  \
Date                                                                     
2010-09-08  1.377333  1.396667  1.373333  1.393333   1.393333  4326000   
2010-09-09  1.400000  1.403333  1.379333  1.380667   1.380667  5643000   
2010-09-10  1.383333  1.395333  1.317333  1.344667   1.344667  5799000   
2010-09-13  1.392667  1.393333  1.366667  1.381333   1.381333  5412000   
2010-09-14  1.369333  1.440000  1.368667  1.408000   1.408000  9820500   

            Daily Return      MA20      MA50     EMA20     EMA50        RSI  \
Date                                                                          
2010-09-08      0.017527  1.301467  1.322240  1.337084  1.361035  67.149673   
2010-09-09     -0.009090  1.310833  1.318000  1.341234  1.361805  65.047002   
2010-09-10     -0.026074  1.319400  1.313120  1.341561  1.361133  58.093849   
2010-09-13      0.027268  1.327400  1.311467  1.345349  1.3619

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

Enhanced data for SPY:
                 Open       High        Low      Close  Adj Close    Volume  \
Date                                                                          
2000-03-14  139.28125  140.09375  136.15625  136.62500  87.637436   8263900   
2000-03-15  136.87500  140.43750  136.06250  139.81250  89.682022  10300800   
2000-03-16  141.62500  146.84375  140.87500  146.34375  93.871506  25601400   
2000-03-17  145.81250  148.00000  145.43750  146.93750  94.491852  10272900   
2000-03-20  146.87500  147.34375  144.78125  146.18750  94.009605  12502300   

            Daily Return       MA20       MA50       EMA20       EMA50  \
Date                                                                     
2000-03-14     -0.014205  88.323501  90.118433  138.537775  140.171821   
2000-03-15      0.023330  88.282907  90.046270  138.659177  140.157730   
2000-03-16      0.046715  88.518438  90.130860  139.391041  140.400319   
2000-03-17      0.006608  88.808037  90.224651  140.1


[*********************100%%**********************]  1 of 1 completed

Enhanced data for GLD:
                 Open       High        Low      Close  Adj Close   Volume  \
Date                                                                         
2005-01-31  42.209999  42.299999  41.959999  42.220001  42.220001  1692400   
2005-02-01  42.090000  42.139999  41.950001  42.099998  42.099998  1088900   
2005-02-02  42.220001  42.230000  42.020000  42.160000  42.160000   956500   
2005-02-03  41.560001  41.740002  41.500000  41.680000  41.680000  2446400   
2005-02-04  41.590000  41.650002  41.389999  41.470001  41.470001  1819400   

            Daily Return     MA20     MA50      EMA20      EMA50        RSI  \
Date                                                                          
2005-01-31     -0.011010  42.4450  43.6474  42.693424  43.296900  54.340858   
2005-02-01     -0.002842  42.3990  43.6018  42.636907  43.249963  48.148134   
2005-02-02      0.001425  42.3700  43.5494  42.591487  43.207219  41.666679   
2005-02-03     -0.011385  42.3205  


