In [2]:
from dotenv import load_dotenv
import os
import requests
import json
import gzip
import pandas as pd
import numpy as np
import glob
from datetime import date, time

In [4]:
load_dotenv()
path = '../data_files/'
years = ['2020', '2021', '2022', '2023', '2024', '2025']
mm=['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
params = {'function': 'TIME_SERIES_INTRADAY',
          'symbol': 'NEE',
          'interval': '1min',
          'month': '2025-01',
          'outputsize': 'full',
          'extended_hours': 'false',
          'apikey': os.getenv("API_KEY")
          }

In [5]:
symbols = pd.read_csv(f"{path}sp500_companies.csv")["Symbol"]
stocks = list(symbols)

In [4]:
def get_stock_data(years, stock, dir_path, start_new_csv):
    for year in years:
        if year == '2025':
            mm = ['01', '02', '03', '04', '05', '06', '07', '08', '09']
        else:
            mm = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
            # mm = ['01', '02']
        for m in mm:
            # print(m)
            params['month']=year+"-"+m
            url = "https://www.alphavantage.co/query?"
            for key, val in params.items():
                url+=key
                url+='='
                url+=val
                url+='&'
            url = url[:-1]
            # print(url)
            r = requests.get(url)
            data = r.json()
            # print(data)
            if "Error Message" in data:
                print(f"Error {stock}")
                os.rmdir(path+params['symbol'])
                with open(f'{path}s&p500_lost.txt', 'a') as file:
                    file.write(f"{params["symbol"], }")
                return
            df = pd.DataFrame.from_dict(data['Time Series (1min)'], orient='index')
            df = df.sort_index()
            # print(df.index)
            if start_new_csv:
                df1 = df
            else:
                df1 = pd.concat([df1, df], ignore_index=False)
            start_new_csv = False
    df1.to_csv(f"{dir_path + params['symbol']}_combined.csv.gz", compression='gzip', index=True)

In [5]:
#Get all returns data
for stock in stocks:
    start_new_csv = True
    print(stock)
    params['symbol'] = stock
    os.mkdir(path+params['symbol'])
    dir_path = path + f"{params['symbol']}/"
    get_stock_data(years, stock, dir_path, start_new_csv)

AAPL


FileExistsError: [Errno 17] File exists: '../data_files/AAPL'

In [6]:
with open(f'{path}s&p500_lost.txt', 'r') as f:
    num = f.readline().split(', ')
    missing_stocks = [x.replace('\'', '') for x in num]
    

In [7]:
missing_stocks

['PLTR',
 'FI',
 'GEV',
 'ABNB',
 'CEG',
 'CARR',
 'COR',
 'DFS',
 'KVUE',
 'HES',
 'OTIS',
 'GEHC',
 'ANSS',
 'SW',
 'WBD',
 'VLTO',
 'CPAY',
 'BF-B',
 'EG',
 'VTRS',
 'RVTY',
 'JNPR',
 'DAY',
 'SOLV',
 'WBA',
 'PARA',
 'AMTM']

In [8]:
stocks = [x for x in stocks if x not in missing_stocks]

In [14]:
len(stocks)

475

In [13]:
pd.DataFrame(stocks).to_csv("stocks_with_data.csv")

In [40]:
#Get daily volatility as target
#get moving averages for 1, 5, 22, and 252 days
MA = ["1", "5", "22", "44"]
for stock in stocks:
    dir_path = f"{path + stock}/"
    df = pd.read_csv(f"{dir_path+stock}_combined.csv.gz")
    
    returns = pd.DataFrame({
    "time_stamp": df['Unnamed: 0'],
    "log_return": np.log(1+(df['4. close']-df['1. open'])/df['1. open']),
    "volume": np.log(df['5. volume'])
    })
    returns['time_stamp'] = pd.to_datetime(returns['time_stamp'], errors='coerce')

    #volatility
    # daily_volatility = returns.groupby(returns['time_stamp'].dt.floor("1D"))["log_return"].std()
    # volatility_df = pd.DataFrame(daily_volatility).rename(columns={'log_return': 'volatility'})
    # volatility_df.to_csv(f"{path + stock}_DAILY_VOLATILITY.csv.gz", compression='gzip')

    #volatility moving average
    # vol_moving_average_df = pd.DataFrame()

    # for window in MA:
    #     column_name = f'MA_{window}'
    #     vol_moving_average_df[column_name] = volatility_df['volatility'].shift(1).rolling(window=int(window)).mean()

    # vol_moving_average_df.to_csv(f"{path + stock}_MOVING_AVERAGE.csv.gz", compression='gzip')

    # #log returns moving average
    # daily_returns = pd.Da

    moving_average_types = ['volatility', 'log_return', 'volume']
    for type in moving_average_types:
        if type == "volatility":
            daily_value = returns.groupby(returns['time_stamp'].dt.floor("1D"))["log_return"].std()
            df = pd.DataFrame(daily_value).rename(columns={'log_return': 'volatility'})
            df.to_csv(f"{dir_path + stock}_DAILY_VOLATILITY.csv.gz", compression='gzip')
            
        else:
            daily_value = returns.groupby(returns['time_stamp'].dt.floor("1D"))[type].sum()
            df = pd.DataFrame(daily_value).rename(columns={'log_return': type})
        moving_average_df = pd.DataFrame()
        for window in MA:
            column_name = f'MA_{window}_{type}'
            moving_average_df[column_name] = df[type].shift(1).rolling(window=int(window)).mean()
        moving_average_df.to_csv(f"{dir_path+stock}_{type}_MOVING_AVERAGE.csv.gz", compression='gzip')

In [9]:
secondary_metrics = ["EARNINGS", "INCOME_STATEMENT", "BALANCE_SHEET", "CASH_FLOW", "SHARES_OUTSTANDING"]

In [10]:
def year_range(df, time_name, metric):
    df[time_name] = pd.to_datetime(df[time_name])
    if metric == "EARNINGS":
        df = df[df[time_name].dt.year > 2019]
    if metric == "SHARES_OUTSTANDING":
        df = df[df[time_name].dt.date < date(2025, 6, 1)]
    return df

In [11]:
def get_NA_columns(df, na_column_set):
    na_columns = df.columns[df.isna().any()].tolist()
    na_column_set.update(na_columns)
    return na_column_set

In [30]:
import time
def safe_get(url, retries=5):
    time.sleep(1)
    for i in range(retries):
        r = requests.get(url)
        if r.status_code == 200 and len(r.text)>5:
            return r
        time.sleep(0.5)
    return r

In [26]:
url = f'https://www.alphavantage.co/query?function=SHARES_OUTSTANDING&symbol=OIL&apikey={os.environ["API_KEY"]}'
# print(url)
r = safe_get(url)
# print(r)
data = r.json()
data

{}

In [43]:
def get_secondary_metric(stock, secondary_metrics, invalid_requests):
    dir_path = f"{path + stock}/"
    for metric in secondary_metrics:
        # print(metric)
        url = f'https://www.alphavantage.co/query?function={metric}&symbol={stock}&apikey={os.environ["API_KEY"]}'
        # print(url)
        r = safe_get(url)
        # print(r)
        data = r.json()
        print(data)
        if ("status" in data.keys() and data["status"] == 'invalid request'):
            invalid_requests.append(stock)
            return
        if metric == "EARNINGS":
            df = year_range(pd.DataFrame(data['quarterlyEarnings']), 'reportedDate', metric)
            df = df[['reportedDate', 'surprise', 'surprisePercentage']]
            df = df.rename(columns={'reportedDate': 'time_stamp'})
        elif metric == "SHARES_OUTSTANDING":
            df = year_range(pd.DataFrame(data['data']), 'date', metric)
            df = df.rename(columns={'date': 'time_stamp'})
        else:
            df = year_range(pd.DataFrame(data['annualReports']), 'fiscalDateEnding', metric)
            df = df.rename(columns={'fiscalDateEnding': 'time_stamp'})

        prefix = f"{metric}: "
        df.columns = [
            f"{prefix}{col}" if col != "time_stamp" else col
            for col in df.columns
        ]
        df = df.replace(["None", "null", "NA", "NaN", ""], pd.NA)

        df.to_csv(f"{dir_path + stock}_{metric}.csv.gz", compression='gzip')

        # na_column_set = get_NA_columns(df, na_column_set)
        # key_name = f"{stock}: {metric}"
        # secondary_dfs[key_name] = df        


In [44]:
requested_stock = []
invalid_requests = []
for stock in stocks:
    print(stock)
    requested_stock.append(stock)
    get_secondary_metric(stock, secondary_metrics, invalid_requests)
    # print(stock)
    # for metric in secondary_metrics:
    #     # print(metric)
    #     url = f'https://www.alphavantage.co/query?function={metric}&symbol={stock}&apikey={os.environ["API_KEY"]}'
    #     # print(url)
    #     r = requests.get(url)
    #     # print(r)
    #     data = r.json()
    #     # print(data)
    #     if data['status']=='invalid request':
    #         invalid_requests.append(stock)
            
    #     if metric == "EARNINGS":
    #         df = year_range(pd.DataFrame(data['quarterlyEarnings']), 'reportedDate', metric)
    #         df = df[['reportedDate', 'surprise', 'surprisePercentage']]
    #         df = df.rename(columns={'reportedDate': 'time_stamp'})
    #     elif metric == "SHARES_OUTSTANDING":
    #         df = year_range(pd.DataFrame(data['data']), 'date', metric)
    #         df = df.rename(columns={'date': 'time_stamp'})
    #     else:
    #         df = year_range(pd.DataFrame(data['annualReports']), 'fiscalDateEnding', metric)
    #         df = df.rename(columns={'fiscalDateEnding': 'time_stamp'})

    #     prefix = f"{metric}: "
    #     df.columns = [
    #         f"{prefix}{col}" if col != "time_stamp" else col
    #         for col in df.columns
    #     ]
    #     df = df.replace(["None", "null", "NA", "NaN", ""], pd.NA)

    #     na_column_set = get_NA_columns(df, na_column_set)
    #     key_name = f"{stock}: {metric}"
    #     secondary_dfs[key_name] = df        


BWA
{'symbol': 'BWA', 'annualEarnings': [{'fiscalDateEnding': '2025-09-30', 'reportedEPS': '3.56'}, {'fiscalDateEnding': '2024-12-31', 'reportedEPS': '1.46'}, {'fiscalDateEnding': '2023-12-31', 'reportedEPS': '4.32'}, {'fiscalDateEnding': '2022-12-31', 'reportedEPS': '4.6'}, {'fiscalDateEnding': '2021-12-31', 'reportedEPS': '4.15'}, {'fiscalDateEnding': '2020-12-31', 'reportedEPS': '2.69'}, {'fiscalDateEnding': '2019-12-31', 'reportedEPS': '4.13'}, {'fiscalDateEnding': '2018-12-31', 'reportedEPS': '4.49'}, {'fiscalDateEnding': '2017-12-31', 'reportedEPS': '3.89'}, {'fiscalDateEnding': '2016-12-31', 'reportedEPS': '3.27'}, {'fiscalDateEnding': '2015-12-31', 'reportedEPS': '3.01'}, {'fiscalDateEnding': '2014-12-31', 'reportedEPS': '3.26'}, {'fiscalDateEnding': '2013-12-31', 'reportedEPS': '2.89'}, {'fiscalDateEnding': '2012-12-31', 'reportedEPS': '2.5'}, {'fiscalDateEnding': '2011-12-31', 'reportedEPS': '2.23'}, {'fiscalDateEnding': '2010-12-31', 'reportedEPS': '1.53'}, {'fiscalDateEndin

In [39]:
# requested_stock.remove("BWA")
requested_stock

['AAPL',
 'NVDA',
 'MSFT',
 'AMZN',
 'GOOGL',
 'GOOG',
 'META',
 'TSLA',
 'AVGO',
 'BRK-B',
 'WMT',
 'LLY',
 'JPM',
 'V',
 'MA',
 'ORCL',
 'XOM',
 'UNH',
 'COST',
 'PG',
 'HD',
 'NFLX',
 'JNJ',
 'BAC',
 'CRM',
 'ABBV',
 'KO',
 'TMUS',
 'CVX',
 'MRK',
 'WFC',
 'CSCO',
 'ACN',
 'NOW',
 'AXP',
 'MCD',
 'PEP',
 'BX',
 'IBM',
 'DIS',
 'LIN',
 'TMO',
 'MS',
 'ABT',
 'ADBE',
 'AMD',
 'PM',
 'ISRG',
 'GE',
 'INTU',
 'GS',
 'CAT',
 'TXN',
 'QCOM',
 'VZ',
 'BKNG',
 'DHR',
 'T',
 'BLK',
 'RTX',
 'SPGI',
 'PFE',
 'HON',
 'NEE',
 'CMCSA',
 'ANET',
 'AMGN',
 'PGR',
 'LOW',
 'SYK',
 'UNP',
 'TJX',
 'KKR',
 'SCHW',
 'ETN',
 'AMAT',
 'BA',
 'BSX',
 'C',
 'UBER',
 'COP',
 'PANW',
 'ADP',
 'DE',
 'BMY',
 'LMT',
 'GILD',
 'NKE',
 'CB',
 'UPS',
 'ADI',
 'MMC',
 'MDT',
 'VRTX',
 'MU',
 'SBUX',
 'PLD',
 'LRCX',
 'MO',
 'SO',
 'EQIX',
 'CRWD',
 'PYPL',
 'SHW',
 'ICE',
 'CME',
 'AMT',
 'APH',
 'ELV',
 'TT',
 'MCO',
 'CMG',
 'INTC',
 'KLAC',
 'DUK',
 'PH',
 'CDNS',
 'WM',
 'DELL',
 'MDLZ',
 'MAR',
 'MSI',
 'WEL

In [40]:
stocks = [x for x in stocks if x not in requested_stock]
stocks

['BWA', 'QRVO', 'FMC']

In [42]:
for i in stocks:
    dir_path = f"{path+i}/"
    for metric in secondary_metrics:
        file = f"{dir_path+i}_{metric}.csv.gz"
        print(file)
        if os.path.exists(file):
            print(file)
            os.remove(file)

../data_files/BWA/BWA_EARNINGS.csv.gz
../data_files/BWA/BWA_EARNINGS.csv.gz
../data_files/BWA/BWA_INCOME_STATEMENT.csv.gz
../data_files/BWA/BWA_INCOME_STATEMENT.csv.gz
../data_files/BWA/BWA_BALANCE_SHEET.csv.gz
../data_files/BWA/BWA_BALANCE_SHEET.csv.gz
../data_files/BWA/BWA_CASH_FLOW.csv.gz
../data_files/BWA/BWA_SHARES_OUTSTANDING.csv.gz
../data_files/QRVO/QRVO_EARNINGS.csv.gz
../data_files/QRVO/QRVO_INCOME_STATEMENT.csv.gz
../data_files/QRVO/QRVO_BALANCE_SHEET.csv.gz
../data_files/QRVO/QRVO_CASH_FLOW.csv.gz
../data_files/QRVO/QRVO_SHARES_OUTSTANDING.csv.gz
../data_files/FMC/FMC_EARNINGS.csv.gz
../data_files/FMC/FMC_INCOME_STATEMENT.csv.gz
../data_files/FMC/FMC_BALANCE_SHEET.csv.gz
../data_files/FMC/FMC_CASH_FLOW.csv.gz
../data_files/FMC/FMC_SHARES_OUTSTANDING.csv.gz


In [29]:
na_column_set

{'BALANCE_SHEET: accumulatedDepreciationAmortizationPPE',
 'BALANCE_SHEET: capitalLeaseObligations',
 'BALANCE_SHEET: cashAndCashEquivalentsAtCarryingValue',
 'BALANCE_SHEET: cashAndShortTermInvestments',
 'BALANCE_SHEET: commonStock',
 'BALANCE_SHEET: currentAccountsPayable',
 'BALANCE_SHEET: currentDebt',
 'BALANCE_SHEET: currentLongTermDebt',
 'BALANCE_SHEET: currentNetReceivables',
 'BALANCE_SHEET: deferredRevenue',
 'BALANCE_SHEET: goodwill',
 'BALANCE_SHEET: intangibleAssets',
 'BALANCE_SHEET: intangibleAssetsExcludingGoodwill',
 'BALANCE_SHEET: inventory',
 'BALANCE_SHEET: investments',
 'BALANCE_SHEET: longTermDebt',
 'BALANCE_SHEET: longTermDebtNoncurrent',
 'BALANCE_SHEET: longTermInvestments',
 'BALANCE_SHEET: otherCurrentAssets',
 'BALANCE_SHEET: otherCurrentLiabilities',
 'BALANCE_SHEET: otherNonCurrentAssets',
 'BALANCE_SHEET: otherNonCurrentLiabilities',
 'BALANCE_SHEET: propertyPlantEquipment',
 'BALANCE_SHEET: reportedCurrency',
 'BALANCE_SHEET: retainedEarnings',
 'BA

In [None]:
na_column_list = na_column_set

for stock in stocks:
    for metric in secondary_metrics:
        key_name = f"{stock}: {metric}"
        sublist = [s for s in na_column_list if metric in s]
        df = secondary_dfs[key_name]
        df = df.drop(columns=list(sublist), axis=1)
        df.to_csv(f"{path + stock}_{metric}.csv.gz", compression='gzip')

In [None]:
file_path = path+'calendar.txt'
cal_dict = {}
try:
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, 1):
            value = line.strip()
            if value[0]!='2':
                key=value
                cal_dict[key] = []
            else:
                yy=value[:4]
                mm=value[4:6]
                dd=value[6:]
                cal_dict[key].append(yy+'-'+mm+'-'+dd)
            print(f"Line {line_number}: {line.strip()}")
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
for key in list(cal_dict.keys()):
    df = pd.DataFrame(cal_dict[key]).rename(columns={0: "time_stamp"})
    df.to_csv(f"{path+key}_CALDATES.csv.gz", compression="gzip")
