In [None]:
import os
import requests
import json
import gzip
import pandas as pd
import numpy as np
import glob
from datetime import date, time

In [None]:
# Shared configuration: output directory, tickers, and the year/month grid.
path = '../data_files/'
stocks = ['LLY', 'AAPL', 'NEE']
years = [str(yr) for yr in range(2020, 2026)]
mm = [f"{month_no:02d}" for month_no in range(1, 13)]

# Query-string template for the intraday request. The 'symbol' and 'month'
# entries are placeholders that get overwritten before each request is sent.
params = dict(
    function='TIME_SERIES_INTRADAY',
    symbol='NEE',
    interval='1min',
    month='2025-01',
    outputsize='full',
    extended_hours='false',
    apikey=os.environ["API_KEY"],  # read from the environment, never hardcoded
)

In [None]:
# Download every month of 1-minute bars for each ticker and write one combined,
# gzip-compressed CSV per ticker, indexed by the bar timestamp.
for stock in stocks:
    params['symbol'] = stock
    # Monthly frames are appended in chronological order and concatenated once,
    # instead of re-copying the growing frame on every iteration.
    monthly_frames = []
    for year in years:
        # 2025 is only requested through September.
        if year == '2025':
            mm = ['01', '02', '03', '04', '05', '06', '07', '08', '09']
        else:
            mm = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
        for m in mm:
            params['month'] = year + "-" + m
            # Let requests build and URL-encode the query string; the timeout
            # keeps a stalled connection from hanging the notebook.
            r = requests.get("https://www.alphavantage.co/query", params=params, timeout=60)
            r.raise_for_status()
            data = r.json()
            if 'Time Series (1min)' not in data:
                # Only the payload keys are reported so that nothing from the
                # response body ends up in the notebook output.
                raise RuntimeError(
                    f"No intraday data returned for {stock} {params['month']}; "
                    f"response keys: {list(data)}"
                )
            df = pd.DataFrame.from_dict(data['Time Series (1min)'], orient='index')
            monthly_frames.append(df.sort_index())

    df1 = pd.concat(monthly_frames, ignore_index=False)
    df1.to_csv(f"{path + params['symbol']}_combined.csv.gz", compression='gzip', index=True)



In [None]:
# Exploratory look at one ticker (LLY): per-day standard deviation and per-day
# sum of the 1-minute open-to-close log returns.
df = pd.read_csv(f"{path}LLY_combined.csv.gz")

open_px = df['1. open']
close_px = df['4. close']
returns = pd.DataFrame({
    "time_stamp": df['Unnamed: 0'],
    "log_return": np.log(1+(close_px-open_px)/open_px),
    "volume": df['5. volume'],
})

moving_average_types = ['volatility', 'daily_returns', 'daily_volu']
# Unparseable timestamps become NaT rather than raising.
returns['time_stamp'] = pd.to_datetime(returns['time_stamp'], errors='coerce')

# Floor each bar's timestamp to midnight so bars group by calendar day.
trading_day = returns['time_stamp'].dt.floor("1D")

daily_volatility = returns.groupby(trading_day)["log_return"].std()
volatility_df = daily_volatility.to_frame().rename(columns={'log_return': 'volatility'})

daily_returns = returns.groupby(trading_day)["log_return"].sum()

# Last expression of the cell: rendered as the cell output.
daily_returns.to_frame()

In [120]:
# Get daily volatility as the target, plus lagged moving averages of daily
# volatility, daily summed log return and daily summed volume.
# Moving-average windows are 1, 5, 22 and 44 days (rows of the daily series).
MA = ["1", "5", "22", "44"]
moving_average_types = ['volatility', 'log_return', 'volume']
for stock in stocks:
    df = pd.read_csv(f"{path + stock}_combined.csv.gz")

    returns = pd.DataFrame({
        "time_stamp": df['Unnamed: 0'],
        "log_return": np.log(1+(df['4. close']-df['1. open'])/df['1. open']),
        "volume": df['5. volume']
    })
    # Unparseable timestamps become NaT rather than raising.
    returns['time_stamp'] = pd.to_datetime(returns['time_stamp'], errors='coerce')

    # Group the 1-minute bars by calendar day once and reuse it for every series.
    bars_by_day = returns.groupby(returns['time_stamp'].dt.floor("1D"))

    for series_name in moving_average_types:
        if series_name == "volatility":
            # Daily volatility = standard deviation of that day's 1-minute log returns.
            daily_df = pd.DataFrame(bars_by_day["log_return"].std()).rename(
                columns={'log_return': 'volatility'}
            )
            daily_df.to_csv(f"{path + stock}_DAILY_VOLATILITY.csv.gz", compression='gzip')
        else:
            # Log return and volume are summed over the day.
            daily_df = pd.DataFrame(bars_by_day[series_name].sum())

        # shift(1) lags the series by one day, so each average uses prior days only.
        lagged = daily_df[series_name].shift(1)
        moving_average_df = pd.DataFrame()
        for window in MA:
            column_name = f'MA_{window}_{series_name}'
            moving_average_df[column_name] = lagged.rolling(window=int(window)).mean()
        moving_average_df.to_csv(f"{path+stock}_{series_name}_MOVING_AVERAGE.csv.gz", compression='gzip')

In [None]:
# Values passed as the `function` query parameter when downloading the
# per-ticker fundamentals reports.
secondary_metrics = [
    "EARNINGS",
    "INCOME_STATEMENT",
    "BALANCE_SHEET",
    "CASH_FLOW",
    "SHARES_OUTSTANDING",
]

In [None]:
def year_range(df, time_name, metric):
    """Parse a date column and keep only the rows in the window used for `metric`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input records. The frame passed in is not modified.
    time_name : str
        Name of the column holding the dates; it is converted with
        ``pd.to_datetime``.
    metric : str
        ``"EARNINGS"`` keeps rows whose year is after 2019.
        ``"SHARES_OUTSTANDING"`` keeps rows dated before 2025-06-01.
        Any other value keeps every row.

    Returns
    -------
    pandas.DataFrame
        A new frame with `time_name` as datetimes and the filter applied.
    """
    # Work on a copy so the caller's frame keeps its original column.
    df = df.copy()
    df[time_name] = pd.to_datetime(df[time_name])
    if metric == "EARNINGS":
        df = df[df[time_name].dt.year > 2019]
    if metric == "SHARES_OUTSTANDING":
        # A Series has no `.date` attribute, so compare the datetime column
        # against a Timestamp directly.
        df = df[df[time_name] < pd.Timestamp(2025, 6, 1)]
    return df

In [None]:
def get_NA_columns(df, na_column_set):
    """Add the names of all columns of `df` that contain a missing value.

    `na_column_set` is updated in place and the same object is returned.
    """
    # One boolean per column: True when that column holds at least one NA.
    missing_mask = df.isna().any()
    flagged = [
        name
        for name, has_missing in zip(df.columns.tolist(), missing_mask.tolist())
        if has_missing
    ]
    na_column_set.update(flagged)
    return na_column_set

In [None]:
# Download each fundamentals report for each ticker, normalise it to a frame
# with a `time_stamp` column plus metric-prefixed value columns, and collect the
# names of all columns that contain a missing value in any download.
na_column_set = set()
secondary_dfs = dict()

# metric -> (payload key holding the records, name of the date column).
# Metrics not listed here use the annual-report layout.
report_layout = {
    "EARNINGS": ('quarterlyEarnings', 'reportedDate'),
    "SHARES_OUTSTANDING": ('data', 'date'),
}

for stock in stocks:
    for metric in secondary_metrics:
        # Let requests build and URL-encode the query string; the timeout
        # keeps a stalled connection from hanging the notebook.
        r = requests.get(
            'https://www.alphavantage.co/query',
            params={'function': metric, 'symbol': stock, 'apikey': os.environ["API_KEY"]},
            timeout=60,
        )
        r.raise_for_status()
        data = r.json()

        records_key, date_column = report_layout.get(metric, ('annualReports', 'fiscalDateEnding'))
        if records_key not in data:
            # Only the payload keys are reported so that nothing from the
            # response body ends up in the notebook output.
            raise RuntimeError(
                f"No '{records_key}' returned for {stock} {metric}; response keys: {list(data)}"
            )

        # year_range requires the metric as its third argument to pick the date window.
        df = year_range(pd.DataFrame(data[records_key]), date_column, metric)
        if metric == "EARNINGS":
            df = df[[date_column, 'surprise', 'surprisePercentage']]
        df = df.rename(columns={date_column: 'time_stamp'})

        # Prefix every value column with its metric so names stay unique across reports.
        prefix = f"{metric}: "
        df.columns = [
            f"{prefix}{col}" if col != "time_stamp" else col
            for col in df.columns
        ]
        # Turn the listed placeholder strings into real missing values.
        df = df.replace(["None", "null", "NA", "NaN", ""], pd.NA)

        na_column_set = get_NA_columns(df, na_column_set)
        key_name = f"{stock}: {metric}"
        secondary_dfs[key_name] = df


In [None]:
# For every ticker/metric frame, drop each column of that metric that was
# flagged as containing missing values (for any ticker), then save the result
# as a gzip-compressed CSV.
na_column_list = na_column_set

for stock in stocks:
    for metric in secondary_metrics:
        key_name = f"{stock}: {metric}"
        # Match the exact "<metric>: " column prefix rather than any substring,
        # so only this metric's own columns are selected.
        prefix = f"{metric}: "
        sublist = [s for s in na_column_list if s.startswith(prefix)]
        df = secondary_dfs[key_name]
        # errors="ignore": a column flagged from another ticker's frame may be
        # absent here, in which case there is nothing to drop.
        df = df.drop(columns=sublist, errors="ignore")
        df.to_csv(f"{path + stock}_{metric}.csv.gz", compression='gzip')

In [None]:
# Parse calendar.txt into {label: ["YYYY-MM-DD", ...]}.
# A line starting with '2' is read as a YYYYMMDD date and is filed under the
# most recent label line; any other non-blank line starts a new label.
file_path = path+'calendar.txt'
cal_dict = {}
try:
    with open(file_path, 'r') as file:
        current_label = None
        for line_number, line in enumerate(file, 1):
            value = line.strip()
            if not value:
                # Blank lines carry no information; skip them instead of
                # failing on value[0].
                continue
            if not value.startswith('2'):
                current_label = value
                cal_dict[current_label] = []
            elif current_label is None:
                # A date with no preceding label cannot be filed anywhere.
                print(f"Line {line_number}: date '{value}' appears before any label; skipped.")
            else:
                cal_dict[current_label].append(f"{value[:4]}-{value[4:6]}-{value[6:]}")
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# Write each calendar's dates to its own gzip-compressed CSV, with the dates
# in a `time_stamp` column.
for label, dates in cal_dict.items():
    dates_df = pd.DataFrame(dates).rename(columns={0: "time_stamp"})
    dates_df.to_csv(f"{path+label}_CALDATES.csv.gz", compression="gzip")
