In [1]:
import pandas as pd
from functools import reduce
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder

In [2]:
path = "data_files/"
column_names = ['Calendar Day', 'Stock', 'Moving Average', 'Balance Sheet', 'Cash Flow', 'Income Statement',
                'Shares Outstanding']
metrics = ["BALANCE_SHEET", "CASH_FLOW", "INCOME_STATEMENT", "SHARES_OUTSTANDING"]
stock_dataframe = pd.read_csv(f"data/stocks_with_data.csv")
stocks = ["AAPL", "LLY", "NEE"]
daily_metrics = ["DAILY_VOLATILITY", "log_return_MOVING_AVERAGE", "volatility_MOVING_AVERAGE", "volume_MOVING_AVERAGE"]
yearly_metrics = ["BALANCE_SHEET", "CASH_FLOW", "INCOME_STATEMENT"]
years = ['2020', '2021', '2022', '2023', '2024', '2025']

In [3]:
calendar_codes = {"FOMC": 1, "HalfDays": 2, "MonthEnd": 3, "MSCIrebal": 4, "OptionExpiry": 5, "QuarterEnd": 6, "RusselRebal": 7, "TripleWitch": 8}

In [4]:
def combine_datasets_by_stock(stock, daily_metrics):
    stock_dataframes = []
    for metric in daily_metrics:
        df = pd.read_csv(f"{path+stock+"/"+ stock}_{metric}.csv.gz")
        stock_dataframes.append(df)
    merged_df = reduce(lambda left, right: pd.merge(left, right, on='time_stamp', how='outer'), stock_dataframes)
    merged_df['time_stamp'] = pd.to_datetime(merged_df['time_stamp'])
    merged_df.insert(loc=0, column='Stock', value=stock)
    return merged_df
    

In [7]:
first_df = pd.DataFrame()
for day in list(calendar_codes.keys()):
    calendar_df = pd.read_csv(f"{path+day}_CALDATES.csv.gz")
    calendar_df["Day"] = day
    calendar_df["day_code"] = calendar_codes[day]
    break
calendar_df

Unnamed: 0.1,Unnamed: 0,time_stamp,Day,day_code
0,0,2020-01-29,FOMC,1
1,1,2020-03-18,FOMC,1
2,2,2020-04-29,FOMC,1
3,3,2020-06-10,FOMC,1
4,4,2020-07-29,FOMC,1
5,5,2020-09-16,FOMC,1
6,6,2020-11-05,FOMC,1
7,7,2020-12-16,FOMC,1
8,8,2021-01-27,FOMC,1
9,9,2021-03-17,FOMC,1


In [5]:
def add_cal_dates(calendar_codes, df):
    df["CALENDAR_DAYS"] = 0
    for day in list(calendar_codes.keys()):
        calendar_df = pd.read_csv(f"{path+day}_CALDATES.csv.gz")
        for i in calendar_df['time_stamp']:
            df.loc[df['time_stamp'] == i, "CALENDAR_DAYS"] = calendar_codes[day]
    return df

In [8]:
#keep track of which quarterly earnings release they are on
#when it passes, go to 0

def earnings_release(df, stock):
    quarterly_earnings_df = pd.read_csv(f'{path+stock+"/"+ stock}_EARNINGS.csv.gz')
    df["QUARTERLY_EARNINGS"] = 5
    encoding_dates_df = pd.DataFrame()
    quarterly_earnings_df['time_stamp'] = pd.to_datetime(quarterly_earnings_df['time_stamp'])
    quarterly_earnings_df = quarterly_earnings_df.sort_values(by='time_stamp', ascending=True)
    for earning_release_date in quarterly_earnings_df['time_stamp']:
        df_backward = pd.DataFrame()
        dates_backward = pd.date_range(end=earning_release_date, periods=6, freq="D")
        df_backward["time_stamp"] = dates_backward
        df_backward["QUARTERLY_EARNINGS"] = (earning_release_date - df_backward["time_stamp"]).dt.days
        df_forward = pd.DataFrame()
        dates_forward = pd.date_range(start=earning_release_date, periods=5, freq="D")
        df_forward["time_stamp"] = dates_forward
        df_forward["QUARTERLY_EARNINGS"] = (earning_release_date - df_forward["time_stamp"]).dt.days
        combined = pd.concat([df_forward, df_backward]).sort_values(by="time_stamp", ascending=True).drop_duplicates()
        encoding_dates_df = pd.concat([encoding_dates_df, combined])
    mapping = dict(zip(encoding_dates_df["time_stamp"], encoding_dates_df["QUARTERLY_EARNINGS"]))
    df.loc[df["time_stamp"].isin(mapping.keys()), "QUARTERLY_EARNINGS"] = df.loc[
        df["time_stamp"].isin(mapping.keys()), "time_stamp"
    ].map(mapping)
    return df

In [108]:
def add_yearly_metrics_revised(df, yearly_metrics, stock, years):
    metric_dataframes = []
    for metric in yearly_metrics:
        file_path = f"{path}{stock}_{metric}.csv.gz"
        yearly_metric_df = (
            pd.read_csv(file_path)
            .drop(columns=['Unnamed: 0'], errors='ignore')
            # .dropna(axis=1)
        )
        yearly_metric_df["time_stamp"] = pd.to_datetime(yearly_metric_df["time_stamp"])
        year_dfs = []
        all_dates_df = pd.DataFrame({"time_stamp": df["time_stamp"]})

        for year in years:
            mask = all_dates_df["time_stamp"].dt.year == int(year)
            year_specific_df = all_dates_df.loc[mask].copy()
            year_specific_data = yearly_metric_df[
                yearly_metric_df["time_stamp"].dt.year == int(year)
            ].drop(columns=["time_stamp"], errors="ignore")

            if not year_specific_data.empty:
                for measurement in year_specific_data.columns:
                    value = year_specific_data[measurement].values[0]
                    year_specific_df.loc[:, measurement] = value
            year_dfs.append(year_specific_df)
        metric_df = pd.concat(year_dfs, ignore_index=False)
        metric_dataframes.append(metric_df)

    all_metrics_df = reduce(
        lambda left, right: pd.merge(left, right, on="time_stamp", how="outer"),
        metric_dataframes
    )
    result_df = pd.merge(df, all_metrics_df, on="time_stamp", how="inner")
    return result_df

In [109]:
shares_df = pd.read_csv(f"data_files/AAPL_SHARES_OUTSTANDING.csv.gz")
shares_df = shares_df.sort_values(by='time_stamp', ascending=True).reset_index(drop=True)
cal_shares = pd.DataFrame()
print(list(shares_df["time_stamp"])[-1])
for i in range(len(list(shares_df["time_stamp"]))):
    if i+1 == len(list(shares_df["time_stamp"])):
        # set_end = list(df['time_stamp'])[-1]
        set_end = pd.Timestamp("2025-12-12")
    else:
        set_end = pd.Timestamp(shares_df["time_stamp"][i+1])
    # print(pd.Timestamp(shares_df["time_stamp"][i]))
    # print(set_end)
    dates = pd.date_range(start=pd.Timestamp(shares_df["time_stamp"][i]), end=set_end, freq="D")
    # print(dates)
    inst_df = pd.DataFrame({
        "time_stamp": dates,
        "SHARES_OUTSTANDING_BASIC": shares_df["SHARES_OUTSTANDING: shares_outstanding_basic"][i]
    })
    cal_shares = pd.concat([cal_shares, inst_df])

#     df["Month-Year"] = str(df["time_stamp"].dt.month + "-" df["time_stamp"].dt.year)
#     month_year_df = pd.DataFrame({"Month-Year": df["Month-Year"]})
    

2025-06-28


In [110]:
def add_shares_outstanding(stock, df):
    shares_df = pd.read_csv(f"{path+stock}_SHARES_OUTSTANDING.csv.gz")
    shares_df = shares_df.sort_values(by='time_stamp', ascending=True).reset_index(drop=True)
    cal_shares = pd.DataFrame()
    for i in range(len(list(shares_df["time_stamp"]))):
        if i+1 == len(list(shares_df["time_stamp"])):
            # set_end = list(df['time_stamp'])[-1]
            set_end = pd.Timestamp("2025-12-12")
        else:
            set_end = pd.Timestamp(shares_df["time_stamp"][i+1])
        # print(pd.Timestamp(shares_df["time_stamp"][i]))
        # print(set_end)
        dates = pd.date_range(start=pd.Timestamp(shares_df["time_stamp"][i]), end=set_end, freq="D")
        # print(dates)
        inst_df = pd.DataFrame({
            "time_stamp": dates,
            "SHARES_OUTSTANDING_BASIC": shares_df["SHARES_OUTSTANDING: shares_outstanding_basic"][i]
        })
        cal_shares = pd.concat([cal_shares, inst_df])
    df = pd.merge(df, cal_shares, on="time_stamp", how="inner")
    return df
    

In [24]:
len(stocks)

475

In [None]:
stock_df = []
for stock in stocks:
    df = combine_datasets_by_stock(stock, daily_metrics)
    df = add_cal_dates(calendar_codes, df)
    df = earnings_release(df, stock)
    # df = add_yearly_metrics_revised(df, yearly_metrics, stock, years)
    # df = add_shares_outstanding(stock, df)
    
    
    stock_df.append(df)
final_dataset = pd.concat(stock_df, axis=0, ignore_index=False).dropna()
le = LabelEncoder()
final_dataset["Stock"] = le.fit_transform(final_dataset["Stock"])
# final_dataset = final_dataset.drop(columns=['BALANCE_SHEET: reportedCurrency', 'INCOME_STATEMENT: reportedCurrency', 'CASH_FLOW: reportedCurrency'])
# testing_years = [2020,2021,2022,2023,2024]
# training_dataset = final_dataset[final_dataset["time_stamp"].dt.year.isin(testing_years)]
# testing_dataset = final_dataset[final_dataset["time_stamp"].dt.year==2025]

# final_dataset['time_stamp'] = final_dataset['time_stamp'].astype('int64') // 10**9
# final_dataset.to_csv("final_dataset5.csv")


In [10]:
final_dataset.to_csv("final_dataset_LSTM.csv.gz")

In [25]:
training_dataset["Stock"].max()

474

In [26]:
training_dataset['time_stamp'] = training_dataset['time_stamp'].astype('int64') // 10**9
training_dataset.to_csv('training_dataset_tech.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_dataset['time_stamp'] = training_dataset['time_stamp'].astype('int64') // 10**9


In [27]:
testing_dataset['time_stamp'] = testing_dataset['time_stamp'].astype('int64') // 10**9
testing_dataset.to_csv('testing_dataset_tech.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing_dataset['time_stamp'] = testing_dataset['time_stamp'].astype('int64') // 10**9
