In [None]:
import pandas as pd
from functools import reduce
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder

In [None]:
# Directory holding every input CSV. File names are built by plain string
# concatenation (e.g. `path + stock`), so the trailing slash is required.
path = "data_files/"

# Human-readable column labels (not referenced by the cells visible in this notebook).
column_names = [
    'Calendar Day',
    'Stock',
    'Moving Average',
    'Balance Sheet',
    'Cash Flow',
    'Income Statement',
    'Shares Outstanding',
]

# Full list of fundamental-data file suffixes (not referenced by the cells visible here).
metrics = [
    "BALANCE_SHEET",
    "CASH_FLOW",
    "INCOME_STATEMENT",
    "SHARES_OUTSTANDING",
]

# Stock symbols; each one is used as a file-name prefix.
stocks = [
    "AAPL",
    "LLY",
    "NEE",
]

# File suffixes of the per-day metric files that are merged on `time_stamp`.
daily_metrics = [
    "DAILY_VOLATILITY",
    "log_return_MOVING_AVERAGE",
    "volatility_MOVING_AVERAGE",
    "volume_MOVING_AVERAGE",
]

# File suffixes of the per-year metric files that are broadcast onto daily rows.
yearly_metrics = [
    "BALANCE_SHEET",
    "CASH_FLOW",
    "INCOME_STATEMENT",
]

# Calendar years (as strings) for which yearly metrics are attached: 2020 through 2025.
years = [str(calendar_year) for calendar_year in range(2020, 2026)]

In [None]:
# Integer code written to the CALENDAR_DAYS column for each calendar-event file
# ("<name>_CALDATES.csv.gz"). Codes run 1..8 in the order the names are listed.
calendar_codes = dict(zip(
    [
        "FOMC",
        "HalfDays",
        "MonthEnd",
        "MSCIrebal",
        "OptionExpiry",
        "QuarterEnd",
        "RusselRebal",
        "TripleWitch",
    ],
    range(1, 9),
))

In [None]:
def combine_datasets_by_stock(stock, daily_metrics):
    """Load one stock's daily metric files and outer-join them on ``time_stamp``.

    Each metric is read from ``<path><stock>_<metric>.csv.gz``, where ``path``
    is the module-level data directory.

    Args:
        stock: Symbol used as the file-name prefix and as the value of the
            ``Stock`` column.
        daily_metrics: Iterable of metric file suffixes. It must contain at
            least one entry, because ``reduce`` has no initial value here.

    Returns:
        A DataFrame with ``Stock`` as the first column, ``time_stamp``
        converted with ``pd.to_datetime``, and the columns of every metric
        file. Timestamps missing from a file leave NaN in that file's columns.
    """
    def _outer_join_on_time(left, right):
        return pd.merge(left, right, on='time_stamp', how='outer')

    metric_frames = [
        pd.read_csv(f"{path + stock}_{metric}.csv.gz")
        for metric in daily_metrics
    ]
    combined = reduce(_outer_join_on_time, metric_frames)
    combined['time_stamp'] = pd.to_datetime(combined['time_stamp'])
    combined.insert(loc=0, column='Stock', value=stock)
    return combined
    

In [None]:
def add_cal_dates(calendar_codes, df):
    """Tag each row of ``df`` with the code of the calendar event on its date.

    For every key in ``calendar_codes`` the file
    ``<path><key>_CALDATES.csv.gz`` is read, where ``path`` is the
    module-level data directory. Rows whose ``time_stamp`` equals one of the
    file's ``time_stamp`` values get that key's code in ``CALENDAR_DAYS``.

    Args:
        calendar_codes: Mapping of calendar-event name to integer code.
        df: DataFrame with a ``time_stamp`` column. It is modified in place.

    Returns:
        The same ``df`` with a ``CALENDAR_DAYS`` column. The value is 0 where
        no event matches. When several events share a date, the event that
        comes last in ``calendar_codes`` wins.
    """
    df["CALENDAR_DAYS"] = 0
    # Parse once so both sides of the membership test are datetimes.
    row_dates = pd.to_datetime(df['time_stamp'])
    for day, code in calendar_codes.items():
        calendar_df = pd.read_csv(f"{path+day}_CALDATES.csv.gz")
        event_dates = pd.to_datetime(calendar_df['time_stamp'])
        # One vectorised mask per event file instead of a full-column
        # comparison for every individual calendar date.
        df.loc[row_dates.isin(event_dates), "CALENDAR_DAYS"] = code
    return df

In [None]:
def earnings_release(df, stock):
    """Add ``QUARTERLY_EARNINGS``: days until the upcoming earnings release.

    Release dates are read from ``<path><stock>_EARNINGS.csv.gz`` (``path``
    is the module-level data directory) and sorted ascending. The rows of
    ``df`` are walked in their existing order while a pointer tracks the
    current release date:

    * the stored value is the whole-day difference
      ``release_date - row_time``, clamped to the range ``[-4, 5]``;
    * when a row falls on the release date (difference 0), the pointer moves
      to the next release for the following rows;
    * after the last release the pointer stays on it, so later rows get
      negative values down to -4.

    NOTE(review): the walk appears to assume ``df['time_stamp']`` is in
    ascending order and that each release date occurs in ``df``. Neither is
    checked here.

    Args:
        df: DataFrame with a datetime ``time_stamp`` column. It is modified
            in place.
        stock: Symbol used as the file-name prefix.

    Returns:
        The same ``df`` with the ``QUARTERLY_EARNINGS`` column filled.

    Raises:
        IndexError: If the earnings file has no rows.
    """
    quarterly_earnings_df = pd.read_csv(f'{path+stock}_EARNINGS.csv.gz')
    quarterly_earnings_df['time_stamp'] = pd.to_datetime(quarterly_earnings_df['time_stamp'])
    quarterly_earnings_df = quarterly_earnings_df.sort_values(by='time_stamp', ascending=True)
    release_dates = quarterly_earnings_df['time_stamp']
    # Clamp to the real last row. The old `i == 23` was a comparison, not an
    # assignment, so the index was never clamped.
    last_release = len(release_dates) - 1

    df["QUARTERLY_EARNINGS"] = np.nan
    i = 0
    for time in list(df['time_stamp']):
        i = min(i, last_release)
        x = int((release_dates.iloc[i] - time).days)
        if x == 0:
            # Release day reached: later rows count down to the next release.
            i += 1
        x = max(-4, min(5, x))
        df.loc[df['time_stamp'] == time, "QUARTERLY_EARNINGS"] = x
    return df

In [None]:
def add_yearly_metrics_revised(df, yearly_metrics, stock, years):
    """Broadcast yearly report values onto the daily rows of the same year.

    For each metric the file ``<path><stock>_<metric>.csv.gz`` is read, where
    ``path`` is the module-level data directory. For every year in ``years``,
    the first report row dated in that year is copied onto each daily
    timestamp of that year. Per-metric frames are outer-joined on
    ``time_stamp`` and then inner-joined onto ``df``.

    Args:
        df: Daily DataFrame with a datetime ``time_stamp`` column.
        yearly_metrics: Iterable of metric file suffixes. It must contain at
            least one entry, because ``reduce`` has no initial value here.
        stock: Symbol used as the file-name prefix.
        years: Iterable of years. Each item must be accepted by ``int()``.

    Returns:
        A new DataFrame: ``df`` plus the report columns. Daily rows whose
        year is not in ``years`` are dropped by the inner join. Rows in a
        year without a report get NaN in that metric's columns.
    """
    # The date column is identical for every metric, so build it once.
    all_dates_df = pd.DataFrame({"time_stamp": df["time_stamp"]})
    daily_years = all_dates_df["time_stamp"].dt.year

    metric_dataframes = []
    for metric in yearly_metrics:
        print("Processing metric:", metric)
        file_path = f"{path}{stock}_{metric}.csv.gz"
        yearly_metric_df = (
            pd.read_csv(file_path)
            .drop(columns=['Unnamed: 0'], errors='ignore')
        )
        yearly_metric_df["time_stamp"] = pd.to_datetime(yearly_metric_df["time_stamp"])
        report_years = yearly_metric_df["time_stamp"].dt.year

        year_dfs = []
        for year in years:
            year_specific_df = all_dates_df.loc[daily_years == int(year)].copy()
            year_specific_data = yearly_metric_df[
                report_years == int(year)
            ].drop(columns=["time_stamp"], errors="ignore")

            # Skip the broadcast when there are no daily rows for the year:
            # assigning a scalar via .loc to a zero-row frame raises
            # ValueError in pandas.
            if not year_specific_data.empty and not year_specific_df.empty:
                for measurement in year_specific_data.columns:
                    # Only the first report of the year is used.
                    value = year_specific_data[measurement].values[0]
                    year_specific_df.loc[:, measurement] = value
            year_dfs.append(year_specific_df)
        metric_dataframes.append(pd.concat(year_dfs, ignore_index=False))

    all_metrics_df = reduce(
        lambda left, right: pd.merge(left, right, on="time_stamp", how="outer"),
        metric_dataframes
    )
    return pd.merge(df, all_metrics_df, on="time_stamp", how="inner")

In [None]:
def shares_outstanding(stock, df):
    """Add a ``Month-Year`` key column to ``df`` (work in progress).

    The shares file ``<path><stock>_SHARES_OUTSTANDING.csv.gz`` is loaded
    (``path`` is the module-level data directory), but its contents are not
    merged into ``df`` yet.

    Args:
        stock: Symbol used as the file-name prefix.
        df: DataFrame with a datetime ``time_stamp`` column. It is modified
            in place.

    Returns:
        The same ``df`` with a ``Month-Year`` column of the form
        ``"<month>-<year>"``, e.g. ``"1-2020"``.
    """
    # TODO: shares_df and month_year_df are prepared but not yet joined onto df.
    shares_df = pd.read_csv(f"{path+stock}_SHARES_OUTSTANDING.csv.gz")
    # Build the key element-wise. str() on a whole Series would return its
    # printed representation rather than one string per row.
    df["Month-Year"] = (
        df["time_stamp"].dt.month.astype(str)
        + "-"
        + df["time_stamp"].dt.year.astype(str)
    )
    month_year_df = pd.DataFrame({"Month-Year": df["Month-Year"]})
    return df
    
    

In [None]:
# Build one feature frame per stock: daily metrics, then calendar flags,
# earnings countdown and yearly fundamentals.
stock_df = []
for stock in stocks:
    df = combine_datasets_by_stock(stock, daily_metrics)
    df = add_cal_dates(calendar_codes, df)
    df = earnings_release(df, stock)
    df = add_yearly_metrics_revised(df, yearly_metrics, stock, years)
    stock_df.append(df)

# Stack all stocks and discard every row that has any missing value.
final_dataset = pd.concat(stock_df, axis=0, ignore_index=False).dropna()

# Encode the stock symbol as an integer label.
le = LabelEncoder()
final_dataset["Stock"] = le.fit_transform(final_dataset["Stock"])

# Convert timestamps to whole seconds since the Unix epoch. Subtracting the
# epoch and floor-dividing by one second is independent of the column's
# datetime resolution; `astype('int64') // 10**9` is only correct for
# nanosecond data.
final_dataset['time_stamp'] = (
    final_dataset['time_stamp']
    - pd.Timestamp(0, unit="s", tz=final_dataset['time_stamp'].dt.tz)
) // pd.Timedelta(seconds=1)

# The currency label columns are removed before export.
final_dataset = final_dataset.drop(columns=['BALANCE_SHEET: reportedCurrency', 'INCOME_STATEMENT: reportedCurrency', 'CASH_FLOW: reportedCurrency'])
final_dataset.to_csv("final_dataset3.csv")

# Preview only the first rows; a bare expression displays only when it is the
# last statement in the cell.
final_dataset.head()
