# In [None]:
import os
import pandas as pd
import yfinance as yf
from tqdm import tqdm

def augment_ticker_dataset(ticker, vix, sp500, nasdaq,
                           starting_date='2012-01-01',
                           ending_date='2020-01-01',
                           splitting_date='2018-01-01'):
    """Augment one ticker's price CSV with return labels and write train/test splits.

    Reads ``./data/{ticker}_2012-1-1_2020-1-1.csv`` (needs ``Date`` and
    ``Close`` columns), computes the daily percentage return of ``Close``,
    left-joins the ``Returns`` column of the three market frames on ``Date``
    and adds these label columns:

    * ``ES``  - class of the ticker's own daily return
    * ``MS``  - class of the S&P 500 daily return
    * ``IV``  - class of the VIX daily return
    * ``MF``  - class of the S&P 500 daily return
    * ``SF``  - class of the Nasdaq daily return
    * ``MFCST`` / ``EFCST`` - class of the *following* month's mean daily
      return for the S&P 500 / the ticker, repeated on every row of the month

    NOTE(review): ``MF`` is computed from the same column as ``MS`` and is
    therefore identical to it - confirm whether another source was intended.

    Args:
        ticker: Ticker symbol used to build the input and output file names.
        vix: DataFrame indexed by date with a ``Returns`` column.
        sp500: DataFrame indexed by date with a ``Returns`` column.
        nasdaq: DataFrame indexed by date with a ``Returns`` column.
        starting_date: Inclusive lower bound of the training split.
        ending_date: Inclusive upper bound of the test split.
        splitting_date: First date of the test split (exclusive upper bound
            of the training split).

    Returns:
        Tuple ``(df, df_train, df_test)``: the full augmented frame and its
        two date-filtered splits. The splits are also written to
        ``./data/{ticker}_aug_{start}_{split}.csv`` and
        ``./data/{ticker}_aug_{split}_{end}.csv``.
    """
    def classify_returns(returns):
        """Map a percentage return to a class in {-2, -1, 0, 1, 2}."""
        if returns > 2.5:
            return 2  # VERY HIGH
        elif returns > 0.5:
            return 1  # HIGH
        elif returns < -2.5:
            return -2  # VERY LOW
        elif returns < -0.5:
            return -1  # LOW
        else:
            return 0  # NEUTRAL (also reached for NaN: every comparison is False)

    def get_forecasts(returns, date_series):
        """Return one label per row: the class of the next month's mean return."""
        # Computed once; the mask comparisons below all reuse it.
        periods = date_series.dt.to_period('M')
        months = periods.unique()
        if len(months) == 0:
            return []

        label_by_month = {
            current: classify_returns(returns[periods == upcoming].mean())
            for current, upcoming in zip(months[:-1], months[1:])
        }
        # The last month has no successor to look at; it keeps the -2
        # placeholder. NOTE(review): -2 is also the "VERY LOW" class, so the
        # placeholder is indistinguishable from a real label - confirm intent.
        label_by_month[months[-1]] = -2

        # Label each row through its own month so the result stays aligned
        # with the rows regardless of how they are ordered.
        return [label_by_month[month] for month in periods]

    def date_tag(timestamp):
        """Format a timestamp without zero padding, e.g. ``2012-1-1``."""
        return f'{timestamp.year}-{timestamp.month}-{timestamp.day}'

    file_path = f'./data/{ticker}_2012-1-1_2020-1-1.csv'
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'])
    # pct_change and the "next month" lookup both assume chronological rows.
    df = df.sort_values('Date', kind='stable').reset_index(drop=True)

    df['Returns'] = df['Close'].pct_change() * 100

    # The empty left suffix keeps the ticker's own column named 'Returns'.
    for market, suffix in ((vix, '_VIX'), (sp500, '_SP500'), (nasdaq, '_Nasdaq')):
        df = df.merge(market[['Returns']], how='left', left_on='Date',
                      right_index=True, suffixes=('', suffix))

    # The first row's return and dates missing from a market frame are NaN.
    df = df.fillna(0)

    label_sources = {
        'ES': 'Returns',
        'MS': 'Returns_SP500',
        'IV': 'Returns_VIX',
        'MF': 'Returns_SP500',
        'SF': 'Returns_Nasdaq',
    }
    for label, source in label_sources.items():
        df[label] = df[source].apply(classify_returns)

    df['MFCST'] = get_forecasts(df['Returns_SP500'], df['Date'])
    df['EFCST'] = get_forecasts(df['Returns'], df['Date'])

    starting_date = pd.to_datetime(starting_date)
    ending_date = pd.to_datetime(ending_date)
    splitting_date = pd.to_datetime(splitting_date)
    df_train = df[(df['Date'] >= starting_date) & (df['Date'] < splitting_date)]
    df_test = df[(df['Date'] >= splitting_date) & (df['Date'] <= ending_date)]

    # File names follow the requested dates; with the default arguments they
    # are '..._aug_2012-1-1_2018-1-1.csv' and '..._aug_2018-1-1_2020-1-1.csv'.
    train_path = f'./data/{ticker}_aug_{date_tag(starting_date)}_{date_tag(splitting_date)}.csv'
    test_path = f'./data/{ticker}_aug_{date_tag(splitting_date)}_{date_tag(ending_date)}.csv'

    df_train.to_csv(train_path, index=False)
    df_test.to_csv(test_path, index=False)

    return df, df_train, df_test

def download_market_data(start='2012-01-01', end='2020-01-01'):
    """Download VIX, S&P 500 and Nasdaq history and add daily percentage returns.

    Args:
        start: Start date passed to ``yf.download``.
        end: End date passed to ``yf.download``.

    Returns:
        Tuple ``(vix, sp500, nasdaq)`` of DataFrames as returned by
        ``yf.download`` for ``^VIX``, ``^GSPC`` and ``^IXIC``, each with an
        added ``Returns`` column (percentage change of the adjusted close).
    """
    print("Downloading VIX, SP500, and NASDAQ data...")

    def fetch_with_returns(symbol):
        """Download one symbol and append its ``Returns`` column."""
        # Request unadjusted columns explicitly: recent yfinance releases
        # default to auto_adjust=True, which omits the 'Adj Close' column.
        frame = yf.download(symbol, start=start, end=end, auto_adjust=False)

        # Recent yfinance releases may return (field, ticker) MultiIndex
        # columns even for a single symbol; keep only the field level so that
        # frame['Returns'] stays a plain column for downstream merges.
        if isinstance(frame.columns, pd.MultiIndex):
            frame.columns = frame.columns.get_level_values(0)

        # Fall back to 'Close' if the adjusted column is still unavailable.
        price_column = 'Adj Close' if 'Adj Close' in frame.columns else 'Close'
        frame['Returns'] = frame[price_column].pct_change() * 100
        return frame

    vix = fetch_with_returns('^VIX')
    sp500 = fetch_with_returns('^GSPC')
    nasdaq = fetch_with_returns('^IXIC')

    return vix, sp500, nasdaq

def process_files_in_directory(directory):
    """Augment every raw ticker CSV found in ``directory``.

    A raw file is one named ``{ticker}_2012-1-1_2020-1-1.csv``; that is the
    only name ``augment_ticker_dataset`` opens, so other CSVs (including
    already augmented ``_aug_`` files) are skipped instead of aborting the
    run. Market data is downloaded once and shared by all tickers.

    NOTE: ``augment_ticker_dataset`` reads from and writes to ``./data/``
    regardless of ``directory``; ``directory`` only controls which file names
    are discovered.

    Args:
        directory: Directory to scan for raw ticker CSV files.
    """
    raw_suffix = '_2012-1-1_2020-1-1.csv'
    # Sorted so the processing order does not depend on the filesystem.
    raw_files = sorted(
        name for name in os.listdir(directory)
        if name.endswith(raw_suffix) and '_aug_' not in name
    )
    if not raw_files:
        # Nothing to process: skip the market-data download entirely.
        return

    vix, sp500, nasdaq = download_market_data()
    for filename in tqdm(raw_files, desc=f"Walking Dir {directory}"):
        # Strip the known suffix rather than splitting on '_' so tickers that
        # themselves contain an underscore are kept intact.
        ticker = filename[:-len(raw_suffix)]
        # tqdm.write prints the message without breaking the progress bar.
        tqdm.write(f'Processing file: {filename}')
        augment_ticker_dataset(ticker, vix, sp500, nasdaq)


# Script entry point: run the augmentation over the CSV files found in ./data/.
process_files_in_directory('./data/')