In [1]:
import os
import pandas as pd
import yfinance as yf
from tqdm import tqdm

def augment_ticker_dataset(ticker, vix, sp500, nasdaq,
                           starting_date='2012-01-01',
                           ending_date='2020-01-01',
                           splitting_date='2018-01-01'):
    def classify_returns(returns):
        if returns > 2.5:
            return 2  # VERY HIGH
        elif returns > 0.5:
            return 1  # HIGH
        elif returns < -2.5:
            return -2  # VERY LOW
        elif returns < -0.5:
            return -1  # LOW
        else:
            return 0  # NEUTRAL

    def get_forecasts(returns, date_series):
        weekly_forecast = []
        weeks = date_series.dt.to_period('W').unique()  # Get unique weeks

        for i in range(len(weeks) - 1):  # Iterate over weeks, but stop before the last
            current_week = weeks[i]
            next_two_weeks = weeks[i + 1:i + 3]  # Look at the next two weeks

            # Calculate the average return for the next two weeks
            next_two_weeks_return = returns[date_series.dt.to_period('W').isin(next_two_weeks)].mean()
            classification = classify_returns(next_two_weeks_return)

            # Assign this classification to the whole current week
            weekly_forecast.extend([classification] * sum(date_series.dt.to_period('W') == current_week))

        # For the last week, we can't look ahead, so set it to NEUTRAL or any fallback value
        weekly_forecast.extend([0] * sum(date_series.dt.to_period('W') == weeks[-1]))

        return weekly_forecast

    file_path = f'./data/{ticker}_2012-1-1_2020-1-1.csv'
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'])

    df['Returns'] = df['Close'].pct_change() * 100

    df = df.merge(vix[['Returns']], how='left', left_on='Date', right_index=True, suffixes=('', '_VIX'))
    df = df.merge(sp500[['Returns']], how='left', left_on='Date', right_index=True, suffixes=('', '_SP500'))
    df = df.merge(nasdaq[['Returns']], how='left', left_on='Date', right_index=True, suffixes=('', '_Nasdaq'))

    df.fillna(0, inplace=True)
    df['ES'] = df['Returns'].apply(classify_returns)
    df['MS'] = df['Returns_SP500'].apply(classify_returns)
    df['IV'] = df['Returns_VIX'].apply(classify_returns)
    df['MF'] = df['Returns_SP500'].apply(classify_returns)
    df['SF'] = df['Returns_Nasdaq'].apply(classify_returns)

    # Update to look at the next two weeks for forecast
    df['MFCST'] = get_forecasts(df['Returns_SP500'], df['Date'])
    df['EFCST'] = get_forecasts(df['Returns'], df['Date'])

    starting_date = pd.to_datetime(starting_date)
    ending_date = pd.to_datetime(ending_date)
    splitting_date = pd.to_datetime(splitting_date)
    df_train = df[(df['Date'] >= starting_date) & (df['Date'] < splitting_date)]
    df_test = df[(df['Date'] >= splitting_date) & (df['Date'] <= ending_date)]

    train_path = f'./data/{ticker}_aug_2012-1-1_2018-1-1.csv'
    test_path = f'./data/{ticker}_aug_2018-1-1_2020-1-1.csv'

    df_train.to_csv(train_path, index=False)
    df_test.to_csv(test_path, index=False)

    return df, df_train, df_test

def download_market_data():
    print("Downloading VIX, SP500, and NASDAQ data...")
    vix = yf.download('^VIX', start='2012-01-01', end='2020-01-01')
    sp500 = yf.download('^GSPC', start='2012-01-01', end='2020-01-01')
    nasdaq = yf.download('^IXIC', start='2012-01-01', end='2020-01-01')

    vix['Returns'] = vix['Adj Close'].pct_change() * 100
    sp500['Returns'] = sp500['Adj Close'].pct_change() * 100
    nasdaq['Returns'] = nasdaq['Adj Close'].pct_change() * 100

    return vix, sp500, nasdaq

def process_files_in_directory(directory):
    vix, sp500, nasdaq = download_market_data()
    for filename in tqdm(os.listdir(directory), desc=f"Walking Dir {directory}"):
        if filename.endswith('.csv') and '_aug_' not in filename:  # Skip already augmented files
            ticker = filename.split('_')[0]  # Extract ticker from filename
            print(f'Processing file: {filename}')
            augment_ticker_dataset(ticker, vix, sp500, nasdaq)

process_files_in_directory('./data/')


Downloading VIX, SP500, and NASDAQ data...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
Walking Dir ./data/:   0%|          | 0/126 [00:00<?, ?it/s]

Processing file: XOM_2012-1-1_2018-1-1.csv


Walking Dir ./data/:   2%|▏         | 2/126 [00:00<00:42,  2.91it/s]

Processing file: SIE.DE_2012-1-1_2020-1-1.csv


Walking Dir ./data/:   3%|▎         | 4/126 [00:01<00:39,  3.10it/s]

Processing file: 6758.T_2012-1-1_2020-1-1.csv


Walking Dir ./data/:   4%|▍         | 5/126 [00:01<00:49,  2.43it/s]

Processing file: JPM_2012-1-1_2020-1-1.csv


Walking Dir ./data/:   5%|▍         | 6/126 [00:02<00:56,  2.12it/s]

Processing file: AMZN_2018-1-1_2020-1-1.csv


Walking Dir ./data/:   6%|▋         | 8/126 [00:03<00:47,  2.47it/s]

Processing file: 7203.T_2012-1-1_2020-1-1.csv


Walking Dir ./data/:   7%|▋         | 9/126 [00:03<00:54,  2.14it/s]

Processing file: HSBC_2018-1-1_2020-1-1.csv


Walking Dir ./data/:   8%|▊         | 10/126 [00:04<00:59,  1.94it/s]

Processing file: BIDU_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  10%|▉         | 12/126 [00:05<00:49,  2.29it/s]

Processing file: TSLA_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  10%|█         | 13/126 [00:05<00:54,  2.07it/s]

Processing file: MSFT_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  11%|█         | 14/126 [00:06<00:58,  1.90it/s]

Processing file: EZU_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  13%|█▎        | 16/126 [00:07<00:48,  2.26it/s]

Processing file: 0700.HK_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  13%|█▎        | 17/126 [00:07<00:52,  2.07it/s]

Processing file: 0939.HK_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  14%|█▍        | 18/126 [00:08<00:56,  1.92it/s]

Processing file: 6758.T_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  15%|█▌        | 19/126 [00:08<00:58,  1.82it/s]

Processing file: 6758.T_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  16%|█▌        | 20/126 [00:09<00:59,  1.77it/s]

Processing file: 2503.T_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  19%|█▉        | 24/126 [00:10<00:32,  3.16it/s]

Processing file: TSLA_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  20%|█▉        | 25/126 [00:10<00:38,  2.63it/s]

Processing file: SPY_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  21%|██        | 26/126 [00:11<00:44,  2.27it/s]

Processing file: MSFT_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  22%|██▏       | 28/126 [00:12<00:39,  2.51it/s]

Processing file: 7203.T_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  24%|██▍       | 30/126 [00:12<00:35,  2.72it/s]

Processing file: GOOGL_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  25%|██▌       | 32/126 [00:13<00:32,  2.86it/s]

Processing file: 2503.T_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  26%|██▌       | 33/126 [00:14<00:37,  2.46it/s]

Processing file: GOOGL_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  27%|██▋       | 34/126 [00:14<00:41,  2.22it/s]

Processing file: EZU_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  28%|██▊       | 35/126 [00:15<00:44,  2.04it/s]

Processing file: AAPL_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  32%|███▏      | 40/126 [00:16<00:22,  3.77it/s]

Processing file: HSBC_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  33%|███▎      | 41/126 [00:16<00:27,  3.08it/s]

Processing file: QQQ_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  35%|███▍      | 44/126 [00:17<00:22,  3.59it/s]

Processing file: AMZN_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  38%|███▊      | 48/126 [00:17<00:18,  4.28it/s]

Processing file: KO_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  40%|███▉      | 50/126 [00:18<00:19,  3.94it/s]

Processing file: QQQ_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  40%|████      | 51/126 [00:19<00:23,  3.24it/s]

Processing file: NOK_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  41%|████▏     | 52/126 [00:19<00:27,  2.74it/s]

Processing file: PHIA.AS_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  43%|████▎     | 54/126 [00:20<00:25,  2.85it/s]

Processing file: AAPL_2016-1-1_2018-1-1.csv


Walking Dir ./data/:  44%|████▍     | 56/126 [00:21<00:23,  2.93it/s]

Processing file: PHIA.AS_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  45%|████▌     | 57/126 [00:21<00:27,  2.51it/s]

Processing file: SIE.DE_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  47%|████▋     | 59/126 [00:22<00:24,  2.76it/s]

Processing file: ABI.BR_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  48%|████▊     | 61/126 [00:23<00:22,  2.88it/s]

Processing file: EWJ_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  49%|████▉     | 62/126 [00:23<00:25,  2.49it/s]

Processing file: ABI.BR_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  50%|█████     | 63/126 [00:24<00:28,  2.21it/s]

Processing file: AAPL_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  52%|█████▏    | 65/126 [00:24<00:24,  2.51it/s]

Processing file: BABA_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  52%|█████▏    | 66/126 [00:25<00:23,  2.55it/s]

Processing file: EZU_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  53%|█████▎    | 67/126 [00:25<00:25,  2.27it/s]

Processing file: 0939.HK_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  56%|█████▌    | 70/126 [00:26<00:17,  3.13it/s]

Processing file: 2503.T_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  56%|█████▋    | 71/126 [00:27<00:20,  2.64it/s]

Processing file: VOW3.DE_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  57%|█████▋    | 72/126 [00:27<00:23,  2.32it/s]

Processing file: TSLA_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  58%|█████▊    | 73/126 [00:28<00:25,  2.10it/s]

Processing file: BABA_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  60%|██████    | 76/126 [00:28<00:15,  3.29it/s]

Processing file: BIDU_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  61%|██████    | 77/126 [00:29<00:17,  2.73it/s]

Processing file: QQQ_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  63%|██████▎   | 80/126 [00:30<00:13,  3.32it/s]

Processing file: PHIA.AS_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  67%|██████▋   | 84/126 [00:30<00:09,  4.22it/s]

Processing file: NOK_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  67%|██████▋   | 85/126 [00:31<00:11,  3.47it/s]

Processing file: SPY_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  68%|██████▊   | 86/126 [00:31<00:13,  2.91it/s]

Processing file: AMZN_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  69%|██████▉   | 87/126 [00:32<00:15,  2.53it/s]

Processing file: VOW3.DE_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  70%|██████▉   | 88/126 [00:33<00:16,  2.26it/s]

Processing file: AAPL_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  71%|███████   | 89/126 [00:33<00:17,  2.08it/s]

Processing file: 0700.HK_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  74%|███████▍  | 93/126 [00:34<00:09,  3.40it/s]

Processing file: 0939.HK_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  75%|███████▍  | 94/126 [00:34<00:11,  2.89it/s]

Processing file: VOW3.DE_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  75%|███████▌  | 95/126 [00:35<00:12,  2.49it/s]

Processing file: HSBC_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  79%|███████▊  | 99/126 [00:36<00:07,  3.68it/s]

Processing file: JPM_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  80%|████████  | 101/126 [00:36<00:07,  3.49it/s]

Processing file: JPM_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  82%|████████▏ | 103/126 [00:37<00:06,  3.37it/s]

Processing file: SPY_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  83%|████████▎ | 104/126 [00:38<00:07,  2.82it/s]

Processing file: 0700.HK_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  85%|████████▍ | 107/126 [00:38<00:05,  3.36it/s]

Processing file: KO_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  86%|████████▌ | 108/126 [00:39<00:06,  2.87it/s]

Processing file: XOM_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  87%|████████▋ | 109/126 [00:40<00:06,  2.52it/s]

Processing file: KO_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  87%|████████▋ | 110/126 [00:40<00:07,  2.21it/s]

Processing file: XOM_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  90%|████████▉ | 113/126 [00:41<00:04,  3.03it/s]

Processing file: SIE.DE_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  91%|█████████▏| 115/126 [00:41<00:03,  3.04it/s]

Processing file: BABA_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  93%|█████████▎| 117/126 [00:42<00:02,  3.55it/s]

Processing file: ABI.BR_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  94%|█████████▎| 118/126 [00:42<00:02,  2.86it/s]

Processing file: GOOGL_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  94%|█████████▍| 119/126 [00:43<00:02,  2.37it/s]

Processing file: NOK_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  95%|█████████▌| 120/126 [00:44<00:02,  2.09it/s]

Processing file: MSFT_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  96%|█████████▌| 121/126 [00:44<00:02,  1.92it/s]

Processing file: EWJ_2012-1-1_2020-1-1.csv


Walking Dir ./data/:  97%|█████████▋| 122/126 [00:45<00:02,  1.81it/s]

Processing file: 7203.T_2018-1-1_2020-1-1.csv


Walking Dir ./data/:  98%|█████████▊| 123/126 [00:46<00:01,  1.74it/s]

Processing file: BIDU_2012-1-1_2018-1-1.csv


Walking Dir ./data/:  99%|█████████▉| 125/126 [00:46<00:00,  2.18it/s]

Processing file: EWJ_2012-1-1_2018-1-1.csv


Walking Dir ./data/: 100%|██████████| 126/126 [00:47<00:00,  2.65it/s]
