In [1]:
import yfinance as yf
import pandas as pd
import numpy as np

TICKER = 'TSLA'

file_path = f'./data/{TICKER}_2012-1-1_2020-1-1.csv'
df = pd.read_csv(file_path)
df['Date'] = pd.to_datetime(df['Date'])

vix = yf.download('^VIX', start=df['Date'].min(), end=df['Date'].max())
sp500 = yf.download('^GSPC', start=df['Date'].min(), end=df['Date'].max())
nasdaq = yf.download('^IXIC', start=df['Date'].min(), end=df['Date'].max())
df['Returns'] = df['Close'].pct_change() * 100
vix['Returns'] = vix['Adj Close'].pct_change() * 100
sp500['Returns'] = sp500['Adj Close'].pct_change() * 100
nasdaq['Returns'] = nasdaq['Adj Close'].pct_change() * 100

df = df.merge(vix[['Returns']], how='left', left_on='Date', right_index=True, suffixes=('', '_VIX'))
df = df.merge(sp500[['Returns']], how='left', left_on='Date', right_index=True, suffixes=('', '_SP500'))
df = df.merge(nasdaq[['Returns']], how='left', left_on='Date', right_index=True, suffixes=('', '_Nasdaq'))

df.fillna(0, inplace=True)
df.tail(3)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Date,Open,High,Low,Close,Volume,Returns,Returns_VIX,Returns_SP500,Returns_Nasdaq
2009,2019-12-27,29.0,29.020666,28.407333,28.691999,149185500,-0.129952,6.166014,0.003398,-0.174782
2010,2019-12-30,28.586,28.6,27.284,27.646667,188796000,-3.643287,10.349958,-0.578082,-0.67317
2011,2019-12-31,27.0,28.086,26.805332,27.888666,154285500,0.875327,0.0,0.0,0.0


In [2]:

# Define a function to classify returns
def classify_returns(returns):
    if returns > 2.5:
        return 2  # VERY HIGH
    elif returns > 0.5:
        return 1  # HIGH
    elif returns < -2.5:
        return -2  # VERY LOW
    elif returns < -0.5:
        return -1  # LOW
    else:
        return 0  # NEUTRAL

# Create the ES, MS, IV, MF, and SF features based on TSLA, VIX, S&P 500, and Nasdaq returns
df['ES'] = df['Returns'].apply(classify_returns)
df['MS'] = df['Returns_SP500'].apply(classify_returns)
df['IV'] = df['Returns_VIX'].apply(classify_returns)
df['MF'] = df['Returns_SP500'].apply(classify_returns)
df['SF'] = df['Returns_Nasdaq'].apply(classify_returns)

df.tail(3)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Returns,Returns_VIX,Returns_SP500,Returns_Nasdaq,ES,MS,IV,MF,SF
2009,2019-12-27,29.0,29.020666,28.407333,28.691999,149185500,-0.129952,6.166014,0.003398,-0.174782,0,0,2,0,0
2010,2019-12-30,28.586,28.6,27.284,27.646667,188796000,-3.643287,10.349958,-0.578082,-0.67317,-2,-1,2,-1,-1
2011,2019-12-31,27.0,28.086,26.805332,27.888666,154285500,0.875327,0.0,0.0,0.0,1,0,0,0,0


In [3]:

def get_forecasts(returns, date_series):
    monthly_forecast = []
    months = date_series.dt.to_period('M').unique()

    for i in range(len(months) - 1):
        current_month = months[i]
        next_month = months[i + 1]
        next_month_return = returns[date_series.dt.to_period('M') == next_month].mean()
        classification = classify_returns(next_month_return)
        monthly_forecast.extend([classification] * sum(date_series.dt.to_period('M') == current_month))

    # For the last month of 2029 it was COVID
    monthly_forecast.extend([-2] * sum(date_series.dt.to_period('M') == months[-1]))

    return monthly_forecast

df['MFCST'] = get_forecasts(df['Returns_SP500'], df['Date'])
df['EFCST'] = get_forecasts(df['Returns'], df['Date'])
df.tail(3)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Returns,Returns_VIX,Returns_SP500,Returns_Nasdaq,ES,MS,IV,MF,SF,MFCST,EFCST
2009,2019-12-27,29.0,29.020666,28.407333,28.691999,149185500,-0.129952,6.166014,0.003398,-0.174782,0,0,2,0,0,-2,-2
2010,2019-12-30,28.586,28.6,27.284,27.646667,188796000,-3.643287,10.349958,-0.578082,-0.67317,-2,-1,2,-1,-1,-2,-2
2011,2019-12-31,27.0,28.086,26.805332,27.888666,154285500,0.875327,0.0,0.0,0.0,1,0,0,0,0,-2,-2


In [4]:
starting_date = '2012-01-01'
ending_date = '2020-01-01'
splitting_date = '2018-01-01'

starting_date = pd.to_datetime(starting_date)
ending_date = pd.to_datetime(ending_date)
splitting_date = pd.to_datetime(splitting_date)

df_train = df[(df['Date'] >= starting_date) & (df['Date'] < splitting_date)]
df_test = df[(df['Date'] >= splitting_date) & (df['Date'] <= ending_date)]

train_path = f'./data/{TICKER}_aug_2012-1-1_2018-1-1.csv'
test_path = f'./data/{TICKER}_aug_2018-1-1_2020-1-1.csv'

df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)

train_path, test_path


('./data/TSLA_aug_2012-1-1_2018-1-1.csv',
 './data/TSLA_aug_2018-1-1_2020-1-1.csv')