In [14]:
import pandas as pd
import numpy as np
import datetime
import yfinance as yf

from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl import config_tickers
from finrl.config import INDICATORS
from finrl.meta.data_processor import DataProcessor

import itertools

In [7]:
# Date boundaries (ISO 'YYYY-MM-DD' strings) for the two data windows.
# The training window ends on the same date the trading window starts.
# NOTE(review): whether the end date is inclusive depends on data_split — confirm.
TRAIN_START_DATE, TRAIN_END_DATE = '2010-01-01', '2023-01-01'
TRADE_START_DATE, TRADE_END_DATE = '2023-01-01', '2025-01-01'

In [8]:
# Ticker symbols that make up the portfolio; order is preserved as written.
portfolio_symbols = ['AAPL', 'MSFT', 'GOOGL', 'AMZN']

In [9]:
# Download price data for every portfolio ticker over the full span,
# from the start of training through the end of the trading window.
df_raw_data = YahooDownloader(
    start_date=TRAIN_START_DATE,
    end_date=TRADE_END_DATE,
    ticker_list=portfolio_symbols,
).fetch_data()
    




[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Shape of DataFrame:  (15096, 8)





In [11]:
# Preview the first rows of the downloaded frame to sanity-check its columns.
df_raw_data.head()

Price,date,close,high,low,open,volume,tic,day
0,2010-01-04,6.424606,6.439316,6.375674,6.407195,493729600,AAPL,0
1,2010-01-04,6.695,6.8305,6.657,6.8125,151998000,AMZN,0
2,2010-01-04,15.576999,15.645595,15.514617,15.581969,78169752,GOOGL,0
3,2010-01-04,23.17334,23.28565,22.903795,22.926257,38409100,MSFT,0
4,2010-01-05,6.435713,6.472038,6.40179,6.442318,601904800,AAPL,1


In [15]:
# Feature-engineering switches: technical indicators from INDICATORS, plus the
# VIX and turbulence columns; no user-defined features.
fe_options = dict(
    use_technical_indicator=True,
    tech_indicator_list=INDICATORS,
    use_vix=True,
    use_turbulence=True,
    user_defined_feature=False,
)
fe = FeatureEngineer(**fe_options)

# Run the configured preprocessing over the raw download.
processed_data = fe.preprocess_data(df_raw_data)

Successfully added technical indicators


[*********************100%***********************]  1 of 1 completed


Shape of DataFrame:  (3773, 8)
Successfully added vix
Successfully added turbulence index


In [16]:
# Display the feature-engineered frame (raw columns plus indicator, vix and turbulence columns).
processed_data

Unnamed: 0,date,close,high,low,open,volume,tic,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
0,2010-01-04,6.424606,6.439316,6.375674,6.407195,493729600,AAPL,0,0.000000,6.445867,6.414452,100.000000,66.666667,100.000000,6.424606,6.424606,20.040001,0.000000
1,2010-01-04,6.695000,6.830500,6.657000,6.812500,151998000,AMZN,0,0.000000,6.445867,6.414452,100.000000,66.666667,100.000000,6.695000,6.695000,20.040001,0.000000
2,2010-01-04,15.576999,15.645595,15.514617,15.581969,78169752,GOOGL,0,0.000000,6.445867,6.414452,100.000000,66.666667,100.000000,15.576999,15.576999,20.040001,0.000000
3,2010-01-04,23.173340,23.285650,22.903795,22.926257,38409100,MSFT,0,0.000000,6.445867,6.414452,100.000000,66.666667,100.000000,23.173340,23.173340,20.040001,0.000000
4,2010-01-05,6.435713,6.472038,6.401790,6.442318,601904800,AAPL,1,0.000249,6.445867,6.414452,100.000000,66.666667,100.000000,6.430160,6.430160,19.350000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15087,2024-12-27,428.177185,432.841557,424.020036,432.224950,18117700,MSFT,4,2.644090,453.747481,421.911866,50.229915,-12.423805,13.431682,430.670283,423.459639,15.950000,2.255864
15088,2024-12-30,251.307877,252.603281,249.863009,251.337769,35557500,AAPL,0,5.655752,259.448579,237.323919,61.278454,73.552558,24.807705,242.281253,234.964253,17.400000,1.586247
15089,2024-12-30,221.300003,223.000000,218.429993,220.059998,28321200,AMZN,0,4.530765,235.668622,213.004378,56.705402,25.019739,7.839666,217.232333,205.369667,17.400000,1.586247
15090,2024-12-30,190.618561,191.924301,188.505440,189.183238,14264700,GOOGL,0,5.354665,204.980548,167.008720,58.499255,62.932870,14.904822,180.720600,174.452619,17.400000,1.586247


In [17]:
# Enumerate every (calendar date, ticker) pair between the first and last
# date present in processed_data.
list_ticker = processed_data["tic"].unique().tolist()
list_date = list(
    pd.date_range(
        processed_data["date"].min(),
        processed_data["date"].max(),
    ).astype(str)
)
combination = list(itertools.product(list_date, list_ticker))

# Left-join the features onto the full grid, keep only dates that actually
# occur in processed_data, order by date then ticker, and replace any
# remaining missing values with 0.
final_processed_data = (
    pd.DataFrame(combination, columns=["date", "tic"])
    .merge(processed_data, on=["date", "tic"], how="left")
    .loc[lambda grid: grid["date"].isin(processed_data["date"])]
    .sort_values(["date", "tic"])
    .fillna(0)
)

In [18]:
# Display the gap-filled frame, sorted by date and ticker.
final_processed_data

Unnamed: 0,date,tic,close,high,low,open,volume,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
0,2010-01-04,AAPL,6.424606,6.439316,6.375674,6.407195,493729600.0,0.0,0.000000,6.445867,6.414452,100.000000,66.666667,100.000000,6.424606,6.424606,20.040001,0.000000
1,2010-01-04,AMZN,6.695000,6.830500,6.657000,6.812500,151998000.0,0.0,0.000000,6.445867,6.414452,100.000000,66.666667,100.000000,6.695000,6.695000,20.040001,0.000000
2,2010-01-04,GOOGL,15.576999,15.645595,15.514617,15.581969,78169752.0,0.0,0.000000,6.445867,6.414452,100.000000,66.666667,100.000000,15.576999,15.576999,20.040001,0.000000
3,2010-01-04,MSFT,23.173340,23.285650,22.903795,22.926257,38409100.0,0.0,0.000000,6.445867,6.414452,100.000000,66.666667,100.000000,23.173340,23.173340,20.040001,0.000000
4,2010-01-05,AAPL,6.435713,6.472038,6.401790,6.442318,601904800.0,1.0,0.000249,6.445867,6.414452,100.000000,66.666667,100.000000,6.430160,6.430160,19.350000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21887,2024-12-27,MSFT,428.177185,432.841557,424.020036,432.224950,18117700.0,4.0,2.644090,453.747481,421.911866,50.229915,-12.423805,13.431682,430.670283,423.459639,15.950000,2.255864
21896,2024-12-30,AAPL,251.307877,252.603281,249.863009,251.337769,35557500.0,0.0,5.655752,259.448579,237.323919,61.278454,73.552558,24.807705,242.281253,234.964253,17.400000,1.586247
21897,2024-12-30,AMZN,221.300003,223.000000,218.429993,220.059998,28321200.0,0.0,4.530765,235.668622,213.004378,56.705402,25.019739,7.839666,217.232333,205.369667,17.400000,1.586247
21898,2024-12-30,GOOGL,190.618561,191.924301,188.505440,189.183238,14264700.0,0.0,5.354665,204.980548,167.008720,58.499255,62.932870,14.904822,180.720600,174.452619,17.400000,1.586247


In [19]:
# Cut the processed frame into the training and trading windows.
train = data_split(
    final_processed_data,
    TRAIN_START_DATE,
    TRAIN_END_DATE,
)
trade = data_split(
    final_processed_data,
    TRADE_START_DATE,
    TRADE_END_DATE,
)

# Report the row count of each split, one per line.
print(len(train), len(trade), sep="\n")

13088
2004


In [25]:
# Display the training split for inspection.
train

Unnamed: 0,date,tic,close,high,low,open,volume,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
0,2010-01-04,AAPL,6.424606,6.439316,6.375674,6.407195,493729600.0,0.0,0.000000,6.445867,6.414452,100.000000,66.666667,100.000000,6.424606,6.424606,20.040001,0.000000
0,2010-01-04,AMZN,6.695000,6.830500,6.657000,6.812500,151998000.0,0.0,0.000000,6.445867,6.414452,100.000000,66.666667,100.000000,6.695000,6.695000,20.040001,0.000000
0,2010-01-04,GOOGL,15.576999,15.645595,15.514617,15.581969,78169752.0,0.0,0.000000,6.445867,6.414452,100.000000,66.666667,100.000000,15.576999,15.576999,20.040001,0.000000
0,2010-01-04,MSFT,23.173340,23.285650,22.903795,22.926257,38409100.0,0.0,0.000000,6.445867,6.414452,100.000000,66.666667,100.000000,23.173340,23.173340,20.040001,0.000000
1,2010-01-05,AAPL,6.435713,6.472038,6.401790,6.442318,601904800.0,1.0,0.000249,6.445867,6.414452,100.000000,66.666667,100.000000,6.430160,6.430160,19.350000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3270,2022-12-29,MSFT,235.822784,236.713202,230.578146,230.578146,19770700.0,3.0,-1.317345,253.969008,227.311662,48.757455,-79.801944,5.313316,240.140495,234.343710,21.440001,1.905451
3271,2022-12-30,AAPL,128.123062,128.142788,125.657837,126.624212,77034200.0,4.0,-4.343328,148.692628,122.196968,41.597135,-121.278276,29.024002,138.980270,140.738595,21.670000,0.309391
3271,2022-12-30,AMZN,84.000000,84.050003,82.470001,83.120003,62401200.0,4.0,-3.347538,94.472016,80.645983,39.422059,-115.829424,25.711482,89.725999,97.755667,21.670000,0.309391
3271,2022-12-30,GOOGL,87.625641,87.695161,85.977008,86.384203,23986300.0,4.0,-2.270884,99.523770,82.980479,42.973902,-104.403756,16.912089,93.197540,94.108424,21.670000,0.309391


In [27]:
# Persist both splits as CSV files in the current working directory.
# DataFrame.to_csv opens, writes and closes the target path itself, so no
# surrounding open() is needed; wrapping it in open('/train_data', 'w') would
# only create or truncate an unrelated empty file at the filesystem root
# (or fail with PermissionError) without affecting the CSV output.
# index=False leaves the DataFrame index out of the files.
train.to_csv('train_data.csv', index=False)
trade.to_csv('test_data.csv', index=False)