In [None]:
# Install ta-lib library for Technical indicators
!curl -L http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz -O && tar xzvf ta-lib-0.4.0-src.tar.gz
!cd ta-lib && ./configure --prefix=/usr && make && make install && cd - && pip install ta-lib

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   380  100   380    0     0   2332      0 --:--:-- --:--:-- --:--:--  2345
100   359  100   359    0     0   1145      0 --:--:-- --:--:-- --:--:--  1145
100 1299k  100 1299k    0     0  1655k      0 --:--:-- --:--:-- --:--:-- 3199k
ta-lib/
ta-lib/config.sub
ta-lib/aclocal.m4
ta-lib/CHANGELOG.TXT
ta-lib/include/
ta-lib/include/ta_abstract.h
ta-lib/include/ta_func.h
ta-lib/include/ta_common.h
ta-lib/include/ta_config.h.in
ta-lib/include/Makefile.am
ta-lib/include/ta_libc.h
ta-lib/include/ta_defs.h
ta-lib/missing
ta-lib/ta-lib.spec.in
ta-lib/config.guess
ta-lib/Makefile.in
ta-lib/ta-lib.dpkg.in
ta-lib/Makefile.am
ta-lib/autogen.sh
ta-lib/install-sh
ta-lib/configure
ta-lib/depcomp
ta-lib/HISTORY.TXT
ta-lib/configure.in
ta-lib/autom4te.cache/
ta-lib/autom4te.cache/output.0
ta-lib/autom4te.cache/requests
ta-lib/autom4te.cache/outpu

In [None]:
import datetime as dt
from pandas_datareader import data as pdr
import yfinance as yf
import pandas as pd

In [167]:
# read companies stock data inserting needed ticker value 'AAPL', 'MSFT', 'GOOG', 'AMZN', 'NVDA', 'META', 'TSLA', 'BRK-B', 'LLY', 'V'

company_ticker = "V"
yf.pdr_override()
start = dt.datetime(2012,1,1)
end = dt.datetime(2023,11,30)
df = pdr.get_data_yahoo(company_ticker, start, end)

[*********************100%%**********************]  1 of 1 completed


In [168]:
# preprocess data
df.drop(['Close'], axis=1)
df = df[['Open', 'High', 'Low', 'Adj Close', 'Volume']]
df.columns = ['open', 'high', 'low', 'close', 'volume']
df.volume /= 1e3 # sadalam apjomu ar 1000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.volume /= 1e3 # sadalam apjomu ar 1000


In [169]:
# create attributes

# add price time lags
lags = [1, 2, 3, 4, 5, 10, 21, 42, 63]
for i in lags:
  df["lag_{}".format(i)] = df.close.shift(i)

# add return time lags
for lag in lags:
    df[f'return_{lag}d'] = df.close.pct_change(lag)

# add technical indicators from talib library
from talib import RSI, BBANDS, MACD, ATR, SMA, EMA

# RSI
df['rsi'] =  RSI(df.close, timeperiod=14)

#Bollinger Bands
upperband, middleband, lowerband = BBANDS(df.close, timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)
df['bb_high'] = upperband
df['bb_low'] = lowerband

# ATR
df['atr'] = ATR(df.high, df.low, df.close, timeperiod=14)

# MACD
macd, macdsignal, macdhist = MACD(df.close, fastperiod=12, slowperiod=26, signalperiod=9)
df['macd'] = macd

# SMA
df['sma_10'] = SMA(df.close, timeperiod=30)
df['sma_20'] = SMA(df.close, timeperiod=20)
df['sma_50'] = SMA(df.close, timeperiod=50)

# EMA
df['ema_10'] = EMA(df.close, timeperiod=30)
df['ema_20'] = EMA(df.close, timeperiod=30)
df['ema_50'] = EMA(df.close, timeperiod=50)

# add weekday categorical attributes
df['weekday'] = df.index.get_level_values('Date').weekday
df = pd.get_dummies(df, columns=['weekday'],
                        prefix=['weekday'],
                        prefix_sep=['_'],
                        drop_first=False)

# add month categorical attributes
df['month'] = df.index.get_level_values('Date').month
df = pd.get_dummies(df,columns=['month'],
                    prefix=['month'],
                    prefix_sep=['_'],drop_first=False)

In [170]:
# create target variables

# for regression task
# create target variable for price prediction by shifting next day's close price one day back
df['target_close'] = df.close.shift(-1)

# create target variable for 1 day return prediction by shifting next day's 1 day return value one day back
df['target_return_1d'] = df.return_1d.shift(-1)

# for classification task
df['target_class'] = df.close-df.open
df['target_class'] = df.target_class.shift(-1)
df['target_class'] = [1 if df['target_class'][i] > 0 else 0 for i in range(len(df))]

In [171]:
df.columns

Index(['open', 'high', 'low', 'close', 'volume', 'lag_1', 'lag_2', 'lag_3',
       'lag_4', 'lag_5', 'lag_10', 'lag_21', 'lag_42', 'lag_63', 'return_1d',
       'return_2d', 'return_3d', 'return_4d', 'return_5d', 'return_10d',
       'return_21d', 'return_42d', 'return_63d', 'rsi', 'bb_high', 'bb_low',
       'atr', 'macd', 'sma_10', 'sma_20', 'sma_50', 'ema_10', 'ema_20',
       'ema_50', 'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3',
       'weekday_4', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5',
       'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
       'month_12', 'target_close', 'target_return_1d', 'target_class'],
      dtype='object')

In [172]:
df.describe()

Unnamed: 0,open,high,low,close,volume,lag_1,lag_2,lag_3,lag_4,lag_5,...,month_6,month_7,month_8,month_9,month_10,month_11,month_12,target_close,target_return_1d,target_class
count,2997.0,2997.0,2997.0,2997.0,2997.0,2996.0,2995.0,2994.0,2993.0,2992.0,...,2997.0,2997.0,2997.0,2997.0,2997.0,2997.0,2997.0,2996.0,2996.0,2997.0
mean,127.029602,128.186009,125.866699,123.413722,9365.972973,123.370059,123.326797,123.283105,123.23933,123.195721,...,0.085085,0.084418,0.089089,0.081415,0.088088,0.081748,0.077077,123.447026,0.00091,0.061061
std,71.087542,71.799787,70.385513,71.176132,5074.976169,71.147857,71.120314,71.091981,71.063492,71.035303,...,0.279055,0.27806,0.28492,0.273517,0.28347,0.274027,0.266758,71.164656,0.015289,0.239482
min,24.745001,24.875,24.5825,22.747015,1640.9,22.747015,22.747015,22.747015,22.747015,22.747015,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.747015,-0.135472,0.0
25%,64.629997,65.162498,64.067497,60.668095,6260.8,60.662222,60.656349,60.650476,60.644604,60.642254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.713737,-0.006788,0.0
50%,112.599998,113.440002,112.080002,108.19091,8207.6,108.147713,108.104515,108.099712,108.09491,108.008503,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,108.301327,0.001347,0.0
75%,200.0,202.539993,197.979996,196.704788,11142.3,196.694729,196.682304,196.663795,196.635483,196.462769,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,196.733948,0.008588,0.0
max,254.300003,256.070007,253.520004,254.300003,90073.6,254.300003,254.300003,254.300003,253.720001,252.229996,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,254.300003,0.138426,1.0


In [173]:
# get rid of rows with NA values as they were created while shifting data
df = df.dropna(axis=0)

In [174]:
df.describe()

Unnamed: 0,open,high,low,close,volume,lag_1,lag_2,lag_3,lag_4,lag_5,...,month_6,month_7,month_8,month_9,month_10,month_11,month_12,target_close,target_return_1d,target_class
count,2933.0,2933.0,2933.0,2933.0,2933.0,2933.0,2933.0,2933.0,2933.0,2933.0,...,2933.0,2933.0,2933.0,2933.0,2933.0,2933.0,2933.0,2933.0,2933.0,2933.0
mean,129.120937,130.296646,127.93791,125.473507,9252.157893,125.396595,125.319198,125.241825,125.164677,125.088073,...,0.086942,0.08626,0.091033,0.083191,0.09001,0.083191,0.078759,125.550749,0.000874,0.062393
std,70.296524,71.003543,69.599311,70.428971,5006.264099,70.412895,70.396092,70.378991,70.362125,70.346044,...,0.281798,0.280795,0.287705,0.276218,0.286245,0.276218,0.269408,70.44591,0.01534,0.24191
min,28.0075,28.610001,27.985001,25.860882,1640.9,25.860882,25.860882,25.860882,25.860882,25.860882,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.860882,-0.135472,0.0
25%,66.252502,66.82,65.75,62.298462,6227.3,62.248859,62.239647,62.21833,62.169048,62.138454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.305229,-0.006778,0.0
50%,119.269997,120.589996,117.800003,115.044197,8121.2,114.873039,114.640251,114.575409,114.354614,114.245934,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,115.05545,0.001346,0.0
75%,200.880005,203.5,199.100006,197.49176,10999.5,197.397736,197.356842,197.306976,197.28833,197.268051,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,197.540985,0.008522,0.0
max,254.300003,254.979996,253.5,254.300003,90073.6,254.300003,254.300003,253.720001,252.229996,249.970001,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,254.300003,0.138426,1.0


In [175]:
df.to_csv(f"{company_ticker}_data.csv")