In [1]:
from dotenv import load_dotenv
import os

In [2]:
load_dotenv("../.env")

True

In [3]:
api_key = os.getenv("ALPHA_VANTAGE_API_KEY")

## Intraday Stock Market Data API

In [4]:
function = "TIME_SERIES_INTRADAY"
symbol = "MSFT"
interval = "5min"
datatype = "csv"
outputsize = "full"

In [5]:
from datetime import datetime

current_year = datetime.now().year

In [16]:
import requests

for i in [current_year - 2, current_year - 1, current_year]:
    for j in range(1, 13):
        period = f"{i}-{j:02d}"
        endpoint = f"https://www.alphavantage.co/query?function={function}&symbol={symbol}&interval={interval}&outputsize={outputsize}&apikey={api_key}&datatype={datatype}&month={period}"
        r = requests.get(url=endpoint)

        if not os.path.exists(f"../data/{i}/{j:02d}"):
            os.makedirs(f"../data/{i}/{j:02d}")
            
        with open(f"../data/{i}/{j:02d}/{symbol}_{interval}.csv", "w") as f:
            f.write(r.text)


## News & Sentiments API

In [11]:
import requests

function = "NEWS_SENTIMENT"
tickers = "MSFT"
time_from = "20230101T0400"
time_to = "20250115T1955"
limit=1000

endpoint = f"https://www.alphavantage.co/query?function={function}&tickers={tickers}&apikey={api_key}&time_from={time_from}&time_to={time_to}&limit={limit}"

r = requests.get(url=endpoint)

In [12]:
r.json()

{'items': '685',
 'sentiment_score_definition': 'x <= -0.35: Bearish; -0.35 < x <= -0.15: Somewhat-Bearish; -0.15 < x < 0.15: Neutral; 0.15 <= x < 0.35: Somewhat_Bullish; x >= 0.35: Bullish',
 'relevance_score_definition': '0 < x <= 1, with a higher score indicating higher relevance.',
 'feed': [{'title': 'Is Amazon the More Resilient Pick Over NVDA Stock in the AI Revolution?',
   'url': 'https://www.zacks.com/stock/news/2397737/is-amazon-the-more-resilient-pick-over-nvda-stock-in-the-ai-revolution',
   'time_published': '20250115T194500',
   'authors': ['Nilanjan Banerjee'],
   'summary': "Amazon's diversified business makes it a more stable pick than NVIDIA, which relies heavily on AI.",
   'banner_image': 'https://staticx-tuner.zacks.com/images/articles/main/f3/74235.jpg',
   'source': 'Zacks Commentary',
   'category_within_source': 'n/a',
   'source_domain': 'www.zacks.com',
   'topics': [{'topic': 'Retail & Wholesale', 'relevance_score': '0.333333'},
    {'topic': 'Financial Mar

In [13]:
if not os.path.exists(f"../data/news"):
    os.makedirs(f"../data/news")

with open(f"../data/news/{tickers}.json", "w") as f:
    f.write(r.text)

In [None]:
from datetime import datetime
import json

for feed in r.json()["feed"]:
    date = datetime.strptime(feed["time_published"], "%Y%m%dT%H%M%S")
    year = date.year
    month = date.month

    if not os.path.exists(f"../data/news/{year}/{month:02d}"):
        os.makedirs(f"../data/news/{year}/{month:02d}")

    with open(f"../data/news/{year}/{month:02d}/MSFT_news_{date.hour}-{date.minute}.json", "w") as f:
        f.write(json.dumps(feed, ensure_ascii=False, indent=4))

## Feature Engineering

In [20]:
import pandas as pd
import glob

In [21]:
df = pd.read_csv("../data/2023/01/MSFT_5min.csv", parse_dates=["timestamp"])
df = df.sort_values("timestamp")
df = df.reset_index(drop=True)

In [23]:
df_small = df.head()

In [24]:
df_small

Unnamed: 0,timestamp,open,high,low,close,volume
0,2023-01-03 04:00:00,237.1125,238.8933,237.1125,238.8933,532
1,2023-01-03 04:05:00,238.8343,238.8343,238.5883,238.5982,1265
2,2023-01-03 04:10:00,238.5982,238.972,238.1062,238.972,7306
3,2023-01-03 04:15:00,238.8146,239.3754,238.8146,239.3754,730
4,2023-01-03 04:20:00,238.9819,239.2869,238.9819,239.2869,737


In [25]:
df_small['sma_3'] = df_small['close'].rolling(window=3).mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['sma_3'] = df_small['close'].rolling(window=3).mean()


In [26]:
df_small

Unnamed: 0,timestamp,open,high,low,close,volume,sma_3
0,2023-01-03 04:00:00,237.1125,238.8933,237.1125,238.8933,532,
1,2023-01-03 04:05:00,238.8343,238.8343,238.5883,238.5982,1265,
2,2023-01-03 04:10:00,238.5982,238.972,238.1062,238.972,7306,238.821167
3,2023-01-03 04:15:00,238.8146,239.3754,238.8146,239.3754,730,238.981867
4,2023-01-03 04:20:00,238.9819,239.2869,238.9819,239.2869,737,239.211433


In [27]:
df_small['ema_3'] = df_small['close'].ewm(span=3, adjust=False).mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['ema_3'] = df_small['close'].ewm(span=3, adjust=False).mean()


In [28]:
df_small

Unnamed: 0,timestamp,open,high,low,close,volume,sma_3,ema_3
0,2023-01-03 04:00:00,237.1125,238.8933,237.1125,238.8933,532,,238.8933
1,2023-01-03 04:05:00,238.8343,238.8343,238.5883,238.5982,1265,,238.74575
2,2023-01-03 04:10:00,238.5982,238.972,238.1062,238.972,7306,238.821167,238.858875
3,2023-01-03 04:15:00,238.8146,239.3754,238.8146,239.3754,730,238.981867,239.117138
4,2023-01-03 04:20:00,238.9819,239.2869,238.9819,239.2869,737,239.211433,239.202019


In [31]:
def calculate_rsi(series, period=14):
    delta = series.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

df_small['rsi_14'] = calculate_rsi(df_small['close'], period= 14)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['rsi_14'] = calculate_rsi(df_small['close'], period= 14)


In [32]:
df_small

Unnamed: 0,timestamp,open,high,low,close,volume,sma_3,ema_3,rsi_14
0,2023-01-03 04:00:00,237.1125,238.8933,237.1125,238.8933,532,,238.8933,
1,2023-01-03 04:05:00,238.8343,238.8343,238.5883,238.5982,1265,,238.74575,
2,2023-01-03 04:10:00,238.5982,238.972,238.1062,238.972,7306,238.821167,238.858875,
3,2023-01-03 04:15:00,238.8146,239.3754,238.8146,239.3754,730,238.981867,239.117138,
4,2023-01-03 04:20:00,238.9819,239.2869,238.9819,239.2869,737,239.211433,239.202019,


In [33]:
df_small['price_change'] = df_small['close'].pct_change()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['price_change'] = df_small['close'].pct_change()


In [34]:
df_small

Unnamed: 0,timestamp,open,high,low,close,volume,sma_3,ema_3,rsi_14,price_change
0,2023-01-03 04:00:00,237.1125,238.8933,237.1125,238.8933,532,,238.8933,,
1,2023-01-03 04:05:00,238.8343,238.8343,238.5883,238.5982,1265,,238.74575,,-0.001235
2,2023-01-03 04:10:00,238.5982,238.972,238.1062,238.972,7306,238.821167,238.858875,,0.001567
3,2023-01-03 04:15:00,238.8146,239.3754,238.8146,239.3754,730,238.981867,239.117138,,0.001688
4,2023-01-03 04:20:00,238.9819,239.2869,238.9819,239.2869,737,239.211433,239.202019,,-0.00037


In [36]:
df_small['volatility_3'] = df_small['close'].rolling(window=3).std()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['volatility_3'] = df_small['close'].rolling(window=3).std()


In [38]:
df_small

Unnamed: 0,timestamp,open,high,low,close,volume,sma_3,ema_3,rsi_14,price_change,volatility_3
0,2023-01-03 04:00:00,237.1125,238.8933,237.1125,238.8933,532,,238.8933,,,
1,2023-01-03 04:05:00,238.8343,238.8343,238.5883,238.5982,1265,,238.74575,,-0.001235,
2,2023-01-03 04:10:00,238.5982,238.972,238.1062,238.972,7306,238.821167,238.858875,,0.001567,0.197064
3,2023-01-03 04:15:00,238.8146,239.3754,238.8146,239.3754,730,238.981867,239.117138,,0.001688,0.388694
4,2023-01-03 04:20:00,238.9819,239.2869,238.9819,239.2869,737,239.211433,239.202019,,-0.00037,0.212024


In [39]:
df_small['hour'] = df_small['timestamp'].dt.hour
df_small['day_of_week'] = df_small['timestamp'].dt.dayofweek

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['hour'] = df_small['timestamp'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['day_of_week'] = df_small['timestamp'].dt.dayofweek


In [40]:
df_small

Unnamed: 0,timestamp,open,high,low,close,volume,sma_3,ema_3,rsi_14,price_change,volatility_3,hour,day_of_week
0,2023-01-03 04:00:00,237.1125,238.8933,237.1125,238.8933,532,,238.8933,,,,4,1
1,2023-01-03 04:05:00,238.8343,238.8343,238.5883,238.5982,1265,,238.74575,,-0.001235,,4,1
2,2023-01-03 04:10:00,238.5982,238.972,238.1062,238.972,7306,238.821167,238.858875,,0.001567,0.197064,4,1
3,2023-01-03 04:15:00,238.8146,239.3754,238.8146,239.3754,730,238.981867,239.117138,,0.001688,0.388694,4,1
4,2023-01-03 04:20:00,238.9819,239.2869,238.9819,239.2869,737,239.211433,239.202019,,-0.00037,0.212024,4,1


In [41]:
for lag in range(1, 4):
    df_small[f'close_lag_{lag}'] = df_small['close'].shift(lag)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small[f'close_lag_{lag}'] = df_small['close'].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small[f'close_lag_{lag}'] = df_small['close'].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small[f'close_lag_{lag}'] = df_small['close'].shift(lag)


In [42]:
df_small

Unnamed: 0,timestamp,open,high,low,close,volume,sma_3,ema_3,rsi_14,price_change,volatility_3,hour,day_of_week,close_lag_1,close_lag_2,close_lag_3
0,2023-01-03 04:00:00,237.1125,238.8933,237.1125,238.8933,532,,238.8933,,,,4,1,,,
1,2023-01-03 04:05:00,238.8343,238.8343,238.5883,238.5982,1265,,238.74575,,-0.001235,,4,1,238.8933,,
2,2023-01-03 04:10:00,238.5982,238.972,238.1062,238.972,7306,238.821167,238.858875,,0.001567,0.197064,4,1,238.5982,238.8933,
3,2023-01-03 04:15:00,238.8146,239.3754,238.8146,239.3754,730,238.981867,239.117138,,0.001688,0.388694,4,1,238.972,238.5982,238.8933
4,2023-01-03 04:20:00,238.9819,239.2869,238.9819,239.2869,737,239.211433,239.202019,,-0.00037,0.212024,4,1,239.3754,238.972,238.5982


In [43]:
df_small['trend'] = (df_small['close'].shift(-1) > df_small['close']).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['trend'] = (df_small['close'].shift(-1) > df_small['close']).astype(int)


In [44]:
df_small

Unnamed: 0,timestamp,open,high,low,close,volume,sma_3,ema_3,rsi_14,price_change,volatility_3,hour,day_of_week,close_lag_1,close_lag_2,close_lag_3,trend
0,2023-01-03 04:00:00,237.1125,238.8933,237.1125,238.8933,532,,238.8933,,,,4,1,,,,0
1,2023-01-03 04:05:00,238.8343,238.8343,238.5883,238.5982,1265,,238.74575,,-0.001235,,4,1,238.8933,,,1
2,2023-01-03 04:10:00,238.5982,238.972,238.1062,238.972,7306,238.821167,238.858875,,0.001567,0.197064,4,1,238.5982,238.8933,,1
3,2023-01-03 04:15:00,238.8146,239.3754,238.8146,239.3754,730,238.981867,239.117138,,0.001688,0.388694,4,1,238.972,238.5982,238.8933,0
4,2023-01-03 04:20:00,238.9819,239.2869,238.9819,239.2869,737,239.211433,239.202019,,-0.00037,0.212024,4,1,239.3754,238.972,238.5982,0


In [45]:
# For large dataset to drop rolling and lagged features NaN values
#df_small = df_small.dropna()