In [1]:
import hopsworks
import os
from dotenv import load_dotenv

import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt


import yfinance as yf



# Populate the process environment from a local .env file (if one exists)
load_dotenv()

# API credentials; each is None when the corresponding variable is not set
hopsworks_api_key = os.environ.get("HOPSWORKS_API_KEY")
tiingo_api_key = os.environ.get("TIINGO_API_KEY")


In [2]:
def fetch_historical_data(ticker: str = "AMZN", period: str = "2y", interval: str = "1h") -> pd.DataFrame:
    """
    Fetch historical stock data for a given ticker, period, and interval.

    Parameters:
    ticker (str): The stock ticker symbol (default is "AMZN").
    period (str): The period over which to fetch data (default is "2y").
    interval (str): The interval between data points (default is "1h").

    Returns:
    pd.DataFrame: A DataFrame containing the historical stock data with the following modifications:
        - Index reset to convert the date index into a column named 'datetime'
          (works for both intraday and daily-or-longer intervals).
        - Time zone information removed from the 'datetime' column (wall-clock time is kept,
          sub-second precision is dropped).
        - Column names converted to lower case.
        - An 'id' column added as a primary key, formatted as 'YYYY-MM-DD HH:MM:SS'.

    Raises:
    ValueError: If the download returns no rows for the requested ticker/period/interval.
    """
    data = pd.DataFrame(yf.download(tickers=ticker, period=period, interval=interval, multi_level_index=False))

    # Fail with an explicit message instead of an obscure KeyError further down
    if data.empty:
        raise ValueError(
            f"No data returned for ticker={ticker!r}, period={period!r}, interval={interval!r}"
        )

    # Reset the index to convert the date index into a column
    data = data.reset_index()

    # The former index is now the first column. yfinance names it 'Datetime' for
    # intraday intervals and 'Date' for daily or longer ones, so rename it by
    # position rather than relying on one specific name.
    data = data.rename(columns={data.columns[0]: "datetime"})

    # Remove the time zone information while keeping the wall-clock time
    timestamps = pd.to_datetime(data["datetime"])
    if timestamps.dt.tz is not None:
        timestamps = timestamps.dt.tz_localize(None)
    # Second resolution keeps 'datetime' and the string 'id' in exact agreement
    data["datetime"] = timestamps.dt.floor("s")

    # Rename columns to lower case for consistency
    data.columns = [str(column).lower() for column in data.columns]

    # Add the 'id' column as a primary key. An explicit format is used so the time
    # part is always present, even when every timestamp falls on midnight.
    data["id"] = data["datetime"].dt.strftime("%Y-%m-%d %H:%M:%S")

    return data

In [3]:
# Download the most recent 20 days of hourly bars for the default ticker ("AMZN"),
# then preview the first rows to sanity-check the columns.
df = fetch_historical_data(period="20d", interval="1h")
df.head()

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


Unnamed: 0,datetime,close,high,low,open,volume,id
0,2025-01-24 14:30:00,234.524994,236.399994,234.321503,234.970001,5395617,2025-01-24 14:30:00
1,2025-01-24 15:30:00,234.410004,235.050003,233.970001,234.550003,2600103,2025-01-24 15:30:00
2,2025-01-24 16:30:00,233.759995,234.794998,233.679993,234.429993,1971929,2025-01-24 16:30:00
3,2025-01-24 17:30:00,233.985001,234.085007,233.559998,233.764999,2405786,2025-01-24 17:30:00
4,2025-01-24 18:30:00,233.339996,234.100006,232.929993,233.984802,2437713,2025-01-24 18:30:00


In [4]:
# Add technical indicators

from ta.momentum import RSIIndicator
from ta.trend import CCIIndicator

def calculate_indicators(data: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates RSI and CCI indicators for the given stock data.

    The input frame is left untouched; the indicators are added to a new frame.

    Args:
        data (pd.DataFrame): The stock data. Must contain 'close', 'high' and 'low' columns.

    Returns:
        pd.DataFrame: A new frame with 'rsi' and 'cci' columns added. Rows containing
        NaN in any column are dropped.

    Raises:
        KeyError: If any of the 'close', 'high' or 'low' columns is missing.
    """
    missing = [name for name in ("close", "high", "low") if name not in data.columns]
    if missing:
        raise KeyError(f"calculate_indicators requires columns {missing} which are missing")

    rsi = RSIIndicator(data['close']).rsi()
    cci = CCIIndicator(data['high'], data['low'], data['close']).cci()

    # assign() returns a new frame, so the caller's DataFrame is not mutated
    # (and no SettingWithCopyWarning is raised when `data` is a slice).
    return data.assign(rsi=rsi, cci=cci).dropna()

In [None]:
# Add the 'rsi' and 'cci' columns; calculate_indicators also drops rows containing NaN.
# NOTE: `df` is overwritten here, so re-running this cell feeds the already-trimmed frame back in.
df = calculate_indicators(df)


Unnamed: 0,datetime,close,high,low,open,volume,id,rsi,cci
132,2025-02-20 20:30:00,222.869995,223.119995,222.360001,222.707306,3248306,2025-02-20 20:30:00,37.522895,-91.81368
133,2025-02-21 14:30:00,219.494995,223.309998,219.410004,223.235001,10077148,2025-02-21 14:30:00,27.479762,-180.835157
134,2025-02-21 15:30:00,219.264999,219.839996,218.070007,219.480804,6202619,2025-02-21 15:30:00,26.950381,-214.759373
135,2025-02-21 16:30:00,217.630005,219.740005,217.399994,219.25,4778204,2025-02-21 16:30:00,23.486556,-201.945381
136,2025-02-21 17:30:00,216.049698,217.690598,215.949997,217.630005,7934021,2025-02-21 17:30:00,20.715212,-215.456538
137,2025-02-21 18:30:00,214.960007,216.5,214.740005,216.039993,6070526,2025-02-21 18:30:00,19.046318,-207.864825
138,2025-02-21 19:30:00,216.439896,216.479904,214.750107,214.970001,5888288,2025-02-21 19:30:00,27.579504,-155.573258
139,2025-02-21 20:30:00,216.490005,216.850006,215.630005,216.440002,5715615,2025-02-21 20:30:00,27.856803,-121.195865


In [None]:
# Get the most recent trading day's data.
# Rows are selected by calendar date instead of a fixed row count, so shortened
# or partially downloaded sessions never pull in bars from the previous day.
latest_day = df["datetime"].dt.normalize().max()
# .copy() detaches the selection from `df` so later changes cannot touch the parent frame
yesterday_data = df.loc[df["datetime"].dt.normalize() == latest_day].copy()

Unnamed: 0,datetime,close,high,low,open,volume,id,rsi,cci
133,2025-02-21 14:30:00,219.494995,223.309998,219.410004,223.235001,10077148,2025-02-21 14:30:00,27.479762,-180.835157
134,2025-02-21 15:30:00,219.264999,219.839996,218.070007,219.480804,6202619,2025-02-21 15:30:00,26.950381,-214.759373
135,2025-02-21 16:30:00,217.630005,219.740005,217.399994,219.25,4778204,2025-02-21 16:30:00,23.486556,-201.945381
136,2025-02-21 17:30:00,216.049698,217.690598,215.949997,217.630005,7934021,2025-02-21 17:30:00,20.715212,-215.456538
137,2025-02-21 18:30:00,214.960007,216.5,214.740005,216.039993,6070526,2025-02-21 18:30:00,19.046318,-207.864825
138,2025-02-21 19:30:00,216.439896,216.479904,214.750107,214.970001,5888288,2025-02-21 19:30:00,27.579504,-155.573258
139,2025-02-21 20:30:00,216.490005,216.850006,215.630005,216.440002,5715615,2025-02-21 20:30:00,27.856803,-121.195865


### Login to Hopsworks

In [7]:
# Fail fast when the key is missing: str(None) would otherwise be sent to
# Hopsworks as the literal API key "None" and surface as a confusing auth error.
if not hopsworks_api_key:
    raise RuntimeError(
        "HOPSWORKS_API_KEY is not set; add it to the environment or the .env file."
    )

project = hopsworks.login(api_key_value=hopsworks_api_key)

fs = project.get_feature_store()

# Get feature group
amazon_fg = fs.get_feature_group("amazon_stock_prices", version=1)

2025-02-22 15:59:12,440 INFO: Initializing external client
2025-02-22 15:59:12,440 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-02-22 15:59:15,272 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1212597


In [8]:
# Insert the selected rows (`yesterday_data`) into the `amazon_stock_prices` feature group.
# Per the recorded output, the call uploads the frame and launches an offline
# materialization job; the returned (Job, None) tuple is displayed as the cell result.
amazon_fg.insert(yesterday_data)

Uploading Dataframe: 100.00% |██████████| Rows 7/7 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: amazon_stock_prices_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1212597/jobs/named/amazon_stock_prices_1_offline_fg_materialization/executions


(Job('amazon_stock_prices_1_offline_fg_materialization', 'SPARK'), None)