In [17]:
import yfinance as yf
import pandas as pd
import numpy as np



In [None]:
tickers = "AAPL NVDA MSFT"

data = yf.download(tickers, period="max", interval="1h", group_by='ticker', threads=True, auto_adjust=True)



In [13]:
data['AAPL'].head()

Price,Open,High,Low,Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-10-24 13:30:00+00:00,147.25,148.350006,146.0,146.630203,20169319
2022-10-24 14:30:00+00:00,146.630096,148.449997,146.470001,148.210007,10131448
2022-10-24 15:30:00+00:00,148.219894,148.820007,147.529999,148.436096,7921370
2022-10-24 16:30:00+00:00,148.434998,148.729996,147.910004,148.080002,6399968
2022-10-24 17:30:00+00:00,148.070007,149.365005,147.929993,149.285004,5909741


In [14]:
data.shape

(3484, 15)

In [21]:
data.columns

MultiIndex([('AAPL',   'Open'),
            ('AAPL',   'High'),
            ('AAPL',    'Low'),
            ('AAPL',  'Close'),
            ('AAPL', 'Volume'),
            ('MSFT',   'Open'),
            ('MSFT',   'High'),
            ('MSFT',    'Low'),
            ('MSFT',  'Close'),
            ('MSFT', 'Volume'),
            ('NVDA',   'Open'),
            ('NVDA',   'High'),
            ('NVDA',    'Low'),
            ('NVDA',  'Close'),
            ('NVDA', 'Volume')],
           names=['Ticker', 'Price'])

In [16]:
# Save the data
for symbol in tickers.split(" "):
    symbol_df = data[symbol]
    symbol_df.to_csv(f'{symbol}_hourly_data.csv')
    print(f"Data for {symbol} saved to '{symbol}_hourly_data.csv'")

Data for AAPL saved to 'AAPL_hourly_data.csv'
Data for NVDA saved to 'NVDA_hourly_data.csv'
Data for MSFT saved to 'MSFT_hourly_data.csv'


In [24]:
print(data.index.names)

['Datetime']


In [25]:
print(data.columns)

MultiIndex([('AAPL',   'Open'),
            ('AAPL',   'High'),
            ('AAPL',    'Low'),
            ('AAPL',  'Close'),
            ('AAPL', 'Volume'),
            ('MSFT',   'Open'),
            ('MSFT',   'High'),
            ('MSFT',    'Low'),
            ('MSFT',  'Close'),
            ('MSFT', 'Volume'),
            ('NVDA',   'Open'),
            ('NVDA',   'High'),
            ('NVDA',    'Low'),
            ('NVDA',  'Close'),
            ('NVDA', 'Volume')],
           names=['Ticker', 'Price'])


In [27]:
data_long = data.stack(level='Ticker').reset_index()
data_long.columns = ['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume']
data_long.head()

  data_long = data.stack(level='Ticker').reset_index()


Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume
0,2022-10-24 13:30:00+00:00,AAPL,147.25,148.350006,146.0,146.630203,20169319
1,2022-10-24 13:30:00+00:00,MSFT,243.648895,245.729996,241.294998,242.479996,6780395
2,2022-10-24 13:30:00+00:00,NVDA,12.485,12.488,12.064,12.147631,12959042
3,2022-10-24 14:30:00+00:00,AAPL,146.630096,148.449997,146.470001,148.210007,10131448
4,2022-10-24 14:30:00+00:00,MSFT,242.485001,244.597198,242.274994,244.520004,3068392


In [36]:
data_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10452 entries, 0 to 10451
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype              
---  ------  --------------  -----              
 0   Date    10452 non-null  datetime64[ns, UTC]
 1   Ticker  10452 non-null  object             
 2   Open    10452 non-null  float64            
 3   High    10452 non-null  float64            
 4   Low     10452 non-null  float64            
 5   Close   10452 non-null  float64            
 6   Volume  10452 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(4), int64(1), object(1)
memory usage: 571.7+ KB


In [33]:
def engineer_features(df):
    df = df.sort_values(by=['Ticker', 'Date'])

    # Returns
    df['Returns'] = df.groupby('Ticker')['Close'].pct_change()
    df['Log_Returns'] = df.groupby('Ticker')['Close'].transform(lambda x: np.log(x / x.shift(1)))

    # Volume features
    df['Volume_Change'] = df.groupby('Ticker')['Volume'].pct_change()
    df['Volume_MA_Ratio'] = df.groupby('Ticker')['Volume'].transform(lambda x: x / x.rolling(window=10).mean())

    # Price momentum
    for window in [1, 3, 5, 10]:
        df[f'Momentum_{window}'] = df.groupby('Ticker')['Close'].transform(lambda x: x.pct_change(periods=window))

    # Close Position to High and Low
    df['Bar_Range'] = df['High'] - df['Low']
    df['Close_Position'] = (df['Close'] - df['Low']) / df['Bar_Range']
    # if high and low are equal, set close position to 0.5
    df.loc[df['Bar_Range'] == 0, 'Close_Position'] = 0.5
    df['Close_Position_Category'] = pd.cut(df['Close_Position'], 
                                           bins=[0, 0.2, 0.4, 0.6, 0.8, 1], 
                                           labels=['Very Low', 'Low', 'Middle', 'High', 'Very High'])

    # Our label: 1 if next hour's price is higher, else 0
    df['Target'] = df.groupby('Ticker')['Close'].transform(lambda x: (x.shift(-1) > x).astype(int))

    # Drop NaN values
    df.dropna(inplace=True)


    return df

df = engineer_features(data_long)



In [34]:
df.head()

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Returns,Log_Returns,Volume_Change,Volume_MA_Ratio,Momentum_1,Momentum_3,Momentum_5,Momentum_10,Bar_Range,Close_Position,Close_Position_Category,Target
30,2022-10-25 16:30:00+00:00,AAPL,151.789993,151.839996,151.149994,151.494995,5889941,-0.002009,-0.002011,-0.278226,0.647903,-0.002009,0.002283,0.010101,0.033177,0.690002,0.5,Middle,1
33,2022-10-25 17:30:00+00:00,AAPL,151.490005,152.050003,151.154999,152.049301,6496406,0.003659,0.003652,0.102966,0.74438,0.003659,0.003063,0.016644,0.025904,0.895004,0.999216,Very High,1
36,2022-10-25 18:30:00+00:00,AAPL,152.044998,152.300003,151.759995,152.268707,6861939,0.001443,0.001442,0.056267,0.795926,0.001443,0.003088,0.007401,0.02582,0.540009,0.942046,Very High,1
39,2022-10-25 19:30:00+00:00,AAPL,152.264999,152.490005,152.020004,152.380005,7608674,0.000731,0.000731,0.108823,0.870339,0.000731,0.005842,0.005245,0.029038,0.470001,0.765957,High,0
42,2022-10-26 13:30:00+00:00,AAPL,150.960007,151.320007,149.529999,150.660004,22487941,-0.011288,-0.011352,1.955566,2.162298,-0.011288,-0.009137,-0.00751,0.009211,1.790009,0.631285,High,1


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10377 entries, 30 to 10451
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype              
---  ------                   --------------  -----              
 0   Date                     10377 non-null  datetime64[ns, UTC]
 1   Ticker                   10377 non-null  object             
 2   Open                     10377 non-null  float64            
 3   High                     10377 non-null  float64            
 4   Low                      10377 non-null  float64            
 5   Close                    10377 non-null  float64            
 6   Volume                   10377 non-null  int64              
 7   Returns                  10377 non-null  float64            
 8   Log_Returns              10377 non-null  float64            
 9   Volume_Change            10377 non-null  float64            
 10  Volume_MA_Ratio          10377 non-null  float64            
 11  Momentum_1               10377 n

In [None]:
df.to_csv('hourly_data_features.csv', index=False)