In [3]:
import yfinance as yf
import datetime
import pandas as pd

ticker = "AAPL"
interval = "1h"

# Yahoo allows max ~60 days per request
days_per_chunk = 59
end_date = datetime.datetime.now()
start_date = end_date - datetime.timedelta(days=365)

all_data = []

# Loop in 59-day windows until we cover 1 year
current_start = start_date
while current_start < end_date:
    current_end = min(current_start + datetime.timedelta(days=days_per_chunk), end_date)

    data = yf.download(
        ticker,
        start=current_start,
        end=current_end,
        interval=interval,
        progress=False
    )

    if not data.empty:
        all_data.append(data)

    # move window forward
    current_start = current_end

# Combine everything
final_data = pd.concat(all_data).sort_index()

print(final_data.head())
print("\nShape:", final_data.shape)

# Save to CSV
final_data.to_csv("AAPL_1yr_hourly.csv")


  data = yf.download(
  data = yf.download(
  data = yf.download(
  data = yf.download(
  data = yf.download(
  data = yf.download(
  data = yf.download(


Price                           Close        High         Low        Open  \
Ticker                           AAPL        AAPL        AAPL        AAPL   
Datetime                                                                    
2024-09-23 13:30:00+00:00  228.009995  228.023102  225.860001  227.339996   
2024-09-23 14:30:00+00:00  228.653397  229.449997  227.630005  228.009995   
2024-09-23 15:30:00+00:00  226.940002  228.899994  226.929993  228.660004   
2024-09-23 16:30:00+00:00  226.740005  227.410004  226.620102  226.940002   
2024-09-23 17:30:00+00:00  226.679993  227.179993  226.509995  226.748993   

Price                        Volume  
Ticker                         AAPL  
Datetime                             
2024-09-23 13:30:00+00:00  13371890  
2024-09-23 14:30:00+00:00   6527851  
2024-09-23 15:30:00+00:00   3575768  
2024-09-23 16:30:00+00:00   3228628  
2024-09-23 17:30:00+00:00   2921492  

Shape: (1734, 5)


In [4]:
print(final_data.tail())

Price                           Close        High         Low        Open  \
Ticker                           AAPL        AAPL        AAPL        AAPL   
Datetime                                                                    
2025-09-19 18:30:00+00:00  245.610001  246.100006  245.190994  245.485001   
2025-09-19 19:30:00+00:00  245.289993  246.300003  244.740005  245.600006   
2025-09-22 13:30:00+00:00  252.671600  252.735001  248.169998  248.339996   
2025-09-22 14:30:00+00:00  255.789795  256.079987  252.080002  252.660004   
2025-09-22 15:30:00+00:00  255.039993  256.630005  254.699997  255.770004   

Price                        Volume  
Ticker                         AAPL  
Datetime                             
2025-09-19 18:30:00+00:00   7490411  
2025-09-19 19:30:00+00:00  12986528  
2025-09-22 13:30:00+00:00  31788879  
2025-09-22 14:30:00+00:00  16983776  
2025-09-22 15:30:00+00:00  10636688  


In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# ----------------------------
# Load hourly data
# ----------------------------
data = pd.read_csv("AAPL_1yr_hourly.csv")

# Check columns
print("Columns in CSV:", data.columns.tolist())
print(data.head())

# ----------------------------
# Keep only numeric columns
# ----------------------------
# Many Yahoo Finance exports have 'Ticker' or 'Symbol'. Drop them.
numeric_data = data.select_dtypes(include=[np.number])

# Ensure Close column exists
if "Close" not in numeric_data.columns:
    raise ValueError("No numeric 'Close' column found in your CSV. Please check file!")

# Rebuild with datetime + close only
if "Datetime" in data.columns:
    data["Datetime"] = pd.to_datetime(data["Datetime"])
    close_prices = data[["Datetime"]].join(numeric_data["Close"]).dropna().reset_index(drop=True)
elif "Date" in data.columns:
    data["Date"] = pd.to_datetime(data["Date"])
    close_prices = data[["Date"]].join(numeric_data["Close"]).dropna().reset_index(drop=True)
    close_prices.rename(columns={"Date": "Datetime"}, inplace=True)
else:
    raise ValueError("No 'Date' or 'Datetime' column found in your CSV.")

print(close_prices.head())

# ----------------------------
# Sliding-window Random Forest
# ----------------------------
window_size = 21
results = []

for i in range(len(close_prices) - window_size - 1):
    X_train = close_prices["Close"].iloc[i:i+window_size].values.reshape(1, -1)
    y_train = close_prices["Close"].iloc[i+window_size]

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, [y_train])

    X_test = close_prices["Close"].iloc[i+1:i+1+window_size].values.reshape(1, -1)
    y_true = close_prices["Close"].iloc[i+window_size+1]
    y_pred = model.predict(X_test)[0]

    mae = mean_absolute_error([y_true], [y_pred])
    results.append({
        "Datetime": close_prices["Datetime"].iloc[i+window_size+1],
        "Actual": y_true,
        "Predicted": y_pred,
        "MAE": mae
    })

results_df = pd.DataFrame(results)
results_df.to_csv("AAPL_hourly_RF_predictions.csv", index=False)

print(results_df.head())
print("\nSaved to AAPL_hourly_RF_predictions.csv")


Columns in CSV: ['Price', 'Close', 'High', 'Low', 'Open', 'Volume']
                       Price               Close                High  \
0                     Ticker                AAPL                AAPL   
1                   Datetime                 NaN                 NaN   
2  2024-09-23 13:30:00+00:00  228.00999450683594  228.02310180664062   
3  2024-09-23 14:30:00+00:00   228.6533966064453   229.4499969482422   
4  2024-09-23 15:30:00+00:00  226.94000244140625  228.89999389648438   

                  Low                Open    Volume  
0                AAPL                AAPL      AAPL  
1                 NaN                 NaN       NaN  
2  225.86000061035156  227.33999633789062  13371890  
3   227.6300048828125  228.00999450683594   6527851  
4  226.92999267578125  228.66000366210938   3575768  


ValueError: No numeric 'Close' column found in your CSV. Please check file!

In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# ----------------------------
# Load hourly Apple stock data
# ----------------------------
data = pd.read_csv("AAPL_1yr_hourly.csv")

# Drop any non-numeric columns like "Ticker" that may contain 'AAPL'
numeric_data = data.select_dtypes(include=[np.number])

# Ensure Close column exists
if "Close" not in numeric_data.columns:
    raise ValueError("No numeric 'Close' column found in your CSV. Check file contents!")

# Build dataframe with Datetime + Close only
if "Datetime" in data.columns:
    data["Datetime"] = pd.to_datetime(data["Datetime"])
    close_prices = data[["Datetime"]].join(numeric_data["Close"]).dropna().reset_index(drop=True)
elif "Date" in data.columns:
    data["Date"] = pd.to_datetime(data["Date"])
    close_prices = data[["Date"]].join(numeric_data["Close"]).dropna().reset_index(drop=True)
    close_prices.rename(columns={"Date": "Datetime"}, inplace=True)
else:
    raise ValueError("No 'Date' or 'Datetime' column found in your CSV.")

# ----------------------------
# Sliding-window Random Forest
# ----------------------------
window_size = 21   # 21-hour training window
results = []

for i in range(len(close_prices) - window_size - 1):
    # Features = last 21 closes
    X_train = close_prices["Close"].iloc[i:i+window_size].values.reshape(1, -1)
    # Target = next close
    y_train = close_prices["Close"].iloc[i+window_size]

    # Train RF
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, [y_train])

    # Predict the following hour
    X_test = close_prices["Close"].iloc[i+1:i+1+window_size].values.reshape(1, -1)
    y_true = close_prices["Close"].iloc[i+window_size+1]
    y_pred = model.predict(X_test)[0]


ValueError: No numeric 'Close' column found in your CSV. Check file contents!

In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# ----------------------------
# Load hourly Apple stock data
# ----------------------------
data = pd.read_csv("AAPL_1yr_hourly.csv", header=[0,1], index_col=0)

# Flatten multi-index columns ("Close" under "Price" row)
data.columns = [col[0] for col in data.columns]   # keep top-level names only
data.index = pd.to_datetime(data.index)           # ensure datetime index

# Keep only Close column
close_prices = data[["Close"]].dropna().reset_index()
close_prices.rename(columns={"index": "Datetime"}, inplace=True)

# ----------------------------
# Sliding-window Random Forest
# ----------------------------
window_size = 21   # 21-hour training window
results = []

for i in range(len(close_prices) - window_size - 1):
    # Features = last 21 closes
    X_train = close_prices["Close"].iloc[i:i+window_size].values.reshape(1, -1)
    # Target = next close
    y_train = close_prices["Close"].iloc[i+window_size]

    # Train RF
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, [y_train])

    # Predict the following hour
    X_test = close_prices["Close"].iloc[i+1:i+1+window_size].values.reshape(1, -1)
    y_true = close_prices["Close"].iloc[i+window_size+1]
    y_pred = model.predict(X_test)[0]

    mae = mean_absolute_error([y_true], [y_pred])
    results.append({
        "Datetime": close_prices["Datetime"].iloc[i+window_size+1],
        "Actual": y_true,
        "Predicted": y_pred,
        "MAE": mae
    })

# ----------------------------
# Save results to CSV
# ----------------------------
results_df = pd.DataFrame(results)
output_file = "AAPL_hourly_RF_predictions.csv"
results_df.to_csv(output_file, index=False)

print(f"✅ Results saved to {output_file}")
print(results_df.head())


✅ Results saved to AAPL_hourly_RF_predictions.csv
                   Datetime      Actual   Predicted       MAE
0 2024-09-26 14:30:00+00:00  227.389694  226.899994  0.489700
1 2024-09-26 15:30:00+00:00  227.309998  227.389694  0.079697
2 2024-09-26 16:30:00+00:00  227.154999  227.309998  0.154999
3 2024-09-26 17:30:00+00:00  227.705002  227.154999  0.550003
4 2024-09-26 18:30:00+00:00  227.574997  227.705002  0.130005


In [13]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# ----------------------------
# Load hourly Apple stock data
# ----------------------------
data = pd.read_csv("AAPL_1yr_hourly.csv", header=[0, 1], index_col=0)

# Flatten multi-index columns (keep only first level: Price, Close, High, etc.)
data.columns = [col[0] for col in data.columns]
data.index = pd.to_datetime(data.index)

# Keep only Close column
close_prices = data[["Close"]].dropna().reset_index()
close_prices.rename(columns={"index": "Datetime"}, inplace=True)

# ----------------------------
# Sliding-window XGBoost
# ----------------------------
window_size = 21   # 21-hour training window
results = []

for i in range(len(close_prices) - window_size - 1):
    # Features = last 21 closes
    X_train = close_prices["Close"].iloc[i:i+window_size].values.reshape(1, -1)
    # Target = next close
    y_train = close_prices["Close"].iloc[i+window_size]

    # Train XGBoost model
    model = XGBRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbosity=0
    )
    model.fit(X_train, [y_train])

    # Predict the following hour
    X_test = close_prices["Close"].iloc[i+1:i+1+window_size].values.reshape(1, -1)
    y_true = close_prices["Close"].iloc[i+window_size+1]
    y_pred = model.predict(X_test)[0]

    mae = mean_absolute_error([y_true], [y_pred])
    results.append({
        "Datetime": close_prices["Datetime"].iloc[i+window_size+1],
        "Actual": y_true,
        "Predicted": y_pred,
        "MAE": mae
    })

# ----------------------------
# Save results to CSV
# ----------------------------
results_df = pd.DataFrame(results)
output_file = "AAPL_hourly_XGB_predictions.csv"
results_df.to_csv(output_file, index=False)

print(f"✅ Results saved to {output_file}")
print(results_df.head())


✅ Results saved to AAPL_hourly_XGB_predictions.csv
                   Datetime      Actual   Predicted       MAE
0 2024-09-26 14:30:00+00:00  227.389694  226.899994  0.489700
1 2024-09-26 15:30:00+00:00  227.309998  227.389694  0.079697
2 2024-09-26 16:30:00+00:00  227.154999  227.309998  0.154999
3 2024-09-26 17:30:00+00:00  227.705002  227.154999  0.550003
4 2024-09-26 18:30:00+00:00  227.574997  227.705002  0.130005
