# Transformer Neural Network - DO NOT USE: not enough data to train it reliably

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split # ### <<< CHANGE: Not used, will do time-based split
from itertools import combinations
import copy

# `df` must already exist in the notebook session (the loader below is disabled).
# The bare reference raises NameError early if it does not.
df 
# Disabled loader, kept for reference: reads the hourly CSV and falls back to
# randomly generated OHLCV data when the file is missing.
# # --- Load Data ---
# try:
#     df = pd.read_csv(r'/dbfs/mnt/four18_s3/Other/1761568170.364943_year_hourly.csv')
# except FileNotFoundError:
#     print("File not found. Using dummy data for demonstration.")
#     # Create dummy data if file not found
#     dates = pd.date_range(start='2023-01-01', periods=8760, freq='H') # 1 year
#     data = {
#         'datetime_utc': dates,
#         'open': np.random.uniform(30000, 50000, 8760),
#         'high': lambda x: x['open'] + np.random.uniform(0, 1000, 8760),
#         'low': lambda x: x['open'] - np.random.uniform(0, 1000, 8760),
#         'close': lambda x: x['open'] + np.random.uniform(-500, 500, 8760),
#         'volume': np.random.uniform(10, 1000, 8760)
#     }
#     df = pd.DataFrame(data)
#     df['high'] = df.apply(lambda row: row['open'] + np.random.uniform(0, 1000), axis=1)
#     df['low'] = df.apply(lambda row: row['open'] - np.random.uniform(0, 1000), axis=1)
#     df['close'] = df.apply(lambda row: row['open'] + np.random.uniform(-500, 500), axis=1)


# Hold out the close of the last row of `df`: this is the value to predict.
# NOTE(review): "last row" is positional - assumes `df` is in chronological order; confirm.
true_close = df['close'].iloc[-1]
# Every row except the held-out last one is available for training and validation
historical_df = df.iloc[:-1].copy()

# --- Feature engineering ---
def create_transformer_features(df, sentiment_df=None):
    """Build the hourly model-input frame from raw OHLCV data.

    Args:
        df: DataFrame with 'open', 'high', 'low', 'close' and 'volume' columns.
            'datetime_utc' is used for ordering and for the calendar features;
            when it is absent a synthetic hourly range starting 2023-01-01 is
            used instead. The caller's frame is never modified.
        sentiment_df: Optional DataFrame with 'datetime_utc' and
            'weighted_sentiment' columns. It is reduced to one mean value per
            calendar date and joined onto the features by date; dates without
            sentiment get 0.

    Returns:
        Tuple ``(df_feat, feature_columns)``: the feature frame (rows with any
        missing value removed, index reset) and the list of its column names.
    """
    # Copy before touching anything so the synthetic timestamp column never
    # leaks into the caller's DataFrame.
    df = df.copy()
    if 'datetime_utc' not in df.columns:
        df['datetime_utc'] = pd.date_range(
            start='2023-01-01', periods=len(df), freq=pd.Timedelta(hours=1)
        )

    df = df.sort_values('datetime_utc').reset_index(drop=True)
    timestamps = pd.to_datetime(df['datetime_utc'])

    # Numeric OHLCV plus time-of-day / day-of-week features
    df_feat = df[['open', 'high', 'low', 'close', 'volume']].copy()
    df_feat['hour'] = timestamps.dt.hour
    df_feat['day_of_week'] = timestamps.dt.dayofweek

    # Other technical features
    df_feat['log_return'] = np.log(df_feat['close'] / df_feat['close'].shift(1))
    df_feat['range'] = df_feat['high'] - df_feat['low']

    # Drop incomplete rows (at least the first one, whose log_return is undefined)
    # and keep the timestamps of the surviving rows so that the sentiment join
    # below uses each row's own date rather than a positionally shifted one.
    complete = df_feat.notna().all(axis=1)
    df_feat = df_feat[complete].reset_index(drop=True)
    kept_timestamps = timestamps[complete].reset_index(drop=True)

    # Optional sentiment merge
    if sentiment_df is not None:
        sentiment_df = sentiment_df.copy()
        sentiment_df['date'] = pd.to_datetime(sentiment_df['datetime_utc']).dt.date
        # One value per date: a left merge against several rows for the same
        # date would duplicate feature rows and break the time ordering.
        daily_sentiment = sentiment_df.groupby('date', as_index=False)['weighted_sentiment'].mean()

        df_feat['date'] = kept_timestamps.dt.date
        df_feat = df_feat.merge(daily_sentiment, on='date', how='left')
        df_feat = df_feat.drop(columns=['date'])
        # Dates with no sentiment record are treated as neutral
        df_feat['weighted_sentiment'] = df_feat['weighted_sentiment'].fillna(0)

    feature_columns = df_feat.columns.tolist()
    return df_feat, feature_columns

# --- Dataset ---
### <<< CHANGE: Refactored Dataset to accept pre-fit scalers
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset: `window_size` scaled feature rows -> the following target row.

    Sample ``i`` is ``(features[i : i + window_size], target[i + window_size])``,
    so ``n`` input rows yield ``n - window_size`` samples (zero when the input
    is not longer than the window).

    Args:
        df_features: Feature table accepted by ``feature_scaler.transform``.
        df_target: Single-column target table accepted by ``target_scaler.transform``.
        window_size: Number of consecutive rows in each input window.
        feature_scaler: Already-fitted scaler exposing ``transform``.
        target_scaler: Already-fitted scaler exposing ``transform``.
    """

    def __init__(self, df_features, df_target, window_size, feature_scaler, target_scaler):
        self.window_size = window_size
        self.feature_scaler = feature_scaler
        self.target_scaler = target_scaler

        # Scale once up front; the scalers are fitted by the caller
        self.data_scaled = self.feature_scaler.transform(df_features)
        self.target_scaled = self.target_scaler.transform(df_target)

    def __len__(self):
        # window_size history rows + 1 target row per sample. Clamp at zero:
        # a negative value would make len() itself raise ValueError.
        return max(0, len(self.data_scaled) - self.window_size)

    def __getitem__(self, idx):
        n_samples = len(self)
        # Support Python-style negative indices and reject anything out of range;
        # an unchecked negative index would slice a truncated or empty window.
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError("TimeSeriesDataset index out of range")

        # Input window
        x = self.data_scaled[idx:idx + self.window_size]
        # Target: the row immediately after the window
        y = self.target_scaled[idx + self.window_size]

        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)


# --- Transformer model ---
class TimeSeriesTransformer(nn.Module):
    """Transformer encoder that maps a window of feature rows to one regression value.

    Each time step is projected to `d_model`, a learned positional embedding is
    added, the sum is layer-normalised and encoded, and the encoding of the
    last time step is projected to a single output.

    Args:
        num_features: Number of input features per time step.
        window_size: Size of the positional-embedding table, i.e. the longest
            sequence length the model can position-encode.
        d_model: Width of the internal representation.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each encoder layer's feed-forward part.
        dropout: Dropout probability inside the encoder layers.
    """

    def __init__(self, num_features, window_size, d_model=64, nhead=4, num_layers=2, dim_feedforward=128, dropout=0.1):
        super().__init__()
        self.d_model = d_model

        # Per-step projection of the raw features into model space
        self.input_fc = nn.Linear(num_features, d_model)
        # Learned positional embedding: one vector per position in the window
        self.pos_encoder = nn.Embedding(window_size, d_model)
        self.norm = nn.LayerNorm(d_model)

        layer_template = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer_template, num_layers=num_layers)

        # Regression head applied to the last time step
        self.output_fc = nn.Linear(d_model, 1)

    def forward(self, x):
        # x: [batch, seq_len, features]
        steps = x.size(1)
        # Position ids 0..seq_len-1 with a leading broadcast dimension: [1, seq_len]
        position_ids = torch.arange(steps, device=x.device)[None, :]

        # [batch, seq_len, d_model]: projected features plus positional embedding
        hidden = self.norm(self.input_fc(x) + self.pos_encoder(position_ids))
        encoded = self.transformer_encoder(hidden)

        # Only the final step's encoding feeds the head -> [batch, 1]
        return self.output_fc(encoded[:, -1, :])

# --- Training function ---
### <<< CHANGE: Added validation loop and best model saving
def train_transformer(train_df, val_df, feature_cols, target_col, window_size,
                      epochs=50, batch_size=64, lr=1e-3, device='cuda',d_model=64, num_layers=2):
    """Train a TimeSeriesTransformer and return the weights with the lowest validation loss.

    Scalers are fitted on the training split only and reused for validation.
    Training uses Adam with SmoothL1Loss; the learning rate is halved after 5
    epochs without validation improvement and training stops after 10.

    Args:
        train_df: Training rows in time order.
        val_df: Validation rows in time order.
        feature_cols: Names of the input feature columns.
        target_col: Name of the column to predict.
        window_size: Number of consecutive rows per input window.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size for both loaders.
        lr: Initial learning rate.
        device: Torch device the model and batches are moved to.
        d_model: Transformer model width.
        num_layers: Number of encoder layers.

    Returns:
        Tuple ``(model, feature_scaler, target_scaler)``.

    Raises:
        ValueError: If either split is too short to form a single window.
    """
    # Every sample needs window_size history rows plus one following target row.
    # Without this check a short split surfaces later as an opaque error
    # (negative dataset length or division by an empty loader).
    for split_name, frame in (("training", train_df), ("validation", val_df)):
        if len(frame) <= window_size:
            raise ValueError(
                f"{split_name} data has {len(frame)} rows but window_size={window_size} "
                f"needs at least {window_size + 1}"
            )

    # --- 1. Fit scalers ONLY on training data (no look-ahead into validation) ---
    feature_scaler = MinMaxScaler()
    target_scaler = MinMaxScaler()
    feature_scaler.fit(train_df[feature_cols])
    target_scaler.fit(train_df[[target_col]])

    # --- 2. Create Datasets and DataLoaders ---
    train_dataset = TimeSeriesDataset(
        df_features=train_df[feature_cols],
        df_target=train_df[[target_col]],
        window_size=window_size,
        feature_scaler=feature_scaler,
        target_scaler=target_scaler
    )
    val_dataset = TimeSeriesDataset(
        df_features=val_df[feature_cols],
        df_target=val_df[[target_col]],
        window_size=window_size,
        feature_scaler=feature_scaler,
        target_scaler=target_scaler
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # --- 3. Initialize Model, Loss, Optimizer ---
    model = TimeSeriesTransformer(num_features=len(feature_cols), window_size=window_size, d_model=d_model,num_layers=num_layers).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.SmoothL1Loss() #nn.MSELoss()
    # `verbose` is deprecated on PyTorch LR schedulers, so it is not passed;
    # learning-rate reductions are reported explicitly in the loop below.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

    # --- 4. Training & Validation Loop ---
    best_val_loss = float('inf')
    # Start from the initial weights so there is always a state to restore,
    # even with epochs=0 or when the validation loss is never finite.
    best_model_state = copy.deepcopy(model.state_dict())
    patience = 10
    counter = 0
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        train_loss = epoch_loss / len(train_loader)

        # --- Validation Loop ---
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                y_pred = model(x_batch)
                loss = criterion(y_pred, y_batch)
                val_loss += loss.item()

        val_loss = val_loss / len(val_loader)

        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(val_loss)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after < lr_before:
            print(f"  -> Learning rate reduced from {lr_before:.2e} to {lr_after:.2e}")

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = copy.deepcopy(model.state_dict())
            print(f"  -> New best model saved with Val Loss: {best_val_loss:.6f}")
            counter = 0
        else:
            counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

    # Load the best model state
    model.load_state_dict(best_model_state)

    # Return the *best* model and the *fitted* scalers
    return model, feature_scaler, target_scaler

# --- Prediction function ---
### <<< CHANGE: Function now just takes the final data and scalers
def predict_next_hour(historical_data_df, model, feature_scaler, target_scaler, feature_cols, window_size=48, device='cuda'):
    """Run the model on the most recent window and return the un-scaled prediction.

    Args:
        historical_data_df: DataFrame containing at least `feature_cols`.
        model: Trained model; called on a ``[1, seq, features]`` float32 tensor.
        feature_scaler: Fitted scaler used to transform the input window.
        target_scaler: Fitted scaler used to inverse-transform the model output.
        feature_cols: Names of the feature columns, in training order.
        window_size: Number of trailing rows fed to the model.
        device: Torch device the input tensor is moved to.

    Returns:
        The first element of the inverse-transformed model output.
    """
    model.eval()

    # Most recent `window_size` rows, scaled with the already-fitted scaler
    recent_rows = historical_data_df[feature_cols].iloc[-window_size:]
    scaled_window = feature_scaler.transform(recent_rows)

    # Leading batch dimension -> [1, seq, features]
    batch = torch.tensor(scaled_window, dtype=torch.float32)[None, ...].to(device)

    with torch.no_grad():
        scaled_output = model(batch)  # [1, 1]

    # Back to the original target scale
    restored = target_scaler.inverse_transform(scaled_output.cpu().numpy())
    return restored[0][0]

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import copy

# Assume 'df' is your fully loaded HOURLY DataFrame

### CHANGE ###
# New function to resample hourly data to daily and create features.
def create_daily_features(df):
    """Resample hourly data to daily bars, add a next-day target and daily features.

    Args:
        df: Hourly DataFrame with 'datetime_utc', 'open', 'high', 'low',
            'close' and 'volume' columns. 'weighted_sentiment' (averaged per
            day) and any further columns (last value of the day) are optional.

    Returns:
        Tuple ``(daily_df, feature_cols, target_col)``. ``daily_df`` has one row
        per day with 'datetime_utc' as a column; ``target_col`` ('target') holds
        the following day's close, so the final day - which has no following
        close - is not included. ``feature_cols`` lists every column except the
        target, 'merge_date' and 'datetime_utc'.
    """
    df = df.copy()
    df['datetime_utc'] = pd.to_datetime(df['datetime_utc'])
    df = df.set_index('datetime_utc').sort_index()

    # 1. Aggregation rules for resampling. Only columns that are actually
    #    present are aggregated, so plain OHLCV input (no sentiment column)
    #    does not fail with a KeyError.
    known_rules = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
        'weighted_sentiment': 'mean'  # Daily average sentiment
    }
    agg_dict = {col: rule for col, rule in known_rules.items() if col in df.columns}
    # Every other column keeps the last known value of the day
    for col in df.columns:
        agg_dict.setdefault(col, 'last')

    # 2. Resample the data to daily frequency
    daily_df = df.resample('D').agg(agg_dict)

    # 3. Fill days without observations from the previous day, then drop
    #    rows that are still completely empty
    daily_df = daily_df.ffill()
    daily_df = daily_df.dropna(how='all')

    # 4. Target variable: the next day's close price
    daily_df['target'] = daily_df['close'].shift(-1)

    # 5. The last row has no target. Copy so the column assignments below
    #    operate on an independent frame rather than a slice.
    daily_df = daily_df.iloc[:-1].copy()

    # 6. Engineer features on the DAILY data
    daily_df['log_return'] = np.log(daily_df['close'] / daily_df['close'].shift(1))
    daily_df['range'] = daily_df['high'] - daily_df['low']
    daily_df['day_of_week'] = daily_df.index.dayofweek
    daily_df['month'] = daily_df.index.month

    # 7. Remove rows with missing values (e.g. the first log_return) and
    #    bring the date back as a regular column
    daily_df = daily_df.dropna().reset_index()

    # 8. Feature list: everything except the target and the bookkeeping
    #    date columns ('merge_date' is optional and skipped when absent)
    target_col = 'target'
    excluded = {target_col, 'merge_date', 'datetime_utc'}
    feature_cols = [col for col in daily_df.columns if col not in excluded]

    return daily_df, feature_cols, target_col

### CHANGE ###
# The Dataset class is slightly adjusted for clarity with the new target column.
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset whose target sits on the last row of each window.

    Sample ``i`` is ``(features[i : i + window_size], target[i + window_size - 1])``.
    The target column is therefore expected to already hold the value to be
    predicted for that row (e.g. a next-day close). ``n`` rows yield
    ``n - window_size + 1`` samples.

    Args:
        df: DataFrame containing `feature_cols` and `target_col`.
        feature_cols: Names of the input feature columns.
        target_col: Name of the target column.
        window_size: Number of consecutive rows in each input window.
        feature_scaler: Already-fitted scaler exposing ``transform``.
        target_scaler: Already-fitted scaler exposing ``transform``.
    """

    def __init__(self, df, feature_cols, target_col, window_size, feature_scaler, target_scaler):
        self.window_size = window_size

        # Scale features and target
        feature_data_scaled = feature_scaler.transform(df[feature_cols])
        target_data_scaled = target_scaler.transform(df[[target_col]])

        # One array [features..., target] for easier indexing
        self.data = np.concatenate([feature_data_scaled, target_data_scaled], axis=1)
        self.n_features = feature_data_scaled.shape[1]

    def __len__(self):
        # The target is read from the window's own last row, so the final
        # window may end on the final data row: n - window_size + 1 samples.
        # Clamped at zero because len() rejects negative values.
        return max(0, len(self.data) - self.window_size + 1)

    def __getitem__(self, idx):
        n_samples = len(self)
        # Support Python-style negative indices and reject anything out of range;
        # an unchecked negative index would slice a truncated or empty window.
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError("TimeSeriesDataset index out of range")

        window_end = idx + self.window_size
        # Features are the first n_features columns
        x = self.data[idx:window_end, :self.n_features]
        # Target is the remaining column, taken from the last row of the window
        y = self.data[window_end - 1, self.n_features:]

        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# --- Transformer Model Class (Unchanged) ---
class TimeSeriesTransformer(nn.Module):
    """Encoder-only Transformer producing one regression output per input window.

    Pipeline: linear projection of each time step to `d_model`, plus a learned
    positional embedding, layer normalisation, a stack of encoder layers, and a
    linear head on the last time step.

    Args:
        num_features: Number of input features per time step.
        window_size: Number of positions in the positional-embedding table.
        d_model: Width of the internal representation.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each encoder layer's feed-forward part.
        dropout: Dropout probability inside the encoder layers.
    """

    def __init__(self, num_features, window_size, d_model=64, nhead=4, num_layers=2, dim_feedforward=128, dropout=0.1):
        super().__init__()
        layer_config = dict(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.input_fc = nn.Linear(num_features, d_model)
        self.pos_encoder = nn.Embedding(window_size, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_config), num_layers=num_layers
        )
        self.output_fc = nn.Linear(d_model, 1)
        self.d_model = d_model

    def forward(self, x):
        # x: [batch, seq_len, features]
        n_steps = x.size(1)
        # Position ids 0..seq_len-1, shaped [1, seq_len] so they broadcast over the batch
        step_ids = torch.arange(0, n_steps, device=x.device).unsqueeze(0)
        positional = self.pos_encoder(step_ids)

        projected = self.input_fc(x)
        normalised = self.norm(projected + positional)
        encoded = self.transformer_encoder(normalised)

        # Regress from the encoding of the final time step -> [batch, 1]
        final_step = encoded[:, -1, :]
        return self.output_fc(final_step)

### CHANGE ###
# train_transformer is updated to use the new Dataset constructor
def train_transformer(train_df, val_df, feature_cols, target_col, window_size,
                      epochs=50, batch_size=32, lr=1e-4, device='cuda', d_model=64, num_layers=2):
    """Train a TimeSeriesTransformer and return the weights with the lowest validation loss.

    Scalers are fitted on the training split only and reused for validation.
    Training uses Adam with SmoothL1Loss; the learning rate is halved after 5
    epochs without validation improvement and training stops after 10.

    Args:
        train_df: Training rows in time order.
        val_df: Validation rows in time order.
        feature_cols: Names of the input feature columns.
        target_col: Name of the column to predict.
        window_size: Number of consecutive rows per input window.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size for both loaders.
        lr: Initial learning rate.
        device: Torch device the model and batches are moved to.
        d_model: Transformer model width.
        num_layers: Number of encoder layers.

    Returns:
        Tuple ``(model, feature_scaler, target_scaler)``.

    Raises:
        ValueError: If either split yields no training windows.
    """
    # A split shorter than one window can never produce a sample; fail with a
    # clear message instead of an opaque error further down.
    for split_name, frame in (("training", train_df), ("validation", val_df)):
        if len(frame) < window_size:
            raise ValueError(
                f"{split_name} data has {len(frame)} rows, fewer than window_size={window_size}"
            )

    # Fit scalers ONLY on training data (no look-ahead into validation)
    feature_scaler = MinMaxScaler()
    target_scaler = MinMaxScaler()
    feature_scaler.fit(train_df[feature_cols])
    target_scaler.fit(train_df[[target_col]])

    train_dataset = TimeSeriesDataset(
        df=train_df, feature_cols=feature_cols, target_col=target_col,
        window_size=window_size, feature_scaler=feature_scaler, target_scaler=target_scaler)
    val_dataset = TimeSeriesDataset(
        df=val_df, feature_cols=feature_cols, target_col=target_col,
        window_size=window_size, feature_scaler=feature_scaler, target_scaler=target_scaler)

    # An empty loader would make the per-epoch loss average divide by zero
    for split_name, dataset in (("training", train_dataset), ("validation", val_dataset)):
        if len(dataset) == 0:
            raise ValueError(f"{split_name} data yields no samples for window_size={window_size}")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = TimeSeriesTransformer(num_features=len(feature_cols), window_size=window_size, d_model=d_model,num_layers=num_layers).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.SmoothL1Loss()
    # `verbose` is deprecated on PyTorch LR schedulers, so it is not passed;
    # learning-rate reductions are reported explicitly in the loop below.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

    best_val_loss = float('inf')
    # Start from the initial weights so there is always a state to restore,
    # even with epochs=0 or when the validation loss is never finite.
    best_model_state = copy.deepcopy(model.state_dict())
    patience = 10
    counter = 0
    for epoch in range(epochs):
        # --- Training pass: without these optimizer steps the weights would
        # never move away from their random initialisation ---
        model.train()
        epoch_loss = 0.0
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        train_loss = epoch_loss / len(train_loader)

        # --- Validation pass ---
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                y_pred = model(x_batch)
                loss = criterion(y_pred, y_batch)
                val_loss += loss.item()
        val_loss /= len(val_loader)

        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(val_loss)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after < lr_before:
            print(f"  -> Learning rate reduced from {lr_before:.2e} to {lr_after:.2e}")

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")

        # Keep a copy of the best weights seen so far; stop after `patience`
        # consecutive epochs without improvement
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = copy.deepcopy(model.state_dict())
            counter = 0
        else:
            counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

    model.load_state_dict(best_model_state)
    return model, feature_scaler, target_scaler

### CHANGE ###
# Renamed function for clarity
def predict_next_day(historical_data_df, model, feature_scaler, target_scaler, feature_cols, window_size, device='cuda'):
    """Predict from the latest `window_size` rows and return the value in original units.

    Args:
        historical_data_df: DataFrame containing at least `feature_cols`.
        model: Trained model; called on a ``[1, seq, features]`` float32 tensor.
        feature_scaler: Fitted scaler used to transform the input window.
        target_scaler: Fitted scaler used to inverse-transform the model output.
        feature_cols: Names of the feature columns, in training order.
        window_size: Number of trailing rows fed to the model.
        device: Torch device the input tensor is moved to.

    Returns:
        The first element of the inverse-transformed model output.
    """
    model.eval()

    # Trailing window of features, scaled with the already-fitted scaler
    window_frame = historical_data_df[feature_cols].iloc[-window_size:]
    window_tensor = torch.tensor(
        feature_scaler.transform(window_frame), dtype=torch.float32
    )
    # Add the batch dimension, then move to the target device
    model_input = window_tensor.unsqueeze(0).to(device)

    with torch.no_grad():
        raw_output = model(model_input)

    # Undo the target scaling and unwrap the single value
    unscaled = target_scaler.inverse_transform(raw_output.cpu().numpy())
    return unscaled[0][0]


### CHANGE ###
# --- MAIN SCRIPT EXECUTION FLOW (EXAMPLE FOR DAILY PREDICTION) ---
# Example end-to-end run of the daily pipeline: build daily features, split
# chronologically, train, and predict from the most recent window.
if __name__ == '__main__':
    # Assume 'df' is your initial HOURLY dataframe (it must already be defined)

    # 1. Create daily aggregated data and features
    daily_processed_df, feature_cols, target_col = create_daily_features(df)
    
    print(f"Target column: {target_col}")
    print(f"Number of daily features: {len(feature_cols)}")

    # 2. Split daily data into training and validation sets.
    # Chronological split (first 80% / last 20%), no shuffling.
    train_size = int(len(daily_processed_df) * 0.8)
    train_df = daily_processed_df.iloc[:train_size]
    val_df = daily_processed_df.iloc[train_size:]
    print(f"Daily training data shape: {train_df.shape}")
    print(f"Daily validation data shape: {val_df.shape}")

    # 3. Define hyperparameters. Window size now refers to DAYS.
    # Using 7 days of history to predict the next day.
    WINDOW_SIZE = 7
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    # 4. Run training (remaining hyperparameters use train_transformer's defaults)
    best_model, feature_scaler, target_scaler = train_transformer(
        train_df=train_df, val_df=val_df,
        feature_cols=feature_cols, target_col=target_col,
        window_size=WINDOW_SIZE, device=DEVICE
    )
    
    # 5. Make a prediction for the day after the last row of daily_processed_df.
    # NOTE(review): create_daily_features drops the final day (it has no
    # target), so this is the close of the last calendar day present in `df` -
    # confirm that this is the intended meaning of "next day".
    predicted_close = predict_next_day(
        historical_data_df=daily_processed_df, # Use the full daily history
        model=best_model,
        feature_scaler=feature_scaler, target_scaler=target_scaler,
        feature_cols=feature_cols,
        window_size=WINDOW_SIZE, device=DEVICE
    )

    print("\n--- Prediction ---")
    print(f"Predicted Close Price for the NEXT DAY: {predicted_close:.2f}")

# Prediction

In [None]:
# End-to-end daily run: build daily features, split chronologically, train,
# predict from the most recent window and compare with the held-out close.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {DEVICE}")

WINDOW_SIZE = 6  # Number of past DAYS fed to the model for each prediction

# 1. Feature Engineering (hourly -> daily). Also returns the name of the
#    target column, which holds the NEXT day's close.
features_df, feature_cols, target_col = create_daily_features(df)

# Train on the shifted next-day column. Using 'close' here would leak the
# answer: 'close' is one of the input features, and the dataset reads the
# target from the last row of each input window.
TARGET_COL = target_col

# 2. Split Data (Time-based)
# Use the first 80% for training, the last 20% for validation
split_idx = int(len(features_df) * 0.8)

train_df = features_df.iloc[:split_idx]
val_df = features_df.iloc[split_idx:]

print(f"Total historical samples: {len(features_df)}")
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")

# 3. Train Model
model, f_scaler, t_scaler = train_transformer(
    train_df=train_df,
    val_df=val_df,
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    window_size=WINDOW_SIZE,
    epochs=50,
    batch_size=64,
    device=DEVICE,
    d_model=64,
    num_layers=2
)

# 4. Predict the day that follows the last row of features_df.
# create_daily_features drops the final day (it has no target), so the
# predicted day is the last calendar day present in `df`.
predicted_price = predict_next_day(
    historical_data_df=features_df, # Use all available history
    model=model,
    feature_scaler=f_scaler,
    target_scaler=t_scaler,
    feature_cols=feature_cols,
    window_size=WINDOW_SIZE,
    device=DEVICE
)
# Close of the last hourly row = close of that final day.
# NOTE(review): positional lookup - assumes `df` is in chronological order; confirm.
true_close = df['close'].iloc[-1]
print("\n--- Final Prediction ---")
print(f"Predicted next day price: {predicted_price:.2f}")
print(f"    Actual next day price: {true_close:.2f}")
print(f"                   Error: {predicted_price - true_close:.2f}")