In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Each training partition lives at <str1><partition id><str2>; build the paths
# for partition ids 0 through 3.
str1 = '../input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id='
str2 = '/part-0.parquet'
file_paths = ["".join((str1, str(part), str2)) for part in range(4)]

In [None]:
# Read the last (up to) four listed partition files and stack them row-wise
# into a single training frame; the original per-file row index is kept.
dataframes = list(map(pd.read_parquet, file_paths[-4:]))
train = pd.concat(dataframes)

# Preview the first rows of the combined frame.
train.head()

In [None]:
class FinancialDataset(Dataset):
    """Map-style dataset yielding ``(features, target)`` float32 tensors from a DataFrame.

    Inputs are the ``feature_00`` .. ``feature_78`` columns (minus four dropped
    columns) plus ``responder_0`` .. ``responder_8`` except ``responder_6``;
    ``responder_6`` is the regression target. Missing input values are imputed
    column by column.

    Args:
        df: Frame holding every feature and responder column named above. The
            caller's frame is not modified.
        fill_values: Optional mapping (``dict`` or ``pandas.Series``) from column
            name to the value used to replace NaNs in that column. Pass the
            ``fill_values`` attribute of the training dataset when building a
            validation/inference dataset so both are imputed with the same
            statistics. Columns missing from the mapping, and the default
            ``None``, fall back to the column's own median; a NaN fill value
            (e.g. the median of an all-NaN column) becomes 0.

    Raises:
        KeyError: If ``df`` lacks one of the expected columns.
    """

    def __init__(self, df, fill_values=None):
        columns_to_drop = ['feature_21', 'feature_26', 'feature_27', 'feature_31']
        # drop() returns a new frame, so the assignments below never touch the
        # DataFrame owned by the caller.
        df = df.drop(columns=columns_to_drop)

        # NOTE(review): the other responder_* columns are used as model inputs;
        # confirm they are actually available at prediction time.
        self.feature_cols = ([f'feature_{i:02d}' for i in range(79) if f'feature_{i:02d}' not in columns_to_drop] +
                             ['responder_0', 'responder_1', 'responder_2', 'responder_3',
                              'responder_4', 'responder_5', 'responder_7', 'responder_8'])

        # Column name -> value used to replace NaNs; kept so another dataset can
        # reuse exactly the same imputation.
        self.fill_values = {}
        for col in self.feature_cols:
            column = df[col]

            fill = None if fill_values is None else fill_values.get(col)
            if fill is None:
                fill = column.median()
            if pd.isna(fill):
                # No usable statistic (e.g. the column is entirely NaN).
                fill = 0
            self.fill_values[col] = fill

            if column.isna().any():
                df[col] = column.fillna(fill)

        # Convert once to float32 so __getitem__ does not re-cast on every access.
        self.features = df[self.feature_cols].to_numpy(dtype=np.float32)
        self.targets = df['responder_6'].to_numpy(dtype=np.float32)

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for row ``idx``.

        ``features`` is a 1-D float32 tensor with one entry per feature column;
        ``target`` is a float32 tensor of shape ``(1,)``.
        """
        features = torch.tensor(self.features[idx], dtype=torch.float32)
        target = torch.tensor([self.targets[idx]], dtype=torch.float32)
        return features, target
        
class FinancialNN(torch.nn.Module):
    """Fully connected regressor: input batch-norm, three hidden stages, scalar output.

    Each hidden stage is Linear -> ReLU -> BatchNorm1d -> Dropout(0.3) with
    widths 512, 256 and 128; a final Linear layer maps to a single value.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        nn = torch.nn

        # Layers are created in forward order so their positions inside the
        # Sequential container follow the data flow.
        stack = [nn.BatchNorm1d(input_size)]
        width_in = input_size
        for width_out in (512, 256, 128):
            stack += [
                nn.Linear(width_in, width_out),
                nn.ReLU(),
                nn.BatchNorm1d(width_out),
                nn.Dropout(0.3),
            ]
            width_in = width_out
        stack.append(nn.Linear(width_in, 1))

        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        prediction = self.network(x)
        return prediction

In [None]:
def train_model(model, train_loader, val_loader, epochs=10, lr=0.0005,
                checkpoint_path='best_model.pth'):
    """Train ``model`` with MSE loss and Adam, validating after every epoch.

    Every epoch prints the training loss, validation loss and validation R2.
    The learning rate is halved when the validation loss has not improved for
    more than two epochs, gradients are clipped to a norm of 1.0, and the
    weights with the lowest validation loss so far are written to
    ``checkpoint_path``.

    Args:
        model: ``torch.nn.Module`` to train; it is moved to CUDA when available.
        train_loader: Iterable of ``(features, targets)`` batches used for training.
        val_loader: Iterable of ``(features, targets)`` batches used for validation.
        epochs: Number of passes over ``train_loader``.
        lr: Initial Adam learning rate.
        checkpoint_path: File that receives the best ``state_dict``.

    Returns:
        The model as it stands after the final epoch. The best-validation
        weights are only stored in ``checkpoint_path``; they are not reloaded.

    Raises:
        ZeroDivisionError: If either loader yields no samples.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    model = model.to(device)
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        train_loss_sum = 0.0
        train_count = 0
        for batch_features, batch_targets in train_loader:
            batch_features, batch_targets = batch_features.to(device), batch_targets.to(device)

            optimizer.zero_grad()
            outputs = model(batch_features)
            loss = criterion(outputs, batch_targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # Weight each batch mean by its size so a short final batch does
            # not skew the epoch average.
            batch_size = batch_targets.size(0)
            train_loss_sum += loss.item() * batch_size
            train_count += batch_size

        model.eval()
        val_loss_sum = 0.0
        val_count = 0
        predictions = []
        actuals = []

        with torch.no_grad():
            for batch_features, batch_targets in val_loader:
                batch_features, batch_targets = batch_features.to(device), batch_targets.to(device)
                outputs = model(batch_features)

                batch_size = batch_targets.size(0)
                val_loss_sum += criterion(outputs, batch_targets).item() * batch_size
                val_count += batch_size

                # Keep one array per batch (not one per row) and join them once.
                predictions.append(outputs.cpu().numpy())
                actuals.append(batch_targets.cpu().numpy())

        train_loss = train_loss_sum / train_count
        val_loss = val_loss_sum / val_count

        predictions = np.concatenate(predictions)
        actuals = np.concatenate(actuals)
        ss_res = np.sum((predictions - actuals) ** 2)
        ss_tot = np.sum((actuals - actuals.mean()) ** 2)
        # R2 is undefined when the validation targets have zero variance.
        r2 = 1 - ss_res / ss_tot if ss_tot > 0 else float('nan')

        print(f'Epoch {epoch+1}:')
        print(f'  Train Loss: {train_loss:.6f}')
        print(f'  Val Loss: {val_loss:.6f}')
        print(f'  R2 Score: {r2:.6f}')

        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)

    return model

In [None]:
# Seed torch so weight initialisation, dropout and batch shuffling repeat on
# a fresh Restart & Run All.
torch.manual_seed(42)

BATCH_SIZE = 1024

# NOTE(review): this is a random row-level split; if the rows are time-ordered,
# a chronological split may give a more honest validation score - confirm.
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)

train_dataset = FinancialDataset(train_df)

# Fill validation gaps with *training* medians (0 where a training column is
# entirely NaN) so validation data is preprocessed exactly like unseen data
# would be, instead of with statistics computed from the validation rows.
# Medians are taken one column at a time to keep peak memory low.
train_medians = pd.Series(
    {col: train_df[col].median() for col in train_dataset.feature_cols}
).fillna(0)
val_df_imputed = val_df.fillna(train_medians)
val_dataset = FinancialDataset(val_df_imputed)

input_size = len(train_dataset.feature_cols)
model = FinancialNN(input_size)

# BatchNorm1d raises in training mode on a batch holding a single sample, so
# drop the trailing batch only when it would contain exactly one row.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=(len(train_dataset) % BATCH_SIZE == 1),
)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

trained_model = train_model(model, train_loader, val_loader, epochs=15, lr=0.0005)