In [7]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader

In [9]:
# Glob pattern matching every parquet partition file under the training set directory.
datasets_path = "./dataset/train.parquet/partitions/*.parquet" 
# Build a lazy frame over the matching files; the cells below run their queries
# against it by calling `.collect()`.
df_lazy = pl.scan_parquet(datasets_path)

In [5]:
# Exploratory summary of the target column `responder_6`:
#   1. basic distribution statistics,
#   2. Pearson correlation of every feature with the target (top 10 by magnitude),
#   3. null counts for every column.

# Candidate predictor columns: feature_00 .. feature_78.
feature_names = [f"feature_{i:02d}" for i in range(79)]

print("Basic statistics for responder_6:")
target_expr = pl.col("responder_6")
stats = df_lazy.select([
    target_expr.mean().alias("mean"),
    target_expr.std().alias("std"),
    target_expr.min().alias("min"),
    target_expr.max().alias("max"),
    target_expr.quantile(0.25).alias("25%"),
    target_expr.quantile(0.50).alias("50%"),
    target_expr.quantile(0.75).alias("75%"),
]).collect()
print(stats)

# Correlation of each feature with the target. One query per feature is kept
# on purpose: every query references just two columns.
feature_cors = []
for feature in feature_names:
    cor = df_lazy.select(
        pl.corr("responder_6", feature).alias("correlation")
    ).collect().item()
    # Normalise an undefined correlation (None) to NaN so that the ranking and
    # the `:.4f` formatting below never hit a TypeError.
    feature_cors.append((feature, float("nan") if cor is None else cor))

# Rank by absolute correlation, strongest first. NaN compares False against
# everything, which would leave the sort order arbitrary, so undefined
# correlations are explicitly ranked below every real value.
feature_cors.sort(
    key=lambda pair: -1.0 if np.isnan(pair[1]) else abs(pair[1]),
    reverse=True,
)
print("\nTop 10 correlated features with responder_6:")
for feature, cor in feature_cors[:10]:
    print(f"{feature}: {cor:.4f}")

# Null count of every column, computed in a single query.
print("\nNull counts:")
null_counts = df_lazy.select(pl.all().null_count()).collect()
print(null_counts)

Basic statistics for responder_6:
shape: (1, 7)
┌───────────┬──────────┬──────┬─────┬───────────┬───────────┬──────────┐
│ mean      ┆ std      ┆ min  ┆ max ┆ 25%       ┆ 50%       ┆ 75%      │
│ ---       ┆ ---      ┆ ---  ┆ --- ┆ ---       ┆ ---       ┆ ---      │
│ f32       ┆ f32      ┆ f32  ┆ f32 ┆ f32       ┆ f32       ┆ f32      │
╞═══════════╪══════════╪══════╪═════╪═══════════╪═══════════╪══════════╡
│ -0.002141 ┆ 0.889852 ┆ -5.0 ┆ 5.0 ┆ -0.382761 ┆ -0.025566 ┆ 0.334321 │
└───────────┴──────────┴──────┴─────┴───────────┴───────────┴──────────┘

Top 10 correlated features with responder_6:
feature_06: -0.0467
feature_04: -0.0316
feature_07: -0.0302
feature_36: -0.0231
feature_60: 0.0189
feature_45: -0.0169
feature_56: -0.0165
feature_05: -0.0163
feature_51: 0.0152
feature_19: -0.0140

Null counts:
shape: (1, 92)
┌─────────┬─────────┬───────────┬────────┬───┬─────────────┬─────────────┬─────────────┬─────────────┐
│ date_id ┆ time_id ┆ symbol_id ┆ weight ┆ … ┆ responder_5 ┆ resp

In [11]:
class FinancialDataset(Dataset):
    """Map-style dataset that serves one (features, target) pair per row of a parquet scan.

    Args:
        parquet_files: Path or glob passed straight to ``pl.scan_parquet``.
        feature_cols: Columns used as model inputs. Defaults to
            ``feature_00`` .. ``feature_78``.
        target_col: Column used as the regression target. Defaults to
            ``'responder_6'``.

    NOTE(review): every ``__getitem__`` call builds and collects its own lazy
    query (``slice(idx, 1).collect()``). That is likely to dominate training
    time on a large scan -- consider materialising chunks or batches instead.
    """

    def __init__(self, parquet_files, feature_cols=None, target_col='responder_6'):
        self.lazy_df = pl.scan_parquet(parquet_files)
        if feature_cols is None:
            self.feature_cols = [f'feature_{i:02d}' for i in range(79)]
        else:
            self.feature_cols = feature_cols
        self.target_col = target_col

        # Row count of the scan. `pl.len()` is the replacement for `pl.count()`,
        # which raises a DeprecationWarning when used to count rows.
        self.length = self.lazy_df.select(pl.len()).collect().item()

    def __len__(self):
        """Return the number of rows counted when the dataset was created."""
        return self.length

    def __getitem__(self, idx):
        """Return ``(features, target)`` float32 tensors for row ``idx``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
                Without this, an out-of-range slice yields an empty frame
                (and empty tensors), and plain ``for sample in dataset``
                iteration never terminates.
        """
        position = int(idx)
        if position < 0:
            position += self.length
        if not 0 <= position < self.length:
            raise IndexError(
                f'index {idx} is out of range for dataset of length {self.length}'
            )

        row = self.lazy_df.slice(position, 1).collect()

        # A single row flattens to shape (n_features,) and (1,) respectively.
        features = row.select(self.feature_cols).to_numpy().flatten()
        target = row.select(self.target_col).to_numpy().flatten()

        return (
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(target, dtype=torch.float32),
        )

class FinancialNN(torch.nn.Module):
    """Feed-forward regressor: input_size -> 256 -> 128 -> 64 -> 1.

    Each hidden layer is followed by a ReLU; the first two hidden layers are
    additionally followed by Dropout(0.2). The output layer is a plain Linear
    producing one value per input row.
    """

    def __init__(self, input_size=79):
        super().__init__()
        nn = torch.nn

        widths = (input_size, 256, 128, 64)
        # Dropout probability after each hidden block (None = no dropout).
        dropout_after = (0.2, 0.2, None)

        stack = []
        for fan_in, fan_out, drop_p in zip(widths, widths[1:], dropout_after):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
            if drop_p is not None:
                stack.append(nn.Dropout(drop_p))
        # Final projection down to a single regression output.
        stack.append(nn.Linear(widths[-1], 1))

        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        prediction = self.network(x)
        return prediction

def train_model(model, train_loader, val_loader, epochs=10, lr=0.001):
    """Train ``model`` with Adam on MSE loss and print per-epoch losses.

    Args:
        model: ``torch.nn.Module`` to optimise; it is moved to CUDA when
            available, otherwise to CPU.
        train_loader: Iterable of ``(features, targets)`` batches used for
            the optimisation steps.
        val_loader: Iterable of ``(features, targets)`` batches evaluated
            without gradients after each epoch.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.

    Returns:
        The model (on the selected device), left in eval mode once at least
        one epoch has run.

    The printed losses are the mean of the per-batch losses. Batches are
    counted while iterating rather than via ``len(loader)``, so an empty
    loader reports ``nan`` instead of raising ``ZeroDivisionError`` and
    loaders without ``__len__`` are supported.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    def _mean_loss(total, n_batches):
        # Mean of per-batch losses; nan when the loader produced no batches.
        return total / n_batches if n_batches else float('nan')

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss, train_batches = 0.0, 0
        for batch_features, batch_targets in train_loader:
            batch_features, batch_targets = batch_features.to(device), batch_targets.to(device)

            optimizer.zero_grad()
            outputs = model(batch_features)
            loss = criterion(outputs, batch_targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_batches += 1

        # ---- validation pass (no gradients, dropout disabled) ----
        model.eval()
        val_loss, val_batches = 0.0, 0
        with torch.no_grad():
            for batch_features, batch_targets in val_loader:
                batch_features, batch_targets = batch_features.to(device), batch_targets.to(device)
                outputs = model(batch_features)
                val_loss += criterion(outputs, batch_targets).item()
                val_batches += 1

        print(
            f'Epoch {epoch+1}, '
            f'Train Loss: {_mean_loss(train_loss, train_batches):.4f}, '
            f'Val Loss: {_mean_loss(val_loss, val_batches):.4f}'
        )

    return model

In [None]:
# End-to-end training run: build the dataset, split 80/20, train the network.

# Fixed seed so the split, the weight initialisation and the shuffling order
# are the same on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

dataset = FinancialDataset(datasets_path)

# 80% of the rows for training, the remainder for validation.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# NOTE(review): this is a uniformly random row split. The data has
# date_id/time_id columns, so a chronological split may be more appropriate
# to avoid look-ahead leakage -- confirm.
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)

train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1024)

model = FinancialNN()
trained_model = train_model(model, train_loader, val_loader)

  self.length = self.lazy_df.select(pl.count()).collect().item()
