In [3]:
### Dataframes, linear algebra and visualization:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


### Sklearn:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

### Gradient Boosted CARTs:
import lightgbm as lgbm

#PyTorch 
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.optim as optim
import torch.nn.init as init

#SHAP
import shap

print("--- Setup complete ---")



--- Setup complete ---


In [4]:
### PyTorch dataloading class:
class DataloaderTorch(Dataset):
    def __init__(self, dataframe):
        super().__init__()

        # Convert data to a NumPy array and assign to self.data
        #self.data = dataframe.to_numpy()
        self.data = dataframe
        
    # Implement __len__ to return the number of data samples
    def __len__(self):
        return self.data.shape[0]
    
    #Here we are assuming the target will be the last column:
    def __getitem__(self, idx):
        features = self.data[idx, :-1]
        # Assign last data column to label
        label = self.data[idx, -1]
        return features, label

In [5]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [6]:
### DNN architecture:

class DNN(nn.Module):
    def __init__(self):
        super(DNN, self).__init__()
        # Linear Layers
        self.fc1 = nn.Linear(8, 20)
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 5)
        self.fc4 = nn.Linear(5,1)
        
        #Add batch normalization layers (improve vanishing gradients):
        self.bn1 = nn.BatchNorm1d(20)
        self.bn2 = nn.BatchNorm1d(10)
        self.bn3 = nn.BatchNorm1d(5)
        
        #He/Kaiming initialization (avoid vaninshing gradients)
        init.kaiming_uniform_(self.fc1.weight)
        init.kaiming_uniform_(self.fc2.weight)
        init.kaiming_uniform_(self.fc3.weight)
        init.kaiming_uniform_(self.fc4.weight)
        
    def forward(self, x):
        # Pass x through linear layers adding ELU activations
        x = x.to(torch.float32) #Convert to torch-float32
        
        x = self.fc1(x)
        x = self.bn1(x)
        x = nn.functional.elu(x)

        x = self.fc2(x)
        x = self.bn2(x)
        x = nn.functional.elu(x)
        
        x = self.fc3(x)
        x = self.bn3(x)
        x = nn.functional.elu(x)

        x = self.fc4(x)
        return x

In [7]:
### Importing the training dataset
df = pd.read_csv("/folder/xyz.csv")

In [8]:
df.head()
#df_reduced = df.iloc[0:1000, :]
df_reduced = df.copy()

### Reduce memory usage:

In [9]:
# Function to reduce memory usage of a Pandas DataFrame
def reduce_mem_usage(df, verbose=0):
    """
    Iterate through all numeric columns of a dataframe and modify the data type
    to reduce memory usage.
    """
    
    # Calculate the initial memory usage of the DataFrame
    start_mem = df.memory_usage().sum() / 1024**2

    # Iterate through each column in the DataFrame
    for col in df.columns:
        col_type = df[col].dtype

        # Check if the column's data type is not 'object' (i.e., numeric)
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            
            # Check if the column's data type is an integer
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Check if the column's data type is a float
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)

    # Provide memory optimization information if 'verbose' is True
    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        logger.info(f"Decreased by {decrease:.2f}%")

    # Return the DataFrame with optimized memory usage
    return df

In [10]:
df_reduced = reduce_mem_usage(df_reduced)

## Exploratory Data Analysis 

In [11]:
print("Train Dataset has {} features".format(df.shape[1]-1))
print("Train Dataset has {} rows".format(df.shape[0]))

Train Dataset has 16 features
Train Dataset has 5237980 rows


In [12]:
### Number of different stock_ids:
print("{} different stocks".format(len(df['stock_id'].unique())))
print("{} different time_ids".format(len(df['time_id'].unique())))

200 different stocks
26455 different time_ids


In [13]:
df['time_id']

0              0
1              0
2              0
3              0
4              0
           ...  
5237975    26454
5237976    26454
5237977    26454
5237978    26454
5237979    26454
Name: time_id, Length: 5237980, dtype: int64

In [14]:
df.dtypes

stock_id                     int64
date_id                      int64
seconds_in_bucket            int64
imbalance_size             float64
imbalance_buy_sell_flag      int64
reference_price            float64
matched_size               float64
far_price                  float64
near_price                 float64
bid_price                  float64
bid_size                   float64
ask_price                  float64
ask_size                   float64
wap                        float64
target                     float64
time_id                      int64
row_id                      object
dtype: object

In [15]:
### Features with NaNs:
df.isna().sum().sort_values(ascending = False)

far_price                  2894342
near_price                 2857180
ask_price                      220
imbalance_size                 220
reference_price                220
matched_size                   220
wap                            220
bid_price                      220
target                          88
time_id                          0
ask_size                         0
stock_id                         0
bid_size                         0
date_id                          0
imbalance_buy_sell_flag          0
seconds_in_bucket                0
row_id                           0
dtype: int64

## Data preprocessing

In [17]:
### Pre-processing functions (1) - Dropping columns with high number of NaN and the ones that SHAP and LGBM feature importance
### detected as not important:
def drop_cols(dataframe):
    id_col = dataframe['row_id']
    df = dataframe.drop(['far_price', 'near_price', 'row_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag', 'bid_size', 'reference_price'], axis = 1)
    return id_col, df


### Drop the categorical row_id variable, and far_price,and near_price that have too many NaN values:
#id_col, df_processed = drop_cols(df)
id_col, df_processed = drop_cols(df_reduced)

df_processed.isna().sum().sort_values(ascending = False)

imbalance_size    220
matched_size      220
bid_price         220
ask_price         220
wap               220
target             88
stock_id            0
date_id             0
ask_size            0
time_id             0
dtype: int64

In [18]:
### Pre-processing functions (2) - Imputing mean values:
def imputer(df_processed):
    '''
    Function that receives a dataframe and returns the mean/mode value of each missing value in a column partitioned by stock_id,
    '''
    stock_list = list(df_processed['stock_id'].unique())

    for stock in stock_list:
        stock_df = df_processed.loc[df_processed['stock_id'] == stock]
        imp_mean_imbalance_size = SimpleImputer(missing_values=np.nan, strategy='mean') #Instantiate SimpleImputer with mean strategy
#         imp_mean_reference_price = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp_mean_matched_size = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp_mean_bid_price = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp_mean_ask_price = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp_mean_wap = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp_mean_target = SimpleImputer(missing_values=np.nan, strategy='mean')
#         imp_mean_seconds_in_bucket = SimpleImputer(missing_values=np.nan, strategy='mean')
#         imp_mean_imbalance_buy_sell_flag = SimpleImputer(missing_values=np.nan, strategy='mode')
#         imp_mean_bid_size = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp_mean_ask_size = SimpleImputer(missing_values=np.nan, strategy='mean')

        df_processed['imbalance_size'] = imp_mean_imbalance_size.fit_transform(np.array(df_processed['imbalance_size']).reshape(-1,1))
        #df_processed['reference_price'] = imp_mean_reference_price.fit_transform(np.array(df_processed['reference_price']).reshape(-1,1))
        df_processed['matched_size'] = imp_mean_matched_size.fit_transform(np.array(df_processed['matched_size']).reshape(-1,1))
        df_processed['bid_price'] = imp_mean_bid_price.fit_transform(np.array(df_processed['bid_price']).reshape(-1,1))
        df_processed['ask_price'] = imp_mean_ask_price.fit_transform(np.array(df_processed['ask_price']).reshape(-1,1))
        df_processed['wap'] = imp_mean_wap.fit_transform(np.array(df_processed['wap']).reshape(-1,1))
        df_processed['target'] = imp_mean_target.fit_transform(np.array(df_processed['target']).reshape(-1,1))
        #df_processed['seconds_in_bucket'] = imp_mean_seconds_in_bucket.fit_transform(np.array(df_processed['seconds_in_bucket']).reshape(-1,1))
        #df_processed['bid_size'] = imp_mean_bid_size.fit_transform(np.array(df_processed['bid_size']).reshape(-1,1))
        df_processed['ask_size'] = imp_mean_ask_size.fit_transform(np.array(df_processed['ask_size']).reshape(-1,1))


        return df_processed, imp_mean_imbalance_size, imp_mean_matched_size, imp_mean_bid_price, \
imp_mean_ask_price, imp_mean_wap, imp_mean_ask_size

df_processed, imp_mean_imbalance_size, imp_mean_matched_size, imp_mean_bid_price, imp_mean_ask_price, imp_mean_wap, imp_mean_ask_size = imputer(df_processed)

In [19]:
df_processed.isna().sum()

stock_id          0
date_id           0
imbalance_size    0
matched_size      0
bid_price         0
ask_price         0
ask_size          0
wap               0
target            0
time_id           0
dtype: int64

In [20]:
### Function that applies the imputer to a dataframe (to be used in the testdataset):
def apply_imputer(df_processed):
    df_processed['imbalance_size'] = imp_mean_imbalance_size.transform(np.array(df_processed['imbalance_size']).reshape(-1,1))
    #df_processed['reference_price'] = imp_mean_reference_price.transform(np.array(df_processed['reference_price']).reshape(-1,1))
    df_processed['matched_size'] = imp_mean_matched_size.transform(np.array(df_processed['matched_size']).reshape(-1,1))
    df_processed['bid_price'] = imp_mean_bid_price.transform(np.array(df_processed['bid_price']).reshape(-1,1))
    df_processed['ask_price'] = imp_mean_ask_price.transform(np.array(df_processed['ask_price']).reshape(-1,1))
    df_processed['wap'] = imp_mean_wap.transform(np.array(df_processed['wap']).reshape(-1,1))
    #df_processed['seconds_in_bucket'] = imp_mean_seconds_in_bucket.transform(np.array(df_processed['seconds_in_bucket']).reshape(-1,1))
    #df_processed['bid_size'] = imp_mean_bid_size.transform(np.array(df_processed['bid_size']).reshape(-1,1))
    df_processed['ask_size'] = imp_mean_ask_size.transform(np.array(df_processed['ask_size']).reshape(-1,1))
    
    return df_processed

In [1]:
### Test dataset:
test_df = pd.read_csv("/test/testset.csv")
test_df_targets = pd.read_csv("/test/testtargets.csv")

test_df

### LGBM model

In [23]:
### LGBM:

#First just performing a simple train_test_split on the training dataset:

y = df_processed['target']
X = df_processed.drop(['target'], axis = 1)

#Final test dataset doesn't have time_id:
X = X.drop(['time_id'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.7, random_state = 21)
X_train = X.iloc[0:5000000,:]
X_test = X.iloc[5000000:,:]
y_train = y[0:5000000]
y_test = y[5000000:]

In [24]:
# defining parameters for Light GBM
# params = {
#     'task': 'train', 
#     'boosting': 'gbdt',
#     'objective': 'regression',
#     'num_leaves': 20,
#     'n_estimators': 1000,
#     'learning_rate': 0.05,
#     'metric': {'l2','l1'},
#     'verbose': -1
# }

params = {
    "objective": "mae",
    "n_estimators": 1000,
    "num_leaves": 256,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "learning_rate": 0.001,
    "n_jobs": 4,
    "device": "gpu",
    "verbosity": -1,
    "importance_type": "gain",
}

In [25]:
# Loading data
lgbm_train = lgbm.Dataset(X_train, y_train)
lgbm_eval = lgbm.Dataset(X_test, y_test, reference=lgbm_train)

In [26]:
# fitting the model
clf = lgbm.train(params,
                 train_set=lgbm_train,
                 valid_sets=lgbm_eval,
                 early_stopping_rounds=30)

Found `n_estimators` in params. Will use it instead of argument
'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


[1]	valid_0's l1: 5.80703
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l1: 5.80648
[3]	valid_0's l1: 5.80604
[4]	valid_0's l1: 5.8055
[5]	valid_0's l1: 5.8048
[6]	valid_0's l1: 5.80426
[7]	valid_0's l1: 5.80387
[8]	valid_0's l1: 5.80356
[9]	valid_0's l1: 5.80304
[10]	valid_0's l1: 5.80274
[11]	valid_0's l1: 5.80207
[12]	valid_0's l1: 5.80158
[13]	valid_0's l1: 5.8011
[14]	valid_0's l1: 5.80075
[15]	valid_0's l1: 5.80028
[16]	valid_0's l1: 5.79966
[17]	valid_0's l1: 5.79906
[18]	valid_0's l1: 5.79873
[19]	valid_0's l1: 5.79812
[20]	valid_0's l1: 5.79783
[21]	valid_0's l1: 5.7974
[22]	valid_0's l1: 5.79697
[23]	valid_0's l1: 5.79666
[24]	valid_0's l1: 5.79607
[25]	valid_0's l1: 5.79584
[26]	valid_0's l1: 5.79527
[27]	valid_0's l1: 5.79483
[28]	valid_0's l1: 5.79453
[29]	valid_0's l1: 5.79401
[30]	valid_0's l1: 5.79381
[31]	valid_0's l1: 5.79342
[32]	valid_0's l1: 5.79322
[33]	valid_0's l1: 5.79285
[34]	valid_0's l1: 5.79248
[35]	valid_0's l1: 5.7921
[36]	val

In [27]:
y_pred = clf.predict(X_test)
mae = mean_absolute_error(y_pred, y_test)
print("Mean Absolute Error: {}".format(mae))

Mean Absolute Error: 5.756058880770116


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


In [3]:
#Feature importance:
lgbm.plot_importance(clf, height=.5)

### Now let's test our baseline model in the sample test dataset:

In [2]:
### Drop far_price, and near_price as we did in the traning dataset:

test_id_col, test_df_processed = drop_cols(test_df)
test_df_processed

In [32]:
test_df_processed = test_df_processed.drop(['time_id'], axis = 1)
test_df_processed = test_df_processed.drop(['currently_scored'], axis = 1)

In [34]:
### Use our trained model to make predictions:
y_sample_test = clf.predict(test_df_processed)
y_sample_targets = test_df_targets['revealed_target']
y_sample_targets = y_sample_targets.dropna()

mae_test = mean_absolute_error(y_sample_test, y_sample_targets)
print("MAE in the test Dataset: {}".format(mae_test))

MAE in the test Dataset: 5.439314986918139


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


## Deep Learning PyTorch Model:

In [35]:
#time_id not used in the model training
df_processed_torch = df_processed.drop(['time_id'], axis = 1)

In [36]:
## Scale the dataset
scaler = StandardScaler()
df_targets = df_processed_torch.iloc[:,-1]
df_processed_scaled = scaler.fit_transform(df_processed_torch.iloc[:, :-1])
df_processed_scaled.shape

df_scaled_with_targets = pd.concat([pd.DataFrame(df_processed_scaled), df_targets], axis = 1)
df_scaled_with_targets = df_scaled_with_targets.to_numpy()

is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


In [37]:
### Split into the train and test (validation) sets:

df_train = df_scaled_with_targets[0:5000000, :]
df_train

df_test = df_scaled_with_targets[5000000:,:]
df_test

array([[-1.3700737e+00,  1.5699632e+00, -2.5155580e-01, ...,
        -1.0170183e-01, -3.9853221e-03, -7.3897839e+00],
       [-1.3527941e+00,  1.5699632e+00, -1.7984700e-01, ...,
        -2.9960307e-01, -6.8548304e-01,  7.8904629e+00],
       [-1.3355144e+00,  1.5699632e+00,  1.7912329e+00, ...,
        -2.1087390e-01, -2.3462774e-01, -1.1998415e+00],
       ...,
       [ 1.6884135e+00,  1.7215528e+00, -2.7858451e-01, ...,
         9.7763687e-01, -1.6796988e+00,  1.1694431e+00],
       [ 1.7056930e+00,  1.7215528e+00, -2.2979701e-01, ...,
         4.7645264e+00, -3.9398196e-01, -1.5401840e+00],
       [ 1.7229726e+00,  1.7215528e+00, -1.8673745e-01, ...,
         1.9063126e+00,  9.1376334e-01, -6.5302849e+00]], dtype=float32)

In [38]:
# Create an instance of the dataloader
dataset_train = DataloaderTorch(df_train)

# Create a DataLoader based on dataset_train
dataloader_train = DataLoader(
    dataset_train,
    batch_size=2048,
    shuffle=False,
)

# Create an instance of the dataloader
dataset_test = DataloaderTorch(df_test)

# Create a DataLoader based on dataset_train
dataloader_test = DataLoader(
    dataset_test,
    batch_size=2048,
    shuffle=False,
)

In [39]:
### Train and Test loop funtions:

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X.to(device))
        #y = y.reshape(-1, 1)
        loss = loss_fn(pred, y.to(device))

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            
def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X.to(device))
            test_loss += loss_fn(pred, y.to(device)).item()
            #correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [40]:
dnn = DNN().to(device)
print(dnn)

DNN(
  (fc1): Linear(in_features=8, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=5, bias=True)
  (fc4): Linear(in_features=5, out_features=1, bias=True)
  (bn1): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [41]:
# Define the Adam optimizer
optimizer = optim.Adam(dnn.parameters(), lr=0.001)

# Initialize the loss function (Mean Absolute Error)
loss_fn = nn.L1Loss()

epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(dataloader_train, dnn, loss_fn, optimizer)
    test_loop(dataloader_test, dnn, loss_fn)
print("Done!")

Epoch 1
-------------------------------


Using a target size (torch.Size([2048])) that is different to the input size (torch.Size([2048, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.


loss: 6.262473  [ 2048/5000000]
loss: 5.874525  [206848/5000000]
loss: 4.005534  [411648/5000000]
loss: 4.466842  [616448/5000000]
loss: 5.301092  [821248/5000000]
loss: 13.438891  [1026048/5000000]
loss: 6.625601  [1230848/5000000]
loss: 7.118485  [1435648/5000000]
loss: 6.019901  [1640448/5000000]
loss: 6.847053  [1845248/5000000]
loss: 10.098598  [2050048/5000000]
loss: 6.643748  [2254848/5000000]
loss: 5.960042  [2459648/5000000]
loss: 6.004039  [2664448/5000000]
loss: 6.497533  [2869248/5000000]
loss: 5.621437  [3074048/5000000]
loss: 5.153612  [3278848/5000000]
loss: 8.215820  [3483648/5000000]
loss: 6.032023  [3688448/5000000]
loss: 5.349797  [3893248/5000000]
loss: 7.333556  [4098048/5000000]
loss: 4.903646  [4302848/5000000]
loss: 7.580214  [4507648/5000000]
loss: 5.280579  [4712448/5000000]
loss: 4.945852  [4917248/5000000]


Using a target size (torch.Size([832])) that is different to the input size (torch.Size([832, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
Using a target size (torch.Size([412])) that is different to the input size (torch.Size([412, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.


Test Error: 
 Accuracy: 0.0%, Avg loss: 5.797584 

Epoch 2
-------------------------------
loss: 6.129678  [ 2048/5000000]
loss: 5.884496  [206848/5000000]
loss: 4.006896  [411648/5000000]
loss: 4.465058  [616448/5000000]
loss: 5.300643  [821248/5000000]
loss: 13.438490  [1026048/5000000]
loss: 6.624300  [1230848/5000000]
loss: 7.118714  [1435648/5000000]
loss: 6.019745  [1640448/5000000]
loss: 6.847759  [1845248/5000000]
loss: 10.098497  [2050048/5000000]
loss: 6.643152  [2254848/5000000]
loss: 5.959695  [2459648/5000000]
loss: 6.003633  [2664448/5000000]
loss: 6.497471  [2869248/5000000]
loss: 5.620371  [3074048/5000000]
loss: 5.153567  [3278848/5000000]
loss: 8.215700  [3483648/5000000]
loss: 6.032026  [3688448/5000000]
loss: 5.349725  [3893248/5000000]
loss: 7.333776  [4098048/5000000]
loss: 4.903453  [4302848/5000000]
loss: 7.580251  [4507648/5000000]
loss: 5.280387  [4712448/5000000]
loss: 4.945765  [4917248/5000000]
Test Error: 
 Accuracy: 0.0%, Avg loss: 5.797357 

Epoch 3
----

### Evaluate the DNN model in the test dataset

In [42]:
### Test dataset targets

y_sample_targets = test_df_targets['revealed_target']
y_sample_targets = y_sample_targets.dropna()

In [43]:
# Convert to 2D PyTorch tensors
X_final_test_torch = torch.tensor(scaler.transform(test_df_processed), dtype=torch.float32)
X_final_test_torch = X_final_test_torch.to(device)

y_sample_targets = test_df_targets['revealed_target']
y_sample_targets = y_sample_targets.dropna()
y_final_test_torch = torch.tensor(y_sample_targets.to_numpy(), dtype=torch.float32)
y_final_test_torch = y_final_test_torch.to(device)

is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


In [44]:
X_final_test_torch.shape

torch.Size([33000, 8])

In [45]:
y_final_test_pred = dnn(X_final_test_torch)

In [46]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [47]:
final_loss = loss_fn(y_final_test_pred.cpu(), y_final_test_torch.cpu())

Using a target size (torch.Size([33000])) that is different to the input size (torch.Size([33000, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.


In [48]:
final_loss

tensor(5.3912, grad_fn=<MeanBackward0>)