# American Trading Automatic Tensorizer (AT-AT)

This project is adapting the paper "MASTER: Market-Guided Stock Transformer for Stock Price Forecasting" into a suitable model for American stocks and stock formatting.  


# 1. Imports and Environment Setup

This section is where the imports for each library are like TensorFlow, PyTorch, and stock data retrieval libraries like “yfinance” for Fortune 200 data.

In [44]:
#################### IMPORTS ####################

# Imports for MASTER model reimplementation
import torch
from torch import nn
from torch.nn.modules.linear import Linear
from torch.nn.modules.dropout import Dropout
from torch.nn.modules.normalization import LayerNorm
import math

# Imports for Base Model files
import numpy as np
import pandas as pd
import copy
from torch.utils.data import DataLoader
from torch.utils.data import Sampler
import torch.optim as optim

# Imports for Fortune 200 Datasets
import yfinance as yf

# Imports for additional libraries for reimplementation
import matplotlib as plt
from sklearn.preprocessing import StandardScaler
import os
from sklearn.model_selection import train_test_split
import warnings
import re

# 2.  Dataset Initialization

This is where the data imports, processing, and repackaging for use in AI training and testing is done.


In [45]:
#################### DATASET CREATION ####################

# Shortlist of Fortune 500 company tickers
tickers = ["WMT", "AMZN", "AAPL", "UNH", "CVS", "XOM", "GOOGL", "MCK",
           "COR", "COST"]

# Preparing CSV file to be later referenced in main
universe = 'Fortune200'
save_path = f'/content/data/{universe}'
os.makedirs(save_path, exist_ok=True)
file_name = f'{save_path}/fortune200_data.csv'

batch_size = 10
ticker_batches = [tickers[i:i + batch_size] for i in range(0, len(tickers), batch_size)]
start_date = "2014-01-01"
end_date = "2024-01-01"
lookback_window = 8  # T=8

def process_batch(batch_tickers):
    # Download data for the current batch of tickers only
    data = yf.download(batch_tickers, start=start_date, end=end_date)

    # Select relevant columns: Adj Close and Volume
    data_adj_close = data['Adj Close']
    data_volume = data['Volume']
    data_features = pd.concat([data_adj_close, data_volume], axis=1, keys=['Adj Close', 'Volume'])
    data_features = data_features.ffill().bfill().dropna(how="all")

    # Stack the data, then filter for 'Adj Close' and 'Volume'
    adj_close_data = data_features['Adj Close'].stack().reset_index(name='Value')
    adj_close_data['Feature'] = 'Adj Close'

    volume_data = data_features['Volume'].stack().reset_index(name='Value')
    volume_data['Feature'] = 'Volume'

    # Concatenate the separate dataframes for Adj Close and Volume
    stacked_data = pd.concat([adj_close_data, volume_data], ignore_index=True)

    # Reorder columns to ensure consistent order
    stacked_data = stacked_data[['Date', 'Ticker', 'Feature', 'Value']]

    adj_close_df = stacked_data[stacked_data['Feature'] == 'Adj Close'].pivot(index=['Date', 'Ticker'], columns='Feature', values='Value').reset_index()
    volume_df = stacked_data[stacked_data['Feature'] == 'Volume'].pivot(index=['Date', 'Ticker'], columns='Feature', values='Value').reset_index()
    data_features = pd.merge(adj_close_df, volume_df, on=['Date', 'Ticker'], how='outer')

    # Fill missing data forward and backward to handle missing dates
    data_features = data_features.ffill().bfill().fillna(0)
    data_features['Date'] = pd.to_datetime(data_features['Date']).dt.tz_localize(None)

    # Initial save to CSV file to be added on to
    file_name = f'data/{universe}/fortune200_data.csv'
    data_features.to_csv(file_name, mode='a', header=not os.path.exists(file_name), index=False)

    # Prepare (N, T, F) data structure
    max_date = data_features['Date'].max()
    date_range = pd.date_range(end=max_date, periods=lookback_window, freq='B')
    stock_data = []

    # Separate data processing for each ticker
    for ticker, group_data in data_features.groupby('Ticker'):
        # Reset index to avoid type mismatch issues during reindexing
        group_data = group_data.set_index('Date').reindex(date_range, method='ffill').reset_index()
        ticker_data = group_data[['Adj Close', 'Volume']].values
        stock_data.append(ticker_data)

    # Convert stock data list to numpy array
    dataset = np.array(stock_data)

    # Normalize data per day across stocks
    normalized_data = []
    for daily_data in dataset:
        # Reshape daily_data to (N * T, F) for StandardScaler
        reshaped_data = daily_data.reshape(-1, daily_data.shape[-1])
        scaler = StandardScaler()

        # Fit and transform the data, replacing NaNs or infs post-scaling if any remain
        normalized_daily_data = scaler.fit_transform(reshaped_data)
        normalized_data.append(normalized_daily_data.reshape(daily_data.shape))

    return np.array(normalized_data)

# Process each batch and save to CSV
all_normalized_data = []
for batch_num, batch_tickers in enumerate(ticker_batches):
    # Process each batch of tickers
    batch_normalized_data = process_batch(batch_tickers)
    all_normalized_data.append(batch_normalized_data)

    # Flatten the 3D array to save as 2D in CSV
    flat_data = batch_normalized_data.reshape(-1, batch_normalized_data.shape[-1])

# Combine all batch data if needed
full_data = np.concatenate(all_normalized_data, axis=0)
print("All data processed and saved to CSV.")

#################### DATASET SPLITTING ####################

# Load the complete dataset
dtype_dict = {
    "Date": "str",
    "Ticker": "str"
}
data_path = f'/content/data/{universe}/fortune200_data.csv'
df = pd.read_csv(data_path, dtype=dtype_dict, low_memory=False)

# Ensure the output directory exists
output_dir = f'/content/data/{universe}'
os.makedirs(output_dir, exist_ok=True)

# Split data into train, validation, and test sets
# Adjust random_state to ensure reproducibility if needed
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Save each split to CSV
train_df.to_csv(f'{output_dir}/{universe}_dl_train.csv', index=False)
valid_df.to_csv(f'{output_dir}/{universe}_dl_valid.csv', index=False)
test_df.to_csv(f'{output_dir}/{universe}_dl_test.csv', index=False)

print("Data split into train, validation, and test sets and saved to CSVs.")

[*********************100%***********************]  10 of 10 completed


All data processed and saved to CSV.
Data split into train, validation, and test sets and saved to CSVs.


# 3. Model Reimplementation

For the purposes of project requirements, this assignment has been reimplemented by way of minor algorithmic additions for added functionality. More prominently, most of the code has been changed to achieve the same objective, but in a manner that's different enough to differentiate this model from just copying MASTER itself.



# 3a. Base Model

This section of the model includes the algorithms within base_model.py. These are classes integral to the functionality of the model, including batch samplers, prediction calculators, epoch trainers, and more.

In [46]:
#################### BASE MODEL ####################

def calc_ic(pred, label):
    dataframe = pd.DataFrame({'pred':pred, 'label':label})
    ic = dataframe['pred'].corr(dataframe['label'])
    r_ic = dataframe['pred'].corr(dataframe['label'], method='spearman')
    return ic, r_ic

# HAS BEEN REIMPLEMENTED TO WORK WITH CSV FILES
class DailyBatchSamplerRandom(Sampler):
    def __init__(self, data_source, batch_size=10, shuffle=False):
        self.data_source = data_source
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_samples = len(data_source)

    def __iter__(self):
        indices = np.arange(self.num_samples)
        if self.shuffle:
            np.random.shuffle(indices)

        # Yield batches based on batch size
        for start in range(0, self.num_samples, self.batch_size):
            yield indices[start:start + self.batch_size]

    def __len__(self):
        return (self.num_samples + self.batch_size - 1) // self.batch_size

class SequenceModel():
    def __init__(self, n_epochs, lr, GPU=None, seed=None, train_stop_loss_thred=None, save_path = 'model/', save_prefix= ''):
        self.n_epochs = n_epochs
        self.lr = lr
        self.device = torch.device(f"cuda:{GPU}" if torch.cuda.is_available() else "cpu")
        self.seed = seed
        self.train_stop_loss_thred = train_stop_loss_thred

        if self.seed is not None:
            np.random.seed(self.seed)
            torch.manual_seed(self.seed)
        self.fitted = False

        self.model = None
        self.train_optimizer = None

        self.save_path = save_path
        self.save_prefix = save_prefix


    def init_model(self):
        if self.model is None:
            raise ValueError("model has not been initialized")

        self.train_optimizer = optim.Adam(self.model.parameters(), self.lr)
        self.model.to(self.device)

    def loss_fn(self, pred, label):
        mask = ~torch.isnan(label)
        loss = (pred[mask]-label[mask])**2
        return torch.mean(loss)

    # HAS BEEN REIMPLEMENTED TO WORK WITH CSV FILES
    def train_epoch(self, data_loader):
        self.model.train()
        losses = []

        for data in data_loader:
            data = torch.tensor(data, dtype=torch.float32).to(self.device)

            feature = data[:, :, :-1]
            label = data[:, -1, -1]

            pred = self.model(feature)
            loss = self.loss_fn(pred, label)
            losses.append(loss.item())

            self.train_optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_value_(self.model.parameters(), 3.0)
            self.train_optimizer.step()

        return float(np.mean(losses))

    # HAS BEEN REIMPLEMENTED TO WORK WITH CSV FILES
    def test_epoch(self, data_loader):
        self.model.eval()
        losses = []

        for data in data_loader:
            # Convert each batch to a Torch tensor
            data = torch.tensor(data, dtype=torch.float32).to(self.device)

            # Separate features and labels
            feature = data[:, :, :-1]  # All columns except the last one are features
            label = data[:, -1, -1]    # Last column is the label

            # Forward pass
            with torch.no_grad():
                pred = self.model(feature)
            loss = self.loss_fn(pred, label)
            losses.append(loss.item())

        return float(np.mean(losses))

    # HAS BEEN REIMPLEMENTED TO WORK WITH CSV FILES
    def _init_data_loader(self, data, shuffle=True, drop_last=True):
        sampler = DailyBatchSamplerRandom(data, batch_size=32, shuffle=shuffle)
        return DataLoader(data.values, sampler=sampler, drop_last=drop_last)

    def load_param(self, param_path):
        self.model.load_state_dict(torch.load(param_path, map_location=self.device))
        self.fitted = True

    # HAS BEEN REIMPLEMENTED TO WORK WITH CSV FILES
    def fit(self, dl_train, dl_valid):
        # Initiate loaders without the 'Date' column for model training
        train_loader = self._init_data_loader(dl_train, shuffle=True, drop_last=True)
        valid_loader = self._init_data_loader(dl_valid, shuffle=False, drop_last=True)

        # Continue with the rest of the training process
        self.fitted = True
        best_param = None
        for step in range(self.n_epochs):
            train_loss = self.train_epoch(train_loader)
            val_loss = self.test_epoch(valid_loader)
            print("Epoch %d, train_loss %.6f, valid_loss %.6f " % (step, train_loss, val_loss))
            best_param = copy.deepcopy(self.model.state_dict())
            if train_loss <= self.train_stop_loss_thred:
                break

        # Ensure the directory exists before saving the file
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        # Save best model parameters to CSV
        df_params = pd.DataFrame.from_dict({k: v.cpu().numpy().flatten() for k, v in best_param.items()}, orient='index')
        df_params.to_csv(f'{self.save_path}{self.save_prefix}_master_params.csv')

    # HAS BEEN REIMPLEMENTED
    def predict(self, dl_test_tensor):
        if not self.fitted:
            raise ValueError("model is not fitted yet!")

        test_loader = DataLoader(dl_test_tensor, batch_size=32, shuffle=False, drop_last=False)  # Adjust batch size as needed

        preds = []
        labels = []

        self.model.eval()
        for data in test_loader:
            data = data.to(self.device)
            feature = data[:, :, :-1]
            label = data[:, -1, -1]
            with torch.no_grad():
                pred = self.model(feature).cpu().numpy().ravel()
            preds.append(pred)
            labels.append(label.cpu().numpy())

        predictions = np.concatenate(preds)
        labels = np.concatenate(labels)

        daily_ic, daily_ric = calc_ic(predictions, labels)

        metrics = {
            'IC': daily_ic,
            'ICIR': daily_ic / np.std(predictions) if np.std(predictions) != 0 else 0,
            'RIC': daily_ric,
            'RICIR': daily_ric / np.std(predictions) if np.std(predictions) != 0 else 0
        }

        return pd.Series(predictions, index=dl_test_data['Date'][:len(predictions)]), metrics

# 3b. MASTER Model

This section is for all of the classes within MASTER - including positional encoding, head attention, parent and classes for MASTER and combining subclasses, and more.

In [47]:
#################### MASTER MODEL ####################

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=3000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe)

    def forward(self, x):
        return x + self.pe[:x.shape[1], :]


class SAttention(nn.Module):
    def __init__(self, d_model, nhead, dropout):
        super().__init__()

        self.d_model = d_model
        self.nhead = nhead
        self.temperature = math.sqrt(self.d_model/nhead)

        self.qtrans = nn.Linear(d_model, d_model, bias=False)
        self.ktrans = nn.Linear(d_model, d_model, bias=False)
        self.vtrans = nn.Linear(d_model, d_model, bias=False)

        attn_dropout_layer = []
        for i in range(nhead):
            attn_dropout_layer.append(Dropout(p=dropout))
        self.attn_dropout = nn.ModuleList(attn_dropout_layer)

        # input LayerNorm
        self.norm1 = LayerNorm(d_model, eps=1e-5)

        # FFN layerNorm
        self.norm2 = LayerNorm(d_model, eps=1e-5)
        self.ffn = nn.Sequential(
            Linear(d_model, d_model),
            nn.ReLU(),
            Dropout(p=dropout),
            Linear(d_model, d_model),
            Dropout(p=dropout)
        )

    def forward(self, x):
        x = self.norm1(x)
        q = self.qtrans(x).transpose(0,1)
        k = self.ktrans(x).transpose(0,1)
        v = self.vtrans(x).transpose(0,1)

        dim = int(self.d_model/self.nhead)
        att_output = []
        for i in range(self.nhead):
            if i==self.nhead-1:
                qh = q[:, :, i * dim:]
                kh = k[:, :, i * dim:]
                vh = v[:, :, i * dim:]
            else:
                qh = q[:, :, i * dim:(i + 1) * dim]
                kh = k[:, :, i * dim:(i + 1) * dim]
                vh = v[:, :, i * dim:(i + 1) * dim]

            atten_ave_matrixh = torch.softmax(torch.matmul(qh, kh.transpose(1, 2)) / self.temperature, dim=-1)
            if self.attn_dropout:
                atten_ave_matrixh = self.attn_dropout[i](atten_ave_matrixh)
            att_output.append(torch.matmul(atten_ave_matrixh, vh).transpose(0, 1))
        att_output = torch.concat(att_output, dim=-1)

        # FFN
        xt = x + att_output
        xt = self.norm2(xt)
        att_output = xt + self.ffn(xt)

        return att_output


class TAttention(nn.Module):
    def __init__(self, d_model, nhead, dropout):
        super().__init__()
        self.d_model = d_model
        self.nhead = nhead
        self.qtrans = nn.Linear(d_model, d_model, bias=False)
        self.ktrans = nn.Linear(d_model, d_model, bias=False)
        self.vtrans = nn.Linear(d_model, d_model, bias=False)

        self.attn_dropout = []
        if dropout > 0:
            for i in range(nhead):
                self.attn_dropout.append(Dropout(p=dropout))
            self.attn_dropout = nn.ModuleList(self.attn_dropout)

        # input LayerNorm
        self.norm1 = LayerNorm(d_model, eps=1e-5)
        # FFN layerNorm
        self.norm2 = LayerNorm(d_model, eps=1e-5)
        # FFN
        self.ffn = nn.Sequential(
            Linear(d_model, d_model),
            nn.ReLU(),
            Dropout(p=dropout),
            Linear(d_model, d_model),
            Dropout(p=dropout)
        )

    def forward(self, x):
        x = self.norm1(x)
        q = self.qtrans(x)
        k = self.ktrans(x)
        v = self.vtrans(x)

        dim = int(self.d_model / self.nhead)
        att_output = []
        for i in range(self.nhead):
            if i==self.nhead-1:
                qh = q[:, :, i * dim:]
                kh = k[:, :, i * dim:]
                vh = v[:, :, i * dim:]
            else:
                qh = q[:, :, i * dim:(i + 1) * dim]
                kh = k[:, :, i * dim:(i + 1) * dim]
                vh = v[:, :, i * dim:(i + 1) * dim]
            atten_ave_matrixh = torch.softmax(torch.matmul(qh, kh.transpose(1, 2)), dim=-1)
            if self.attn_dropout:
                atten_ave_matrixh = self.attn_dropout[i](atten_ave_matrixh)
            att_output.append(torch.matmul(atten_ave_matrixh, vh))
        att_output = torch.concat(att_output, dim=-1)

        # FFN
        xt = x + att_output
        xt = self.norm2(xt)
        att_output = xt + self.ffn(xt)

        return att_output

# HAS BEEN REIMPLEMENTED
class Gate(nn.Module):
    def __init__(self, d_input, d_output, beta=1.0):
        super(Gate, self).__init__()
        self.trans = nn.Linear(d_input, d_output)  # Ensure input and output match
        self.d_output = d_output
        self.t = beta

    def forward(self, gate_input):
        # Retain gate_input shape as (batch_size, 1, d_input) before Linear
        output = self.trans(gate_input)
        output = torch.softmax(output / self.t, dim=-1)
        return output  # Returning as-is

class TemporalAttention(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.trans = nn.Linear(d_model, d_model, bias=False)

    def forward(self, z):
        h = self.trans(z) # [N, T, D]
        query = h[:, -1, :].unsqueeze(-1)
        lam = torch.matmul(h, query).squeeze(-1)  # [N, T, D] --> [N, T]
        lam = torch.softmax(lam, dim=1).unsqueeze(1)
        output = torch.matmul(lam, z).squeeze(1)  # [N, 1, T], [N, T, D] --> [N, 1, D]
        return output

# HAS BEEN REIMPLEMENTED
class MASTER(nn.Module):
    def __init__(self, d_feat=2, d_model=256, t_nhead=4, s_nhead=2,
                 T_dropout_rate=0.5, S_dropout_rate=0.5,
                 gate_input_start_index=0, gate_input_end_index=None, beta=None):
        super(MASTER, self).__init__()

        # Ensure they are set as instance variables
        self.gate_input_start_index = gate_input_start_index
        self.gate_input_end_index = gate_input_end_index

        # Set the start and end indices for gating input based on d_feat
        self.gate_input_start_index = min(self.gate_input_start_index, d_feat - 1)
        self.gate_input_end_index = min(self.gate_input_end_index or d_feat, d_feat)

        # Continue with the rest of the initialization as before
        self.d_gate_input = max(1, self.gate_input_end_index - self.gate_input_start_index)
        self.feature_gate = Gate(self.d_gate_input, 1, beta=beta)
        self.layers = nn.Sequential(
            nn.Linear(2, d_model),
            PositionalEncoding(d_model),
            TAttention(d_model=d_model, nhead=t_nhead, dropout=T_dropout_rate),
            SAttention(d_model=d_model, nhead=s_nhead, dropout=S_dropout_rate),
            TemporalAttention(d_model=d_model),
            nn.Linear(d_model, 1)
        )

    def forward(self, x):
        if x.shape[-1] == 1:
            x = x.expand(-1, -1, 2)

        src = x[:, :, :self.gate_input_start_index + 1]
        gate_input = x[:, :, self.gate_input_start_index:self.gate_input_end_index]

        # Apply Gate transformation
        gate_output = self.feature_gate(gate_input).expand(-1, src.size(1), -1)
        if src.shape == gate_output.shape:
            src = src * gate_output

        # Reshape `src` to match the Linear layer's expected input shape
        src = src.expand(-1, -1, 2)

        # Forward pass through layers
        for i, layer in enumerate(self.layers):
            src = layer(src)

        # Removing the previous reshape and gating logic which assumed time dimension
        output = self.layers(x)  # input directly
        output = output.squeeze(-1)

        return output

class MASTERModel(SequenceModel):
    def __init__(self, d_feat: int = 20, d_model: int = 64, t_nhead: int = 4, s_nhead: int = 2,
                 gate_input_start_index=0, gate_input_end_index=None, T_dropout_rate=0.5,
                 S_dropout_rate=0.5, beta=5.0, **kwargs):
        super(MASTERModel, self).__init__(**kwargs)
        self.d_model = d_model
        self.d_feat = d_feat
        self.gate_input_start_index = gate_input_start_index
        self.gate_input_end_index = gate_input_end_index
        self.T_dropout_rate = T_dropout_rate
        self.S_dropout_rate = S_dropout_rate
        self.t_nhead = t_nhead
        self.s_nhead = s_nhead
        self.beta = beta
        # print(f"Initializing MASTERModel with gate_input_start_index={self.gate_input_start_index} "f"and gate_input_end_index={self.gate_input_end_index}")
        self.init_model()

    def init_model(self):
        self.model = MASTER(
            d_feat=self.d_feat, d_model=self.d_model, t_nhead=self.t_nhead, s_nhead=self.s_nhead,
            T_dropout_rate=self.T_dropout_rate, S_dropout_rate=self.S_dropout_rate,
            gate_input_start_index=self.gate_input_start_index,
            gate_input_end_index=self.gate_input_end_index, beta=self.beta
        )
        super(MASTERModel, self).init_model()

# 3c. main

This is where data is split into training and testing sets. The data is then run through the model to make its predictions. This is where MASTER and AT-AT completes its purpose.

In [49]:
#################### TRAINING AND TESTING ####################

# Parameters where increase means better performance, but will cost more
d_model = 256
t_nhead = 4
s_nhead = 2

d_feat = 2
dropout = 0.5  # Can improve generalization if higher but not TOO high
beta = 5  # CHANGEABLE PARAMETER
n_epoch = 20  # Can improve performance if higher but not TOO high
lr = 8e-6  # Can improve performance if higher but not TOO high

# NOT CHANEGABLE PARAMETERS
gate_input_start_index = 0
gate_input_end_index = 2
GPU = 0
seed = 0
train_stop_loss_thred = 0.95

# Apply to each dataset
dl_train_data = pd.read_csv(f'data/{universe}/{universe}_dl_train.csv').drop(columns=['Date', 'Ticker'])
dl_valid_data = pd.read_csv(f'data/{universe}/{universe}_dl_valid.csv').drop(columns=['Date', 'Ticker'])
dl_test_data = pd.read_csv(f'data/{universe}/{universe}_dl_test.csv')

dl_test_data_numeric = dl_test_data.select_dtypes(include=[float, int])
lookback_window = 1
num_samples = len(dl_test_data_numeric)
dl_test_tensor = torch.tensor(dl_test_data_numeric.values.reshape(num_samples, lookback_window, 2), dtype=torch.float32)

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Set up the model
model = MASTERModel(
    d_feat=d_feat, d_model=d_model, t_nhead=t_nhead, s_nhead=s_nhead,
    T_dropout_rate=dropout, S_dropout_rate=dropout,
    beta=beta, gate_input_end_index=gate_input_end_index, gate_input_start_index=gate_input_start_index,
    n_epochs=n_epoch, lr=lr, GPU=GPU, seed=seed, train_stop_loss_thred=train_stop_loss_thred,
    save_path='model/', save_prefix=universe
)

model.fit(dl_train_data, dl_valid_data)
print("Model Trained.")

# Test the model and evaluate metrics
predictions, metrics = model.predict(dl_test_tensor)
predictions.index = dl_test_data['Date'][:len(predictions)]
print("Metrics:", metrics)

Epoch 0, train_loss 3191805467000533.500000, valid_loss 3325165369200109.000000 
Epoch 1, train_loss 3511633693031731.000000, valid_loss 3324876505295218.500000 
Epoch 2, train_loss 4296057642371952.000000, valid_loss 3324053214852749.500000 
Epoch 3, train_loss 4150655871411154.500000, valid_loss 3322334435092535.000000 
Epoch 4, train_loss 3985706384957897.500000, valid_loss 3319290353694039.000000 
Epoch 5, train_loss 4239257348826819.000000, valid_loss 3314431974321398.000000 
Epoch 6, train_loss 3633973725029052.000000, valid_loss 3307202402045674.500000 
Epoch 7, train_loss 3690435436266677.500000, valid_loss 3297023821341499.000000 
Epoch 8, train_loss 3626454526243967.000000, valid_loss 3283487967614004.500000 
Epoch 9, train_loss 3978324594420180.500000, valid_loss 3266829890873002.000000 
Epoch 10, train_loss 3337798932659207.000000, valid_loss 3248400948563139.000000 
Epoch 11, train_loss 3838048805386616.000000, valid_loss 3229380169394161.500000 
Epoch 12, train_loss 36128