In [1]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch_butterfly import Butterfly
from DataLoader import load_data
import matplotlib.pyplot as plt

In [2]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

In [3]:
def limited_test(model, loader, max_iterations=float("inf"), device="cuda" if torch.cuda.is_available() else "cpu"):
    total_output, total_targets = torch.tensor([], device="cpu"), torch.tensor([], device="cpu")
    
    for iteration, data in enumerate(loader):
        sample_dataseqs, sample_targets = data
        sample_output = model(sample_dataseqs)
        
        total_output = torch.cat((total_output, sample_output.cpu()))
        total_targets = torch.cat((total_targets, sample_targets.cpu()))
    
        if iteration >= max_iterations:
            break
    
    return total_output, total_targets

# Data

In [4]:
train_data, test_data = load_data("spx.csv", sequences_length=80)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=16, shuffle=True, num_workers=64)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=16, shuffle=True, num_workers=64)

  cpuset_checked))


# Archetype

In [5]:
# Abstract class, must overload __init__.layers
# TODO: select loss_fn (MSE?)
class MarketPredictor(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.loss_fn = None
        self.layers = None
        self.feature_extractor = None
        
    def forward(self, x, device="cuda" if torch.cuda.is_available() else "cpu"):
        #assert None not in [self.loss_fn, self.layers, self.feature_extractor]
        x = x.to(device)
        x = x.unsqueeze(dim=1)
        features = self.feature_extractor(x)
        features = features.squeeze(dim=3).squeeze(dim=1)
        return self.layers(features).squeeze(dim=1)
        
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3, weight_decay=0.01)
        return optimizer

    def training_step(self, train_batch, batch_idx):
        features, target = train_batch
        pred = self(features)
        loss = self.loss_fn(target, pred)
        
        return loss
    
    def train_dataloader(self):
        return train_loader

# Basic MLP

In [6]:

class BasicMLP(MarketPredictor):
    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        super().__init__()
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=40, kernel_size=(3,2), dtype=torch.float64, device=device),  # size: 80 -> 78
            nn.ReLU(),
            nn.Conv2d(in_channels=40, out_channels=20, kernel_size=(3,1), dtype=torch.float64, device=device),  # size: 78 -> 76
            nn.ReLU(),
            nn.Conv2d(in_channels=20, out_channels=10, kernel_size=(3,1), dtype=torch.float64, device=device),  # size: 76 -> 74
            nn.ReLU(),
            nn.Conv2d(in_channels=10, out_channels=5, kernel_size=(3,1), dtype=torch.float64, device=device),  # size: 74 -> 72
            nn.ReLU(),
            nn.Conv2d(in_channels=5, out_channels=1, kernel_size=(3,1), dtype=torch.float64, device=device)  # size: 72 -> 70
        )
        self.loss_fn = nn.MSELoss()
        self.layers = nn.Sequential(
            nn.Linear(70, 50, dtype=torch.float64, device=device),
            nn.ReLU(),
            nn.Linear(50, 10, dtype=torch.float64, device=device),
            nn.ReLU(),
            nn.Linear(10, 1, dtype=torch.float64, device=device)
        )

In [17]:
model = BasicMLP()
trainer = pl.Trainer(gpus=1, max_epochs=30)
trainer.fit(model, train_loader)

In [8]:
# load model, if necessary
#model = BasicMLP.load_from_checkpoint(checkpoint_path="lightning_logs/version_29/checkpoints/epoch=29-step=19889.ckpt")

In [8]:
model_output, model_target = limited_test(model=model, loader=test_loader)
print(f"MSE = {(model_output-model_target) @ (model_output-model_target)}")
print(f"STD = {torch.std(model_target - model_output)}")
print(f"mean = {torch.mean(model_target - model_output)}")

tensor([ 410.8533,  291.5145, 1446.8936,  ...,  240.5177, 2127.6576,
        1003.7026], dtype=torch.float64, grad_fn=<CatBackward>)
tensor([ 412.4800,  295.3200, 1450.5833,  ...,  245.5833, 2151.3300,
         990.6400], dtype=torch.float64)
MSE = 627999.3003460545
STD = 21.881127655537888
mean = 7.397908985333262


# Butterfly JL

In [9]:
def extract_matrix_from_linear_transform(linear_transform, in_size: int, device="cuda" if torch.cuda.is_available() else "cpu"):
    # Collect i-th column of matrix by applying linear_transform to i-th member of the standard basis
    return torch.stack([linear_transform(standard_basis_vector) for standard_basis_vector in torch.eye(in_size, device=device)], dim=1)

In [10]:
class ButterflySELL(nn.Module):
    def __init__(self, replaced_linear_layer: nn.Linear, intrinsic_size: int = None, device="cuda" if torch.cuda.is_available() else "cpu"):
        """
        A linear layer W may be replaced with this encoder-decoder module (based on Johnsonn-Lindenstrauss) with identical in- and out- sizes.
        :param replaced_linear_layer: linear layer to replace
        :param intrinsic_size: underlying "true" size the data should be (i.e. the size of the encoder-decoder middle layer)
        """
        super().__init__()
        
        in_size = replaced_linear_layer.in_features
        out_size = replaced_linear_layer.out_features
        if not intrinsic_size:
            #intrinsic_size = int(math.log2(in_size))
            intrinsic_size = max(min(in_size, out_size) // 4, max(in_size, out_size) // 8, 2)
        
        self.layers = nn.Sequential(
            Butterfly(in_size=in_size, out_size=intrinsic_size, bias=False).to(device),                                  #J2.T
            nn.Linear(in_features=intrinsic_size, out_features=intrinsic_size, dtype=torch.float, device=device),                           #W_shrunken
            Butterfly(in_size=intrinsic_size, out_size=out_size, bias=False).to(device)                                  #J1
        )
        
        # extract underlying matrices from J1 and J2.T
        J2 = extract_matrix_from_linear_transform(self.layers[0], in_size=in_size).T
        J1 = extract_matrix_from_linear_transform(self.layers[2], in_size=intrinsic_size)
        
        # correct W_shrunken's initial tensor to J1.T @ W @ J2
        self.layers[1].weight = nn.Parameter(J1.T @ replaced_linear_layer.weight.data.to(torch.float) @ J2)
        #self.layers[1].bias = nn.Parameter(J1.T @ replaced_linear_layer.bias.data @ J2)
    
    def forward(self, x):
        x = x.to(torch.float)
        return self.layers(x).to(torch.double)

In [11]:
linear_layers = [model.layers[0], model.layers[2], model.layers[4]]
assert(all([type(layer) == torch.nn.modules.linear.Linear for layer in linear_layers]))
del linear_layers

In [12]:
for layer_idx in [0,2,4]:
    model.layers[layer_idx] = ButterflySELL(model.layers[layer_idx])

In [13]:
model_output, model_target = limited_test(model=model, loader=test_loader)
print(f"MSE = {(model_output-model_target) @ (model_output-model_target)}")
print(f"STD = {torch.std(model_target - model_output)}")
print(f"mean = {torch.mean(model_target - model_output)}")

MSE = 1814521637.6405206
STD = 617.315806694106
mean = 1076.8405824474219


# Butterfly encoder-decoder

In [14]:
trainer = pl.Trainer(gpus=1, max_epochs=30)
trainer.fit(model, train_loader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type       | Params
-------------------------------------------------
0 | feature_extractor | Sequential | 3.5 K 
1 | loss_fn           | MSELoss    | 0     
2 | layers            | Sequential | 3.5 K 
-------------------------------------------------
7.0 K     Trainable params
0         Non-trainable params
7.0 K     Total params
0.028     Total estimated model params size (MB)


Training: -1it [00:00, ?it/s]

In [None]:
# load model, if necessary
#model = BasicMLP()
#for layer_idx in [0,2,4]:
#    model.layers[layer_idx] = ButterflySELL(model.layers[layer_idx])
#model = model.load_from_checkpoint(checkpoint_path="lightning_logs/version_32/checkpoints/epoch=29-step=19889.ckpt")

In [29]:
model_output, model_target = limited_test(model=model, loader=test_loader)
print(f"MSE = {(model_output-model_target) @ (model_output-model_target)}")
print(f"STD = {torch.std(model_target - model_output)}")
print(f"mean = {torch.mean(model_target - model_output)}")

MSE = 336249.6474888823
STD = 16.83671701760607
mean = 1.4855059475944608


# Butterfly encoder-decoder with nonlinearities

In [30]:
for linear_layer_idx in [0,2,4]:
    butterfly_sequential = model.layers[linear_layer_idx].layers
    model.layers[linear_layer_idx].layers = nn.Sequential(
        butterfly_sequential[0],
        nn.ReLU(),
        butterfly_sequential[1],
        nn.ReLU(),
        butterfly_sequential[2]
    )

In [None]:
# load model, if necessary
#model = BasicMLP()
#for layer_idx in [0,2,4]:
#    model.layers[layer_idx] = ButterflySELL(model.layers[layer_idx])
#model = model.load_from_checkpoint(checkpoint_path="lightning_logs/version_32/checkpoints/epoch=29-step=19889.ckpt")

In [31]:
trainer = pl.Trainer(gpus=1, max_epochs=30)
trainer.fit(model, train_loader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type       | Params
-------------------------------------------------
0 | feature_extractor | Sequential | 3.5 K 
1 | loss_fn           | MSELoss    | 0     
2 | layers            | Sequential | 3.5 K 
-------------------------------------------------
7.0 K     Trainable params
0         Non-trainable params
7.0 K     Total params
0.028     Total estimated model params size (MB)


Training: -1it [00:00, ?it/s]

In [34]:
model_output, model_target = limited_test(model=model, loader=test_loader)
print(f"MSE = {(model_output-model_target) @ (model_output-model_target)}")
print(f"STD = {torch.std(model_target - model_output)}")
print(f"mean = {torch.mean(model_target - model_output)}")

MSE = 329155.8017929223
STD = 16.6508277388509
mean = -1.5506381859425855


# Linear encoder-decoder

In [36]:

class LinearAutoencoders(MarketPredictor):
    def __init__(self, device = "cuda" if torch.cuda.is_available() else "cpu"):
        super().__init__()
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=40, kernel_size=(3,2), dtype=torch.float64, device=device),  # size: 80 -> 78
            nn.ReLU(),
            nn.Conv2d(in_channels=40, out_channels=20, kernel_size=(3,1), dtype=torch.float64, device=device),  # size: 78 -> 76
            nn.ReLU(),
            nn.Conv2d(in_channels=20, out_channels=10, kernel_size=(3,1), dtype=torch.float64, device=device),  # size: 76 -> 74
            nn.ReLU(),
            nn.Conv2d(in_channels=10, out_channels=5, kernel_size=(3,1), dtype=torch.float64, device=device),  # size: 74 -> 72
            nn.ReLU(),
            nn.Conv2d(in_channels=5, out_channels=1, kernel_size=(3,1), dtype=torch.float64, device=device)  # size: 72 -> 70
        )
        self.loss_fn = nn.MSELoss()
        self.layers = nn.Sequential(
            nn.Linear(70, 35, dtype=torch.float64, device=device),
            nn.Linear(35, 50, dtype=torch.float64, device=device),
            nn.ReLU(),
            nn.Linear(50, 5, dtype=torch.float64, device=device),
            nn.Linear(5, 10, dtype=torch.float64, device=device),
            nn.ReLU(),
            nn.Linear(10, 1, dtype=torch.float64, device=device)
        )

In [None]:
# load model, if necessary
#model = LinearAutoencoders()
#model = LinearAutoencoders.load_from_checkpoint(checkpoint_path="lightning_logs/version_34/checkpoints/epoch=29-step=19889.ckpt")

In [37]:
linear_ED_model = LinearAutoencoders()
trainer = pl.Trainer(gpus=1, max_epochs=30)
trainer.fit(linear_ED_model, train_loader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type       | Params
-------------------------------------------------
0 | feature_extractor | Sequential | 3.5 K 
1 | loss_fn           | MSELoss    | 0     
2 | layers            | Sequential | 4.6 K 
-------------------------------------------------
8.1 K     Trainable params
0         Non-trainable params
8.1 K     Total params
0.032     Total estimated model params size (MB)


Training: -1it [00:00, ?it/s]

In [38]:
linear_ED_model.cuda()

LinearAutoencoders(
  (feature_extractor): Sequential(
    (0): Conv2d(1, 40, kernel_size=(3, 2), stride=(1, 1))
    (1): ReLU()
    (2): Conv2d(40, 20, kernel_size=(3, 1), stride=(1, 1))
    (3): ReLU()
    (4): Conv2d(20, 10, kernel_size=(3, 1), stride=(1, 1))
    (5): ReLU()
    (6): Conv2d(10, 5, kernel_size=(3, 1), stride=(1, 1))
    (7): ReLU()
    (8): Conv2d(5, 1, kernel_size=(3, 1), stride=(1, 1))
  )
  (loss_fn): MSELoss()
  (layers): Sequential(
    (0): Linear(in_features=70, out_features=35, bias=True)
    (1): Linear(in_features=35, out_features=50, bias=True)
    (2): ReLU()
    (3): Linear(in_features=50, out_features=5, bias=True)
    (4): Linear(in_features=5, out_features=10, bias=True)
    (5): ReLU()
    (6): Linear(in_features=10, out_features=1, bias=True)
  )
)

In [39]:
model_output, model_target = limited_test(model=linear_ED_model, loader=test_loader)
print(f"MSE = {(model_output-model_target) @ (model_output-model_target)}")
print(f"STD = {torch.std(model_target - model_output)}")
print(f"mean = {torch.mean(model_target - model_output)}")

MSE = 699107.8794139728
STD = 23.469228254428575
mean = -6.567578513115662
