# Introduction
- This is one of the solutions submitted for the Artificial Intelligence Final Project.
- Team Members: Zhi-jun Liu, Meng-qi Cao

This notebook contains the CNN-RNN solution (**best case** — private: 0.00140 | public: 0.00149).

We adjusted the learning rate dynamically during training (by observing the loss). With different hyperparameters, the score fluctuates by about 0.00001.

## Prerequisites
- Python 3.7
- PyTorch == 1.0
- pandas == 0.23.4
- NumPy == 1.15.4
- tqdm == 4.28.1

## Preparation

In [11]:
import torch
import pandas as pd
import numpy as np
import time
from tqdm import tnrange, tqdm, tqdm_notebook, trange
import torch.nn as nn

In [12]:
# CPU configuration: new tensors default to 32-bit floats on the CPU, and
# `device` is the target that the data-loading code moves its tensors to.
default_cpu_tensor_type = torch.FloatTensor
torch.set_default_tensor_type(default_cpu_tensor_type)
device = torch.device("cpu")

In [13]:
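# Optional GPU configuration: uncomment the four lines below to run on the
# first CUDA device instead of the CPU.
# device = torch.device("cuda:0")
# torch.cuda.set_device(device.index)
# default_gpu_tensor_type = torch.cuda.FloatTensor
# torch.set_default_tensor_type(default_gpu_tensor_type)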

## Loading Data

In [14]:
class CSVReader:
    """Read the train/test CSV files and cut them into per-session tables.

    Training rows are grouped by date and split into a morning session
    (``Hour < 11.70``) and an afternoon session (``Hour > 12.70``); dates
    with more than 10000 rows are left out.  Testing rows are cut into
    consecutive chunks of 10 rows.
    """

    # Columns (and their order) handed out by the ``get_*_numpy`` methods.
    dataitems = ["Volume", "BidVolume1", "AskVolume1", "AskPrice1", "MidPrice"]

    def __init__(self, training_set="./train_data.csv", testing_set="./test_data.csv"):
        """Load both CSV files and build the training / testing subsets.

        Args:
            training_set: path of the training CSV file.
            testing_set: path of the testing CSV file.
        """
        wanted_columns = [
            "Date", "Time",
            "MidPrice", "LastPrice",
            "Volume", "BidPrice1",
            "BidVolume1", "AskPrice1",
            "AskVolume1",
        ]
        raw_train = pd.read_csv(training_set, index_col="Date", usecols=wanted_columns)
        raw_test = pd.read_csv(testing_set, index_col="Date", usecols=wanted_columns)

        self.Train = raw_train.sort_index()
        self.Test = raw_test.sort_index()

        def to_decimal_hour(stamp):
            # "HH:MM[:SS]" -> hours as a float; only hours and minutes count.
            fields = [float(part) for part in stamp.split(":")]
            return fields[0] + fields[1] / 60

        for frame in (self.Train, self.Test):
            frame["Hour"] = frame["Time"].map(to_decimal_hour)

        # Row count per date, used to spot abnormally long days.
        rows_per_date = self.Train["Time"].groupby("Date").count().sort_values()

        self.TrainDates = self.Train.index.unique().tolist()
        self.DangerousDates = rows_per_date[rows_per_date > 10000].index.tolist()
        self.FilteredDates = [
            day for day in self.TrainDates if day not in self.DangerousDates
        ]

        self.TrainSet = {}

        is_morning = self.Train["Hour"] < 11.70
        is_afternoon = self.Train["Hour"] > 12.70
        self.AM = self.Train[is_morning]
        self.PM = self.Train[is_afternoon]

        # One entry per (date, session), keyed "<date>|AM" / "<date>|PM".
        for day in self.FilteredDates:
            for label, session in (("AM", self.AM), ("PM", self.PM)):
                self.TrainSet[f"{day}|{label}"] = session.loc[day]

        # The testing file is consumed as consecutive 10-row cases.
        self.TestingSet = [
            self.Test.iloc[start: start + 10]
            for start in range(0, len(self.Test), 10)
        ]

    def training_dates(self):
        """Return the training dates that survived the row-count filter."""
        return self.FilteredDates

    def get_training_numpy(self, idx):
        """Return ``(key, values)`` for the ``idx``-th training session.

        ``key`` is the "<date>|AM" / "<date>|PM" label and ``values`` is the
        [T, Feature] array holding the ``dataitems`` columns of that session.
        """
        session_key = list(self.TrainSet)[idx]
        session = self.TrainSet[session_key]
        return session_key, session[self.dataitems].values

    def get_testing_numpy(self, idx):
        """Return the [T, Feature] array of ``dataitems`` for test case ``idx``."""
        case = self.TestingSet[idx]
        return case[self.dataitems].values

    def training_count(self):
        """Return the number of training sessions (mornings plus afternoons)."""
        return len(self.TrainSet)

    def testing_count(self):
        """Return the number of testing cases."""
        return len(self.TestingSet)

In [15]:
class DataLoader:
    """Turn the sessions served by a ``CSVReader`` into model-ready tensors.

    Every feature tensor produced here has 12 columns, in the order listed
    in ``dataitems``: BidVolume1, AskVolume1, the one-step differences of
    Volume, BidVolume1 and AskVolume1, and a 7-way one-hot encoding of the
    one-step AskPrice1 move.
    """

    dataitems = ["BidVolume1", "AskVolume1", "DiffVolume", "DiffBidVolume1", "DiffAskVolume1", "OneHot&7"]

    def __init__(self, csvreader=None, validation_idx=(20, 31, 42, 51)):
        """Load every training session and test case as float32 tensors.

        Args:
            csvreader: source of the raw arrays.  When ``None`` a default
                ``CSVReader()`` is created here, at call time, rather than
                once when the class is defined.
            validation_idx: indices into ``self.Train`` held out as the
                validation set.
        """
        if csvreader is None:
            csvreader = CSVReader()
        self.reader = csvreader

        self.Train = [
            self._to_tensor(self.reader.get_training_numpy(idx)[1])
            for idx in range(self.reader.training_count())
        ]
        self.Test = [
            self._to_tensor(self.reader.get_testing_numpy(idx))
            for idx in range(self.reader.testing_count())
        ]

        self.valid_idx = []
        self.train_idx = list(range(len(self.Train)))
        self.set_validation_set(list(validation_idx))

    @staticmethod
    def _to_tensor(npdata):
        """Convert a NumPy array to a contiguous float32 tensor on ``device``."""
        return torch.from_numpy(npdata).contiguous().to(device=device, dtype=torch.float32)

    @staticmethod
    def _build_features(W, D):
        """Assemble the 12-column feature tensor.

        Args:
            W: raw rows, [Batch, T, >=4] (Volume, BidVolume1, AskVolume1,
                AskPrice1, ...).
            D: one-step differences of ``W`` (same shape).

        Returns:
            [Batch, T, 12] tensor: 2 raw volumes, 3 differences, 7 one-hot.
        """
        NormalFeatures = W[:, :, 1:3]
        DifferentialFeatures = D[:, :, :3]
        # AskPrice1 move in units of 0.001, limited to [-3, 3] BEFORE the +3
        # shift so the class index always lies in 0..6 (3 == unchanged).
        # Clamping after the shift would fold every upward move into class 3
        # and let large downward moves become negative (wrapping) indices.
        Ticks = torch.round(D[:, :, 3:4] * 1000).clamp(-3, 3)
        DeltaPrice = Ticks.to(torch.long) + 3
        eye = torch.eye(7, dtype=W.dtype, device=W.device)
        DeltaPriceOneHot = eye[DeltaPrice].squeeze(dim=-2)
        return torch.cat([NormalFeatures, DifferentialFeatures, DeltaPriceOneHot], dim=-1)

    def _sample_windows(self, choice, span, batch_size):
        """Pick ``batch_size`` random windows of ``span`` rows from a session.

        Windows start at row 1 or later so that the preceding row, needed
        for the differences, always exists.

        Returns:
            ``(A, rows)``: the session tensor and a [Batch, span] index.

        Raises:
            ValueError: if the session is too short to hold such a window.
        """
        A = self.Train[choice]
        upper = len(A) - span
        if upper <= 1:
            raise ValueError(
                f"session {choice} has {len(A)} rows, too few for a window of {span} rows"
            )
        P = torch.randint(1, upper, (batch_size,)).unsqueeze(1)  # [Batch, 1]
        Q = torch.arange(0, span, dtype=torch.long).unsqueeze(0)  # [1, span]
        return A, P + Q

    def sample_processed_batch(self, batch_size=32, source="train", length=40):
        """Sample a batch of feature sequences and their per-step targets.

        Args:
            batch_size: number of windows to draw (all from one session).
            source: ``"train"`` draws from ``train_idx``; anything else
                draws from ``valid_idx``.
            length: number of time steps per sequence.

        Returns:
            ``(R, Target)`` with ``R`` of shape [Batch, length, 12] and
            ``Target`` of shape [Batch, length]; ``Target[:, t]`` is the mean
            MidPrice of rows t+1..t+20 minus the MidPrice of row t.
        """
        pool = self.train_idx if source == "train" else self.valid_idx
        choice = np.random.choice(pool)

        # `length` input rows, 20 look-ahead rows for the target and one
        # spare row at the end.
        A, rows = self._sample_windows(choice, length + 21, batch_size)
        W = A[rows]  # [Batch, length + 21, 5]

        C = torch.arange(1, 21, dtype=torch.long).unsqueeze(0)  # [1, 20]
        V = torch.arange(0, length, dtype=torch.long).unsqueeze(1)  # [length, 1]
        T = W[:, C + V, -1]  # [Batch, length, 20]
        Target = T.mean(dim=-1) - W[:, :length, -1]  # [Batch, length]

        W = W[:, :-21, :4]  # [Batch, length, 4]
        M = A[rows - 1][:, :-21, :4]  # the row preceding each row of W
        return self._build_features(W, W - M), Target

    def get_test_batch(self, batch_size=8):
        """Sample 29-row windows from a validation session.

        Returns:
            ``(R, Target)`` with ``R`` of shape [Batch, 29, 12] and
            ``Target`` of shape [Batch]: mean MidPrice of rows 9..28 minus
            the MidPrice of row 9.
        """
        # NOTE(review): the features cover all 29 rows and the baseline row 9
        # is itself part of the averaged range.  If this is meant to mimic
        # get_test() (9 input rows, baseline = last input row) the target may
        # be off by one row -- confirm before relying on it.
        choice = np.random.choice(self.valid_idx)
        A, rows = self._sample_windows(choice, 29, batch_size)
        W = A[rows]  # [Batch, 29, 5]
        Target = W[:, 9:, -1].mean(dim=-1) - W[:, 9, -1]
        M = A[rows - 1]
        return self._build_features(W, W - M), Target

    def get_test_batch_from_train(self, batch_size=8):
        """Alias of :meth:`get_test_batch`, kept for callers using this name."""
        return self.get_test_batch(batch_size)

    def get_train_continuous(self, batch_size=32, length=40):
        """Sample a training batch (see :meth:`sample_processed_batch`)."""
        return self.sample_processed_batch(batch_size, length=length)

    def get_valid_continuous(self, batch_size=32, length=40):
        """Sample a validation batch (see :meth:`sample_processed_batch`)."""
        return self.sample_processed_batch(batch_size, source="valid", length=length)

    def get_test(self):
        """Return the features of every test case and its last MidPrice.

        Returns:
            ``(R, End)``: ``R`` is [N, T - 1, 12] (the first row of each case
            only serves as the reference for the differences) and ``End`` is
            [N], the MidPrice of the last row of each case.
        """
        Test = torch.stack(self.Test, dim=0)  # [N, T, 5]
        W = Test[:, 1:, :]
        M = Test[:, :-1, :]
        End = Test[:, -1, -1]
        return self._build_features(W, W - M), End

    def set_validation_set(self, lst):
        """Hold out the sessions in ``lst``; all others become training data."""
        self.valid_idx = list(lst)
        held_out = set(self.valid_idx)
        self.train_idx = [idx for idx in range(len(self.Train)) if idx not in held_out]


In [16]:
# Build a module-level DataLoader with its default CSVReader (this reads the
# CSV files).  Trainer creates its own DataLoader, so no other cell visible in
# this notebook refers to this instance.
loader = DataLoader()

## Model Definition

In [17]:
class Prenet(nn.Module):
    """Convolutional front-end: Conv1d -> BatchNorm1d -> ReLU along time.

    The convolution is unpadded, so a sequence of ``Length`` steps comes out
    with ``Length - kernel_size + 1`` steps.
    """

    def __init__(self, in_channel=12, out_channel=32, kernel_size=2):
        super().__init__()
        self.conv = nn.Conv1d(in_channel, out_channel, kernel_size)
        self.activation = nn.ReLU()
        self.norm = nn.BatchNorm1d(out_channel)

    def forward(self, x):
        """Map [Batch, Length, in_channel] to [Batch, Length', out_channel]."""
        # Conv1d / BatchNorm1d expect channels before time.
        channels_first = x.transpose(-1, -2)
        convolved = self.conv(channels_first)
        normalised = self.norm(convolved)
        activated = self.activation(normalised)
        # Back to [Batch, Length', Features] for the recurrent stage.
        return activated.transpose(-1, -2)

In [18]:
class Prednet(nn.Module):
    """Recurrent head: RNN -> ReLU -> Linear(1), one scaled output per step.

    Args:
        in_channel: number of input features per time step.
        hidden_size: width of the RNN hidden state.
        num_layers: number of stacked RNN layers.
        dropout: dropout applied between stacked RNN layers.  ``nn.RNN``
            applies it only between layers, so it is forwarded only when
            ``num_layers > 1``; with a single layer it has no effect (and
            forwarding it would only trigger a warning).
    """

    def __init__(self, in_channel=32, hidden_size=128, num_layers=1, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(
            input_size=in_channel,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # Previously the `dropout` argument was accepted but never used.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.outlayer = nn.Linear(in_features=hidden_size, out_features=1)
        self.activation = nn.ReLU()
        # Constant factor applied to the linear output.
        self.scale = 0.004
        # Learnable initial hidden state, shared by every sequence in a batch.
        self.h = nn.Parameter(torch.zeros(num_layers, 1, hidden_size))

    def forward(self, x):
        """Map [Batch, Length, in_channel] to per-step outputs [Batch, Length]."""
        # Broadcast the learned initial state across the batch dimension.
        h_0 = self.h.expand(self.num_layers, x.size(0), self.hidden_size).contiguous()
        out, _ = self.rnn(x, h_0)  # out: [Batch, Length, Hidden]
        x = self.activation(out)
        x = self.outlayer(x) * self.scale
        return x.squeeze(-1)  # [Batch, Length]

In [19]:
class RNNModel(nn.Module):
    """CNN-RNN model: a ``Prenet`` feeding a ``Prednet``.

    Args:
        feature_n: number of input features per time step.
        cnn_channel: channels produced by the convolutional front-end.
        kernel_size: kernel size of the front-end convolution.
        rnn_size: hidden size of the recurrent head.
        rnn_dropout: dropout value handed to the recurrent head.
        num_layers: number of recurrent layers.
    """

    def __init__(self, feature_n=12, cnn_channel=32, kernel_size=2, rnn_size=128, rnn_dropout=0.0, num_layers=1):
        super().__init__()
        front_end = Prenet(
            in_channel=feature_n,
            out_channel=cnn_channel,
            kernel_size=kernel_size,
        )
        head = Prednet(
            in_channel=cnn_channel,
            hidden_size=rnn_size,
            dropout=rnn_dropout,
            num_layers=num_layers,
        )
        self.prenet = front_end
        self.postnet = head

    def forward(self, x):
        """Run the front-end, then the recurrent head, on ``x``."""
        encoded = self.prenet(x)
        return self.postnet(encoded)

## Model Training

In [20]:
class Trainer:
    """Train, validate and run the CNN-RNN model.

    Hyperparameters are class attributes.  The model, optimiser and data
    loader are created per instance in ``__init__`` (not once when the class
    is defined), so separate trainers do not share weights or optimiser
    state and defining the class performs no file I/O.
    """

    kernel_size = 3

    # The unpadded convolution drops kernel_size - 1 steps, so the first
    # kernel_size - 1 targets have no matching model output.
    kernel_displacement = kernel_size - 1

    learning_rate = 0.00000000001

    batch_size = 8

    seq_length = 512

    # Number of optimisation steps performed by one call to train().
    epoch = 1000

    # Not read by any method of this class.
    evaluate_epoch = 5

    # Exponentially smoothed losses shown in the progress bar.
    smoothed_rmse = 0.0015
    smoothed_evaluation = 0.0015

    def __init__(self, model=None, loader=None):
        """Create the model, its Adam optimiser and the data loader.

        Args:
            model: network to train; defaults to the ``RNNModel``
                configuration used by this notebook.
            loader: batch source; defaults to a new ``DataLoader()``.
        """
        if model is None:
            model = RNNModel(
                feature_n=12,
                cnn_channel=32,
                kernel_size=self.kernel_size,
                rnn_size=64,
                rnn_dropout=0.5,
                num_layers=1,
            )
        self.model = model
        self.optim = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.loader = DataLoader() if loader is None else loader

    def set_learning_rate(self, lr):
        """Apply ``lr`` to every parameter group of the optimiser.

        Optimisers read the rate from ``param_groups``; assigning to an
        ``optim.lr`` attribute has no effect on the updates.
        """
        for group in self.optim.param_groups:
            group["lr"] = lr

    @staticmethod
    def _normalise(Feat):
        """Squash the first five (volume) columns in place: log(1 + relu(x)) / 18."""
        Feat[:, :, :5] = torch.log(1.0 + Feat[:, :, :5].relu()) / 18.0
        return Feat

    @staticmethod
    def _rmse(Estimate, Target):
        """Root-mean-square error between two tensors of the same shape."""
        return (Estimate - Target).pow(2).mean().sqrt()

    def get_train_batch(self):
        """Return normalised training features and the aligned targets."""
        Feat, Target = self.loader.get_train_continuous(self.batch_size, self.seq_length)
        return self._normalise(Feat), Target[:, self.kernel_displacement:]

    def train_single_batch(self):
        """Return the RMSE loss (with graph) of one freshly sampled batch."""
        Feat, Target = self.get_train_batch()
        Estimate = self.model(Feat)
        return self._rmse(Estimate, Target)

    def train(self, lr=learning_rate):
        """Run ``self.epoch`` optimisation steps at learning rate ``lr``.

        After every step the smoothed training loss and the smoothed
        validation loss are updated and shown in the progress bar.
        """
        bar = tnrange(self.epoch)
        self.set_learning_rate(lr)
        for _ in bar:
            self.optim.zero_grad()
            loss = self.train_single_batch()
            loss.backward()
            self.optim.step()
            self.smoothed_rmse = loss.item() * 0.01 + 0.99 * self.smoothed_rmse
            # Smooth the validation loss with its own history, not with the
            # training-loss average.
            self.smoothed_evaluation = (
                self.evaluation() * 0.01 + 0.99 * self.smoothed_evaluation
            )
            bar.set_postfix(loss=f"{self.smoothed_rmse:.7f}", e=f"{self.smoothed_evaluation:.7f}")

    def evaluation(self):
        """Return the RMSE (a float) on one sampled validation batch."""
        with torch.no_grad():
            self.model.eval()
            try:
                Feat, Target = self.loader.get_valid_continuous(self.batch_size, self.seq_length)
                Feat = self._normalise(Feat)
                Target = Target[:, self.kernel_displacement:]
                Estimate = self.model(Feat)
                return self._rmse(Estimate, Target).item()
            finally:
                # Always hand the model back in training mode.
                self.model.train()

    def test(self):
        """Predict every test case.

        Returns:
            ``(price, delta)`` NumPy arrays: ``delta`` is the model output at
            the last time step and ``price`` is ``delta`` plus the last
            MidPrice of the case.
        """
        with torch.no_grad():
            self.model.eval()
            try:
                Feat, End = self.loader.get_test()
                Estimate = self.model(self._normalise(Feat))[:, -1]
                delta = Estimate.detach().cpu().numpy()
                return delta + End.cpu().numpy(), delta
            finally:
                self.model.train()
    

In [21]:
# Create the trainer; its hyperparameters are the attributes declared on the
# Trainer class.
trainer = Trainer()

In [None]:
# Run Trainer.epoch optimisation steps with the default learning rate
# (Trainer.learning_rate); pass lr=... to train() to use another rate.
trainer.train()

## Output Results

In [None]:
# Predict every test case: `estimate` is the predicted mid price and `delta`
# the predicted change relative to the last observed mid price.
estimate, delta = trainer.test()

# Case ids are 1-based and the submission starts at case 143.
# NOTE(review): presumably the earlier cases are not scored -- confirm.
first_case = 143
ANS = pd.DataFrame(
    # Derive the last id from the number of predictions instead of assuming
    # exactly 1000 test cases.
    index=np.arange(first_case, len(estimate) + 1, dtype=int),
    data=estimate[first_case - 1:],
    columns=["midprice"],
)
ANS.index.name = "caseid"
# The random suffix makes it unlikely (not impossible) that a previous
# submission file is overwritten.
ANS.to_csv(f"submit_{np.random.randint(100)}.csv")