In [329]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler
from torch import nn

## Text data preprocessing

In [260]:
path = Path.cwd()/'data'/'gasoline.csv'
data = pd.read_csv(path)
data['date'] = pd.to_datetime(data['date'])
data.set_index('date', inplace=True)
oil_price = data[['OT', 'Final_Search_2']]
oil_price.columns = ['price', 'news']
oil_price = oil_price.loc["2003-01-01":]
print(oil_price.shape)

(1113, 2)


In [262]:
def trim_sentence(sentence):
    clean = re.sub(r'^Available facts are as follows:\s*', "", sentence)
    return clean
def remove_source(sentence):
    clean = re.sub(r'\[Source(:.*?)?\]', "", sentence)
    return clean
def remove_space(sentence):
    clean = re.sub(r";", " ", sentence)
    clean = re.sub(r"\s+", " ", clean).strip()
    return clean
def preprocess_text(sentence, date, price):
    clean = trim_sentence(sentence)
    clean = remove_source(clean)
    clean = remove_space(clean)
    clean = f"today price {price}. " + f"today is {date.year}-{date.month}-{date.day}. " + clean 
    return clean

In [264]:
oil_price.news = list(map(preprocess_text, oil_price.news, oil_price.index, oil_price.price))

In [266]:
oil_price.news.iloc[100]

"today price 1.956. today is 2004-12-6. 2004-11-29: The price of oil and natural gas has a significant impact on the economy, particularly in regions that rely heavily on these energy sources. The cost of wind power is decreasing, making it a more viable alternative to fossil fuels. The gasoline tax accounts for a significant portion of the retail gasoline price. 2004-11-22: The United States' oil production peaked in 1970, as predicted by Hubbert's theory. In the case of crude oil, the amount obtainable from fat was used as fuel for lamps. The price of oil affects the investment and production of hydrocarbons. In November 2004, oil price pressures eased due to increased assurances about the adequacy of US winter fuel supplies."

In [268]:
tr_size = int(oil_price.shape[0]*0.5)
test_size = oil_price.shape[0] - tr_size
train, test = oil_price[:tr_size], oil_price[tr_size:]
print(train.shape)
print(test.shape)

(556, 2)
(557, 2)


In [270]:
scaler = MinMaxScaler()
scaler.fit(train.price.values.reshape((-1,1)))

In [272]:
train['price'] = scaler.transform(train.price.values.reshape((-1,1)))
test['price'] = scaler.transform(test.price.values.reshape((-1,1)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['price'] = scaler.transform(train.price.values.reshape((-1,1)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['price'] = scaler.transform(test.price.values.reshape((-1,1)))


## Custom Dataset

In [322]:
class CustomDataset(Dataset):
    def __init__(self, data, tokeniser, lag=7):
        """
        Args:
            data(np.ndarray): dim is (N , 1)
            lag(int): context or window length
        Remarks:
            Max length of token is selected 512 with appropriate padding.
        """
        self.X = list()
        sentences = list()
        self.y = list()
        for i in range(len(data) - lag):
            x = data[i : i + lag, 0]
            y = data[i + lag, 0]
            self.X.append(x.flatten())
            self.y.append(y)
            sentences.append(data[i+lag, 1])
        self.X = torch.tensor(np.array(self.X, dtype=np.float32), dtype=torch.float32)
        self.y = torch.tensor(np.array(self.y, dtype=np.float32), dtype=torch.float32)
        self.tokens = list()
        for text in sentences:
            ids = tokeniser.encode(text, max_length=512, truncation=True, padding='max_length')
            self.tokens.append(ids)
        self.tokens = torch.tensor(self.tokens, dtype=torch.long)
    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.tokens[idx], self.y[idx]      

## Model Building

In [387]:
class CustomModel(nn.Module):
    def __init__(self, in_size, hid_size):
        super(CustomModel, self).__init__()
        # Bert with last custom layer trainable
        self.bert = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')
        for param in self.bert.parameters():
            param.requires_grad = False
        self.bert.pooler.dense = nn.Linear(768, 1, bias=True)
        self.bert.pooler.activation = nn.Sigmoid()
        # LSTM layer
        self.lstm = nn.LSTM(
            input_size=in_size,
            hidden_size=hid_size,
            batch_first=True,
            proj_size=1,
        )

    def forward(self, x, tokens):
        y_bert = self.bert(tokens)
        y_bert = y_bert.pooler_output
        y_lstm, _ = self.lstm(x)
        y_pred = y_lstm + y_bert
        return y_pred

In [402]:
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')

Using cache found in /home/arnab/.cache/torch/hub/huggingface_pytorch-transformers_main


In [428]:
def train_model(
    model,
    optimiser,
    loss_fn,
    epochs,
    batch,
    data,
    lag,
    tokeniser, 
    device
):
    dataset = CustomDataset(data, tokeniser, lag)
    loader = DataLoader(dataset, batch_size=batch)
    tr_losses = list()
    val_losses = list()
    model.to(device)

    for i in range(epochs):
        train_loss = list()
        val_loss = list()
        for j, (X, tokens, y) in enumerate(loader):
            n = X.shape[0]
            tr_size = int(n*0.9)
            y = y.unsqueeze(0)
            X, tokens, y = X.to(device), tokens.to(device), y.to(device)
            X_train, tokens_train, y_train = X[:tr_size, :],tokens[:tr_size, :], y[:tr_size]
            X_val, tokens_val, y_val = X[tr_size:, :], tokens[tr_size:, :], y[tr_size:]

            y_pred = model(X_train, tokens_train)
            loss = loss_fn(y_pred, y_train)
            print(loss.item())

    #         optimiser.zero_grad()
    #         loss.backward()
    #         optimiser.step()
    #         train_loss.append(loss.item())
    #         # if (j+1)%3 == 0:
    #         #     print(f"Epoch - {i+1}, step - {j+1}, train_loss = {loss.item()}")
    #         with torch.no_grad():
    #             y_pred_val = model(X_val)
    #             val_loss.append(loss_fn(y_pred_val, y_val).item())
    #     tr_losses.append(np.mean(train_loss))
    #     val_losses.append(np.mean(val_loss))
    # plt.plot(tr_losses, label="Train Loss")
    # plt.plot(val_losses, label="Validation Loss")
    # plt.legend()
    # plt.title(f"Window Size = {lag}")
    # plt.show()

In [None]:
batch = 64
epochs = 2
# lags = list(range(4,12))
lags = [7]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for lag in lags:
    model = CustomModel(in_size=lag, hid_size=10)
    optimiser = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()
    train_model(model, optimiser, loss_fn, epochs, batch, train.values, lag, tokenizer, device)

Using cache found in /home/arnab/.cache/torch/hub/huggingface_pytorch-transformers_main
  return F.mse_loss(input, target, reduction=self.reduction)


0.29455649852752686
0.15126529335975647
0.031220706179738045
0.023149440065026283
0.10413531213998795
