In [1]:
from KGLE_Dataset import *
import torch
from torch import nn
from torch.utils.data import DataLoader
import numpy as np
import random
from torch.utils.data import WeightedRandomSampler

In [2]:
torch.manual_seed(42)
np.random.seed(42)
# torch.use_deterministic_algorithms(True)
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(42)

<torch._C.Generator at 0x28d769354b0>

In [3]:
train_set = KGLE_Dataset("./data/embed_trainvf.csv")
# weights = [label for (item, label) in train_set]


In [4]:
weights = np.loadtxt("./data/embed_trainvf.csv", delimiter=",", skiprows=1)
weights = weights[:, -1]

In [5]:
weights.shape

(1495471,)

In [6]:
sampler = WeightedRandomSampler(weights=weights, num_samples=len(train_set), replacement=True)

In [23]:
train_load = DataLoader(dataset=train_set,
                        batch_size=32,
#                         sampler=sampler,
                        shuffle=True,
                        # num_workers=8,
                        worker_init_fn=seed_worker,
                        generator=g,)

valid_set = KGLE_Dataset("./data/embed_validvf.csv")
valid_load = DataLoader(dataset=valid_set,
                        batch_size=32,
                        # num_workers=8,
                        shuffle=True,
                        worker_init_fn=seed_worker,
                        generator=g,)

device = "cuda" if torch.cuda.is_available() else "cpu"

In [12]:
len(train_set)

1495471

In [15]:
class TravelRegressor(nn.Module):
    def __init__(self):
        super().__init__()
        origin_call_dim = 6
        origin_stand_dim = 5
        taxi_id_dim = 5
        year_dim = 2
        wk_of_yr_dim = 5
        wk_day_dim = 5
        hr_dim = 5
        
        self.embed_origin_call: nn.Module = nn.Embedding(29027, origin_call_dim)
        self.embed_origin_stand: nn.Module = nn.Embedding(64, origin_stand_dim)
        self.embed_taxi_id: nn.Module = nn.Embedding(448, taxi_id_dim)
        self.embed_year: nn.Module = nn.Embedding(2, year_dim)
        self.embed_wk_of_yr: nn.Module = nn.Embedding(52, wk_of_yr_dim)
        self.embed_wk_day: nn.Module = nn.Embedding(7, wk_day_dim)
        self.embed_hr: nn.Module = nn.Embedding(24, hr_dim)
        self.feed_forward = nn.Sequential(
            nn.Linear(6 + 
                      origin_call_dim + 
                      origin_stand_dim + 
                      taxi_id_dim + 
                      year_dim + 
                      wk_of_yr_dim +
                      wk_day_dim + 
                      hr_dim
                      ,1000),
            nn.ReLU(),
            nn.Dropout(0.5)
            nn.Linear(1000, 1000),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1000, 800),
            nn.ReLU(),
            nn.Linear(800, 1)
        )

    def forward(self, input: torch.Tensor):
        origin_call = self.embed_origin_call(input[:, 0].to(dtype=torch.int32))
        origin_stand = self.embed_origin_stand(input[:, 1].to(dtype=torch.int32))
        taxi_id = self.embed_taxi_id(input[:, 2].to(dtype=torch.int32))
        year = self.embed_year(input[:, 8].to(dtype=torch.int32))
        wk_of_year = self.embed_wk_of_yr(input[:, 9].to(dtype=torch.int32))
        wk_day = self.embed_wk_day(input[:, 10].to(dtype=torch.int32))
        hr = self.embed_hr(input[:, 11].to(dtype=torch.int32))
        input = torch.cat((origin_call, 
                           origin_stand, 
                           taxi_id, 
                           input[:, 3:8],
                           year,
                           wk_of_year,
                           wk_day,
                           hr,
                           input[:, 12:]
                           ), dim=1).to(dtype=torch.float32)
        input = self.feed_forward(input.to(dtype=torch.float32))
        return input

model = TravelRegressor().to(device)

In [16]:
model.eval()
model(train_set[0][0].unsqueeze(0).to(device)).size()
# train_set[0][0].unsqueeze(0).size()

torch.Size([1, 1])

In [17]:
def weight_mse(pred, act):
    denom = torch.sum(act)
    return torch.sum(act * ((pred - act)**2 / len(act))) / denom

In [18]:
lossfn = nn.MSELoss(reduction="mean")
# lossfn = weight_mse
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [19]:
def validate():
    model.eval()
    loss = torch.tensor([0]).to(device, dtype=torch.float32)
    with torch.no_grad():
        for i, (entry, target) in enumerate(valid_load):
            entry = entry.to(device, dtype=torch.float32)
            target = target.to(device, dtype=torch.float32)
            preds = model(entry)
            loss += lossfn(preds, target)

    return loss /(i + 1)

In [27]:
rmse_loss = []
def train(num_iter: int):
    for epoch in range(num_iter):
        for i, (entry, target) in enumerate(train_load):
            model.train()
            entry = entry.to(device, dtype=torch.float32)
            target = target.to(device, dtype=torch.float32)
            preds = model(entry)

            optimizer.zero_grad()
            loss = lossfn(preds, target) # or weighted mse
            loss.backward()
            optimizer.step()
            if (i + 1) % 500 == 0 or i == 0:
                print(f"[Epoch: {epoch + 1}]\t[Iter: {i + 1}]\t[RMSE: {torch.sqrt(loss)}]")
                print(f"Prediction std dev: {torch.std(preds).item()}")

            rmse_loss.append(torch.sqrt(loss).item())

        valid_loss = validate()
        print(f"[RMSE Validation: {torch.sqrt(valid_loss).item()}]")

In [30]:
train(10)

[Epoch: 1]	[Iter: 1]	[RMSE: 386.3883056640625]
Prediction std dev: 126.34185791015625


KeyboardInterrupt: 

In [32]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(range(0, len(rmse_loss)), rmse_loss)
plt.xlabel("# of 500 iteration's")
plt.ylabel("RMSE Loss")
plt.show()

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


390886.15625

In [31]:
model.eval()
with torch.no_grad():
    test = torch.from_numpy(np.loadtxt("./data/embed_testvf.csv", skiprows=1, dtype=np.float32, delimiter=",")).to(device)
    out = model(test)
    torch.set_printoptions(threshold=10000, sci_mode=False)
    print(out)

tensor([[  793.8265],
        [  774.1790],
        [  786.9352],
        [  793.9023],
        [  761.6528],
        [  796.7250],
        [  776.7170],
        [  820.7263],
        [  772.2891],
        [  778.4208],
        [  762.7768],
        [  777.7084],
        [  815.2253],
        [  798.7531],
        [  769.3024],
        [  798.6263],
        [  808.4692],
        [  809.5801],
        [  795.8856],
        [  789.0472],
        [  782.7125],
        [  786.3989],
        [  776.8250],
        [  818.6525],
        [  796.7206],
        [  827.8629],
        [  755.8573],
        [  768.5643],
        [  778.2201],
        [  769.8325],
        [  769.6036],
        [  787.7539],
        [  777.0521],
        [  775.5562],
        [  823.1422],
        [  774.0112],
        [  795.5021],
        [  771.3828],
        [  790.1435],
        [  777.5578],
        [  798.2370],
        [  775.2573],
        [  789.4103],
        [  831.9680],
        [  792.9907],
        [ 