In [1]:
from Indexed_Dataset import *
import torch
from torch import nn
from torch.utils.data import DataLoader
import numpy as np
import random
from torch.utils.data import WeightedRandomSampler

In [2]:
# Seed every RNG the notebook imports so a fresh "Restart & Run All" is repeatable.
SEED = 42
random.seed(SEED)  # Python's own RNG was previously left unseeded in the main process
np.random.seed(SEED)
torch.manual_seed(SEED)
# torch.use_deterministic_algorithms(True)


def seed_worker(worker_id):
    """Seed NumPy and Python's ``random`` inside a DataLoader worker.

    Passed to the DataLoaders as ``worker_init_fn``. The seed is taken from
    ``torch.initial_seed()`` of the calling process, folded into 32 bits
    because ``np.random.seed`` only accepts values below ``2**32``.

    Args:
        worker_id: Index of the worker supplied by the DataLoader (unused).
    """
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


# Dedicated generator handed to the DataLoaders via ``generator=g``; it is
# seeded separately from the global torch RNG.
g = torch.Generator()
g.manual_seed(0)

<torch._C.Generator at 0x7f9a7802ded0>

In [3]:
# Read the whole CSV as float32, skipping the header row.
all_rows = np.loadtxt(
    "../data/combined_trainvf_trig.csv",
    delimiter=",",
    skiprows=1,
    dtype=np.float32,
)

# Statistics of the last column over the full table. Only the commented-out
# outlier filters below make use of them.
mean = all_rows[:, -1].mean()
std = all_rows[:, -1].std()
# all_rows = all_rows[all_rows[:, -1] < mean + 5 * std]
# all_rows = all_rows[all_rows[:, -1] > 50]

# Shuffle in place, then keep the first 90% of the rows for training and the
# remaining rows for validation.
np.random.shuffle(all_rows)
split_at = int(0.9 * all_rows.shape[0])
train_arr, valid_arr = all_rows[:split_at], all_rows[split_at:]

# Last column of the training rows; only consumed by the disabled sampler.
weights = train_arr[:, -1]
# sampler = WeightedRandomSampler(weights=weights, num_samples=train_arr.shape[0], replacement=True)

# Options shared by both loaders (add ``sampler=sampler`` here and drop
# ``shuffle`` to enable weighted sampling).
loader_opts = dict(
    batch_size=32,
    shuffle=True,
    worker_init_fn=seed_worker,
    generator=g,
)

train_set = Indexed_Dataset(arr=train_arr)
train_load = DataLoader(dataset=train_set, num_workers=8, **loader_opts)

# The validation loader runs in the main process (no worker processes).
valid_set = Indexed_Dataset(arr=valid_arr)
valid_load = DataLoader(dataset=valid_set, **loader_opts)

# Prefer the GPU when one is available.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

In [12]:
class TravelRegressor(nn.Module):
    """Feed-forward regressor over three embedded categorical columns plus dense features.

    Each input row is laid out as ``[origin_call, origin_stand, taxi_id, *dense]``.
    Columns 0-2 are cast to ``int32`` and looked up in their embedding tables;
    the remaining columns are passed through unchanged. The first linear layer
    is sized for 22 dense columns. The network emits one value per row.
    """

    def __init__(self):
        super().__init__()
        # Embedding widths for the three categorical columns.
        call_width, stand_width, taxi_width = 20, 5, 10

        # Table sizes are fixed here; index 0 of origin_call is the padding index.
        self.embed_origin_call: nn.Module = nn.Embedding(29027, call_width, padding_idx=0)
        self.embed_origin_stand: nn.Module = nn.Embedding(64, stand_width)
        self.embed_taxi_id: nn.Module = nn.Embedding(448, taxi_width)

        # 22 dense columns are concatenated after the three embeddings.
        in_features = 22 + call_width + stand_width + taxi_width
        layers = [
            nn.Linear(in_features, 500),
            nn.ReLU(),
            nn.Linear(500, 50),
            nn.ReLU(),
            nn.Linear(50, 1),
        ]
        self.feed_forward = nn.Sequential(*layers)

    def forward(self, input: torch.Tensor):
        """Embed columns 0-2, append columns 3 onward, and run the MLP.

        Args:
            input: 2-D tensor whose first three columns hold embedding indices.

        Returns:
            Tensor with one column and one row per input row.
        """
        indices = input[:, :3].to(dtype=torch.int32)
        pieces = (
            self.embed_origin_call(indices[:, 0]),
            self.embed_origin_stand(indices[:, 1]),
            self.embed_taxi_id(indices[:, 2]),
            input[:, 3:],
        )
        features = torch.cat(pieces, dim=1).to(dtype=torch.float32)
        return self.feed_forward(features)

# Build the network and move its parameters to the selected device.
model = TravelRegressor().to(device)

In [13]:
# Smoke test: run a single item from train_set through the model (with a batch
# dimension added by unsqueeze) and show the output size.
# Assumes train_set[i] is an (entry, target) pair, as unpacked in train() - TODO confirm.
model.eval()
model(train_set[0][0].unsqueeze(0).to(device)).size()
# train_set[0][0].unsqueeze(0).size()

torch.Size([1, 1])

In [14]:
# Second element of the first dataset item (presumably its target - verify against Indexed_Dataset).
train_set[0][1]

tensor([315.])

In [20]:
# Mean-reduced squared error; used by both train() and validate().
mse = nn.MSELoss(reduction="mean")
# Adam over every model parameter with a learning rate of 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [16]:
def weight_mse(pred, act):
    """Squared error weighted by the target values.

    Evaluates ``sum(act * (pred - act)**2 / len(act)) / sum(act)``, so samples
    with larger targets contribute more to the loss.

    NOTE(review): each term is divided by ``len(act)`` in addition to the
    normalisation by ``sum(act)``, which makes the result smaller than a plain
    weighted mean by a factor of the batch length - confirm this is intended.
    The result is NaN/inf when ``sum(act)`` is zero.

    Args:
        pred: Predicted values.
        act: Actual target values; also used as the per-sample weights.

    Returns:
        Scalar tensor holding the weighted loss.
    """
    squared_error = (pred - act) ** 2
    weighted_terms = act * (squared_error / len(act))
    return weighted_terms.sum() / act.sum()

In [21]:
def validate():
    """Return the mean squared error of ``model`` over ``valid_load``.

    Every batch's mean loss is weighted by its batch size before averaging, the
    same way ``train`` accumulates its rolling loss, so a shorter final batch
    does not get the same weight as a full one. The model is left in eval mode.

    Returns:
        Tensor of shape ``(1,)`` on ``device`` holding the per-sample MSE.

    Raises:
        ValueError: If ``valid_load`` yields no batches.
    """
    model.eval()
    total_sq_err = torch.zeros(1, device=device, dtype=torch.float32)
    n_samples = 0
    with torch.no_grad():
        for entry, target in valid_load:
            entry = entry.to(device, dtype=torch.float32)
            target = target.to(device, dtype=torch.float32)
            batch_size = entry.size(0)
            # mse is mean-reduced, so scale it back up to a per-batch sum.
            total_sq_err += mse(model(entry), target) * batch_size
            n_samples += batch_size

    if n_samples == 0:
        # Previously this surfaced as an UnboundLocalError on the loop index.
        raise ValueError("valid_load produced no batches; cannot compute a validation loss")
    return total_sq_err / n_samples

In [22]:
# Loss histories filled in by train(); one entry per logging window.
rmse_loss = []
valid_loss = []


def train(num_iter: int):
    """Train ``model`` on ``train_load`` for ``num_iter`` epochs.

    On the first iteration of each epoch and every 500th iteration after that,
    the RMSE over the batches seen since the previous log line is printed and
    appended to ``rmse_loss``. Batches after the last log line of an epoch are
    not reported.

    Args:
        num_iter: Number of passes over ``train_load``.
    """
    for epoch in range(num_iter):
        rolling_loss = 0
        entries = 0
        for i, (entry, target) in enumerate(train_load):
            # Re-assert training mode each step: validate() (when enabled
            # below) switches the model to eval mode.
            model.train()
            entry = entry.to(device, dtype=torch.float32)
            target = target.to(device, dtype=torch.float32)
            preds = model(entry)

            optimizer.zero_grad()
            loss = mse(preds, target)  # or weighted mse
            loss.backward()
            optimizer.step()

            batch_size = entry.size(0)
            # Accumulate a detached copy of the loss. Re-running mse() here and
            # summing the attached result kept autograd history alive for every
            # step of the logging window and computed the loss twice.
            rolling_loss += loss.detach() * batch_size
            entries += batch_size
            if (i + 1) % 500 == 0 or i == 0:
                window_rmse = torch.sqrt(rolling_loss / entries).item()
                print(f"[Epoch: {epoch + 1}]\t[Iter: {i + 1}]\t[RMSE: {window_rmse}]\t[STD: {torch.std(preds).item()}]")
                rmse_loss.append(window_rmse)
                # valid_loss.append(torch.sqrt(validate()).item())
                entries = 0
                rolling_loss = 0


In [23]:
# Train for 10 epochs. The saved output below ends in a KeyboardInterrupt
# partway through epoch 1, so this run was stopped manually.
train(10)

[Epoch: 1]	[Iter: 1]	[RMSE: 415.27630615234375]	[STD: 4.039961338043213]
[Epoch: 1]	[Iter: 500]	[RMSE: 698.8008422851562]	[STD: 10.465325355529785]
[Epoch: 1]	[Iter: 1000]	[RMSE: 673.4603881835938]	[STD: 19.266572952270508]
[Epoch: 1]	[Iter: 1500]	[RMSE: 689.7603149414062]	[STD: 41.54059600830078]
[Epoch: 1]	[Iter: 2000]	[RMSE: 706.648681640625]	[STD: 64.16667175292969]
[Epoch: 1]	[Iter: 2500]	[RMSE: 706.2863159179688]	[STD: 68.70936584472656]
[Epoch: 1]	[Iter: 3000]	[RMSE: 660.0432739257812]	[STD: 80.0284194946289]
[Epoch: 1]	[Iter: 3500]	[RMSE: 698.544677734375]	[STD: 126.25977325439453]
[Epoch: 1]	[Iter: 4000]	[RMSE: 660.4996948242188]	[STD: 93.74070739746094]
[Epoch: 1]	[Iter: 4500]	[RMSE: 618.5896606445312]	[STD: 145.86888122558594]
[Epoch: 1]	[Iter: 5000]	[RMSE: 710.1400756835938]	[STD: 104.73110961914062]
[Epoch: 1]	[Iter: 5500]	[RMSE: 570.0415649414062]	[STD: 96.78145599365234]
[Epoch: 1]	[Iter: 6000]	[RMSE: 752.9943237304688]	[STD: 203.1099853515625]
[Epoch: 1]	[Iter: 6500]	[R

KeyboardInterrupt: 

In [None]:
# Continue training the same model and optimizer for 5 more epochs.
train(5)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(rmse_loss, label="training")
plt.plot(valid_loss, label="validation")
plt.xlabel("Iteration's")
plt.ylabel("RMSE Loss")
plt.legend()
plt.show()

In [None]:
# Validation RMSE: square root of the MSE returned by validate(), as a Python float.
torch.sqrt(validate()).item()

### Milestone part 3: getting the top 10 data points with the greatest training loss

In [None]:
# train_tensor = torch.from_numpy(train_arr).to(device)

In [None]:
# torch.cuda.empty_cache()
# import gc
# # del variables
# gc.collect()
# # Ran into CUDA out of memory :(

In [None]:
# eval_load = DataLoader(dataset=train_set,
#                         batch_size=32,
# #                         sampler=sampler,
#                         shuffle=False, # I just disabled shuffling
#                         num_workers=8,
#                         worker_init_fn=seed_worker,
#                         generator=g,)
# model.eval()
# full_loss = np.zeros(len(train_set))
# total = 0
# batchsz = 0
# with torch.no_grad():
#     for i, (entry, target) in enumerate(eval_load):
#         entry = entry.to(device, dtype=torch.float32)
#         target = target.to(device, dtype=torch.float32)
#         preds = model(entry)
#         batchsz = target.size()[0]
#         full_loss[total:total + batchsz] = np.sqrt((preds.cpu().numpy() - target.cpu().numpy())**2)[:, 0]
#         total += target.size()[0]

In [None]:
# order = np.argsort(full_loss)
# print(order[-10:]) # 10 worst loss data points
# print(full_loss[order[-10:]])
# print(train_arr[order[-10:], 0]) # indices in original dataset

In [None]:
# max(train_arr[:, 0])

In [None]:
# eval_arr[train_arr[:, 0].astype(np.int32), 0]

In [None]:
# train_arr.shape

In [None]:
# eval_arr = np.loadtxt("../data/no_coord_train.csv", dtype=np.float32, delimiter=",", skiprows=1)
# eval_set = Indexed_Dataset(arr=eval_arr)
# eval_load = DataLoader(dataset=eval_set,
#                         batch_size=32,
# #                         sampler=sampler,
#                         shuffle=False, # I just disabled shuffling
#                         num_workers=8,
#                         worker_init_fn=seed_worker,
#                         generator=g,)
# model.eval()
# full_loss = np.zeros(len(eval_set))
# total = 0
# batchsz = 0
# with torch.no_grad():
#     for i, (entry, target) in enumerate(eval_load):
#         entry = entry.to(device, dtype=torch.float32)
#         target = target.to(device, dtype=torch.float32)
#         preds = model(entry)
#         batchsz = target.size()[0]
#         full_loss[total:total + batchsz] = np.sqrt((preds.cpu().numpy() - target.cpu().numpy())**2)[:, 0]
#         total += target.size()[0]


In [None]:
# order = np.argsort(full_loss)
# print(len(order))
# print(full_loss[order[-10:]])
# print(order[-10:]) # 10 worst loss data points
# eval_arr[order[-10:], 0] # indices in original dataset

In [None]:
# model.eval()
# with torch.no_grad():
#     test = torch.from_numpy(np.loadtxt("../data/no_coord_test.csv", skiprows=1, dtype=np.float32, delimiter=",")).to(device)
#     out = model(test)
#     torch.set_printoptions(threshold=10000, sci_mode=False)
#     df_pred = pd.read_csv("../data/sampleSubmission.csv")
#     df_pred["TRAVEL_TIME"] = out.cpu().numpy()
#     df_pred.to_csv("NN_nocoord_weighted_prune.csv")
