In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Subset
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import time

# Prefer the GPU when CUDA is available; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Select the 'high' float32 matmul precision mode for the whole session.
torch.set_float32_matmul_precision('high')

class PandasDataset(Dataset):
    """Training dataset backed by tensors built once from a DataFrame.

    Args:
        dataframe: frame holding the feature and target columns.
        cat_cols: names of the integer-coded categorical columns (stored as int32).
        target_col: name of the regression target column (stored as float32).
    """

    def __init__(self, dataframe, cat_cols, target_col):
        cat_values = dataframe[cat_cols].values
        target_values = dataframe[target_col].values
        self.categorical = torch.tensor(cat_values, dtype=torch.int32)
        # Continuous features are not used at the moment:
        #self.continuous = torch.tensor(dataframe[cont_cols].values, dtype=torch.float32)
        self.targets = torch.tensor(target_values, dtype=torch.float32)

    def __len__(self):
        """Number of rows, taken from the target tensor."""
        return self.targets.size(0)

    def __getitem__(self, index):
        """Return ``(categorical_features, target)`` for the given position."""
        features = self.categorical[index]
        target = self.targets[index]
        return features, target

class MyModel(nn.Module):
    """Embedding-based MLP regressor over integer-coded categorical features.

    Every input column is looked up in its own embedding table. The embeddings
    are concatenated and passed through two hidden layers (128 and 64 units,
    each followed by ReLU, dropout and batch norm) and a final linear layer
    that emits one value per row.

    Args:
        embed: iterable of ``(num_embeddings, embedding_dim)`` pairs, one per
            input column, in column order.
    """

    def __init__(self, embed):
        super().__init__()

        self.embeddings = nn.ModuleList(
            [nn.Embedding(in_dim, out_dim) for in_dim, out_dim in embed]
        )

        # Width of the concatenated embedding vector fed to the first linear layer.
        em_dim = sum(table.embedding_dim for table in self.embeddings)
        self.linear1 = nn.Linear(em_dim, 128)
        self.batch_norm1 = nn.BatchNorm1d(128)
        self.linear2 = nn.Linear(128, 64)
        self.batch_norm2 = nn.BatchNorm1d(64)
        self.linear3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Predict one value per row.

        Args:
            x: integer tensor of shape ``(batch, n_columns)``; column ``i`` is
                looked up in ``self.embeddings[i]``.

        Returns:
            Tensor of shape ``(batch,)``.
        """
        x = [table(x[:, i]) for i, table in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        x = F.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.batch_norm1(x)
        x = F.relu(self.linear2(x))
        x = self.dropout(x)
        x = self.batch_norm2(x)
        x = self.linear3(x)
        # Drop only the trailing size-1 feature axis. A bare squeeze() would also
        # remove the batch axis for a single-row batch and return a 0-d tensor,
        # which cannot be concatenated and broadcasts against the targets.
        return x.squeeze(-1)

# Full pre-processed training table; the per-CALL_TYPE cells below slice it.
data = pd.read_csv(filepath_or_buffer='processed_train.csv')

In [None]:
# Model A is fitted only on the rows whose CALL_TYPE is 'A'.
A = data[data['CALL_TYPE'] == 'A']

# 80/20 train/validation split with a fixed seed. reset_index() renumbers the
# rows from 0 and keeps the previous index as an extra 'index' column.
A_train, A_val = (
    split.reset_index()
    for split in train_test_split(A, test_size=0.2, random_state=42)
)
print(len(A_val))

# Column order matters: the i-th column is fed to the model's i-th embedding.
categorical_A = ['ORIGIN_CALL', 'TAXI_ID', 'QTRHR', 'WK', 'WKYR', 'HOLIDAY']

train_A = PandasDataset(A_train, cat_cols=categorical_A, target_col='TRAVEL_TIME')
val_A = PandasDataset(A_val, cat_cols=categorical_A, target_col='TRAVEL_TIME')

# Only the training loader shuffles; validation keeps row order.
A_train_dataloader = DataLoader(dataset=train_A, batch_size=128, shuffle=True, num_workers=16)
A_val_dataloader = DataLoader(dataset=val_A, batch_size=128, shuffle=False, num_workers=16)

In [None]:
# One (num_embeddings, embedding_dim) pair per column of categorical_A, same order.
embedding_dim_A = [
    (56481, 50),  # ORIGIN_CALL
    (443, 50),    # TAXI_ID
    (96, 48),     # QTRHR
    (7, 4),       # WK
    (52, 26),     # WKYR
    (3, 2),       # HOLIDAY
]

model_A = MyModel(embedding_dim_A)
model_A = model_A.to(device)

# The optimizer is bound to model A's parameters; the training cell takes the
# square root of this MSE to obtain a per-batch RMSE.
optimizer = optim.Adam(params=model_A.parameters(), lr=0.007)
loss_fn = nn.MSELoss()

In [None]:
# Baseline for the loss curves: score model A in eval mode before the training
# cell runs. Each recorded value is the mean of the per-batch RMSEs.
model_A.eval()

init_train_loss = 0.0
with torch.no_grad(), tqdm(A_train_dataloader, desc="Initial Training Loss") as progress:
    for inputs, targets in progress:
        inputs, targets = inputs.to(device), targets.to(device)

        batch_rmse = torch.sqrt(loss_fn(model_A(inputs), targets)).item()

        init_train_loss += batch_rmse
        progress.set_postfix({"Train Loss": batch_rmse})

init_train_loss = init_train_loss / len(A_train_dataloader)
train_loss_list = [init_train_loss]

init_val_loss = 0.0
with torch.no_grad(), tqdm(A_val_dataloader, desc="Initial Validation Loss") as progress:
    for inputs, targets in progress:
        inputs, targets = inputs.to(device), targets.to(device)

        batch_rmse = torch.sqrt(loss_fn(model_A(inputs), targets)).item()

        init_val_loss += batch_rmse
        progress.set_postfix({"Val Loss": batch_rmse})

init_val_loss = init_val_loss / len(A_val_dataloader)
val_loss_list = [init_val_loss]

In [None]:
# Train model A for a fixed number of epochs. After each epoch the mean
# per-batch RMSE on the training and validation loaders is recorded, and the
# user is asked whether to checkpoint the current weights.
max_epochs = 16
for epoch in range(max_epochs):
    start = time.time()

    # ---- training pass (dropout active, batch-norm statistics updated) ----
    model_A.train()
    train_loss = 0.0
    with tqdm(A_train_dataloader, desc=f"Epoch {epoch+1}/{max_epochs} - Training") as progress:
        for batch_idx, (inputs, targets) in enumerate(progress):
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            loss = torch.sqrt(loss_fn(model_A(inputs), targets))
            loss.backward()
            optimizer.step()

            batch_rmse = loss.item()
            train_loss += batch_rmse
            progress.set_postfix({"Train Loss": batch_rmse})

    train_loss = train_loss / len(A_train_dataloader)

    # ---- validation pass (eval mode, no gradients) ----
    model_A.eval()
    val_loss = 0.0
    with torch.no_grad(), tqdm(A_val_dataloader, desc=f"Epoch {epoch+1}/{max_epochs} - Validation") as progress:
        for inputs, targets in progress:
            inputs, targets = inputs.to(device), targets.to(device)

            batch_rmse = torch.sqrt(loss_fn(model_A(inputs), targets)).item()

            val_loss += batch_rmse
            progress.set_postfix({"Val Loss": batch_rmse})

    val_loss = val_loss / len(A_val_dataloader)

    print(f"Epoch {epoch+1}/{max_epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f} - Time: {(time.time()-start):.2f}")
    train_loss_list.append(train_loss)
    val_loss_list.append(val_loss)

    # Interactive checkpointing: this blocks until the user answers.
    # NOTE(review): weights are written to './A.pt', while the inference cell
    # loads 'model_weights/A.pt' — confirm the file is moved or align the paths.
    save = input('Save state at current epoch?: (y/n) ')
    if save == 'y':
        torch.save(model_A.state_dict(), './A.pt')

In [None]:
# Loss curves for model A; index 0 is the pre-training baseline.
x = range(len(train_loss_list))
ax = plt.gca()
ax.plot(x, train_loss_list, label='Training Loss')
ax.plot(x, val_loss_list, label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('RMSE Loss')
ax.set_title('Loss Per Epoch - Model A')
ax.legend()
#plt.savefig('loss.png', dpi=600, bbox_inches='tight')
plt.show()


In [None]:
# Model B is fitted only on the rows whose CALL_TYPE is 'B'.
B = data[data['CALL_TYPE'] == 'B']

# 80/20 train/validation split with a fixed seed. reset_index() renumbers the
# rows from 0 and keeps the previous index as an extra 'index' column.
B_train, B_val = (
    split.reset_index()
    for split in train_test_split(B, test_size=0.2, random_state=42)
)
print(len(B_val))

# Column order matters: the i-th column is fed to the model's i-th embedding.
categorical_B = ['ORIGIN_STAND', 'TAXI_ID', 'QTRHR', 'WK', 'WKYR', 'HOLIDAY']

train_B = PandasDataset(B_train, cat_cols=categorical_B, target_col='TRAVEL_TIME')
val_B = PandasDataset(B_val, cat_cols=categorical_B, target_col='TRAVEL_TIME')

# Only the training loader shuffles; validation keeps row order.
B_train_dataloader = DataLoader(dataset=train_B, batch_size=128, shuffle=True, num_workers=16)
B_val_dataloader = DataLoader(dataset=val_B, batch_size=128, shuffle=False, num_workers=16)

In [None]:
# One (num_embeddings, embedding_dim) pair per column of categorical_B, same order.
embedding_dim_B = [
    (64, 32),   # ORIGIN_STAND
    (443, 50),  # TAXI_ID
    (96, 48),   # QTRHR
    (7, 4),     # WK
    (52, 26),   # WKYR
    (3, 2),     # HOLIDAY
]

model_B = MyModel(embedding_dim_B)
model_B = model_B.to(device)

# Rebinds the shared `optimizer` / `loss_fn` names to model B.
optimizer = optim.Adam(params=model_B.parameters(), lr=0.007)
loss_fn = nn.MSELoss()

In [None]:
# Baseline for the loss curves: score model B in eval mode before the training
# cell runs. Each recorded value is the mean of the per-batch RMSEs.
model_B.eval()

init_train_loss = 0.0
with torch.no_grad(), tqdm(B_train_dataloader, desc="Initial Training Loss") as progress:
    for inputs, targets in progress:
        inputs, targets = inputs.to(device), targets.to(device)

        batch_rmse = torch.sqrt(loss_fn(model_B(inputs), targets)).item()

        init_train_loss += batch_rmse
        progress.set_postfix({"Train Loss": batch_rmse})

init_train_loss = init_train_loss / len(B_train_dataloader)
train_loss_list = [init_train_loss]

init_val_loss = 0.0
with torch.no_grad(), tqdm(B_val_dataloader, desc="Initial Validation Loss") as progress:
    for inputs, targets in progress:
        inputs, targets = inputs.to(device), targets.to(device)

        batch_rmse = torch.sqrt(loss_fn(model_B(inputs), targets)).item()

        init_val_loss += batch_rmse
        progress.set_postfix({"Val Loss": batch_rmse})

init_val_loss = init_val_loss / len(B_val_dataloader)
val_loss_list = [init_val_loss]

In [None]:
# Train model B for a fixed number of epochs. After each epoch the mean
# per-batch RMSE on the training and validation loaders is recorded, and the
# user is asked whether to checkpoint the current weights.
max_epochs = 16
for epoch in range(max_epochs):
    start = time.time()

    # ---- training pass (dropout active, batch-norm statistics updated) ----
    model_B.train()
    train_loss = 0.0
    with tqdm(B_train_dataloader, desc=f"Epoch {epoch+1}/{max_epochs} - Training") as progress:
        for batch_idx, (inputs, targets) in enumerate(progress):
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            loss = torch.sqrt(loss_fn(model_B(inputs), targets))
            loss.backward()
            optimizer.step()

            batch_rmse = loss.item()
            train_loss += batch_rmse
            progress.set_postfix({"Train Loss": batch_rmse})

    train_loss = train_loss / len(B_train_dataloader)

    # ---- validation pass (eval mode, no gradients) ----
    model_B.eval()
    val_loss = 0.0
    with torch.no_grad(), tqdm(B_val_dataloader, desc=f"Epoch {epoch+1}/{max_epochs} - Validation") as progress:
        for inputs, targets in progress:
            inputs, targets = inputs.to(device), targets.to(device)

            batch_rmse = torch.sqrt(loss_fn(model_B(inputs), targets)).item()

            val_loss += batch_rmse
            progress.set_postfix({"Val Loss": batch_rmse})

    val_loss = val_loss / len(B_val_dataloader)

    print(f"Epoch {epoch+1}/{max_epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f} - Time: {(time.time()-start):.2f}")
    train_loss_list.append(train_loss)
    val_loss_list.append(val_loss)

    # Interactive checkpointing: this blocks until the user answers.
    # NOTE(review): weights are written to './B.pt', while the inference cell
    # loads 'model_weights/B.pt' — confirm the file is moved or align the paths.
    save = input('Save state at current epoch?: (y/n) ')
    if save == 'y':
        torch.save(model_B.state_dict(), './B.pt')

In [None]:
# Loss curves for model B; index 0 is the pre-training baseline.
x = range(len(train_loss_list))
ax = plt.gca()
ax.plot(x, train_loss_list, label='Training Loss')
ax.plot(x, val_loss_list, label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('RMSE Loss')
ax.set_title('Loss Per Epoch - Model B')
ax.legend()
#plt.savefig('loss.png', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
# Model C is fitted only on the rows whose CALL_TYPE is 'C'.
C = data[data['CALL_TYPE'] == 'C']

# 80/20 train/validation split with a fixed seed. reset_index() renumbers the
# rows from 0 and keeps the previous index as an extra 'index' column.
C_train, C_val = (
    split.reset_index()
    for split in train_test_split(C, test_size=0.2, random_state=42)
)
print(len(C_val))

# Column order matters: the i-th column is fed to the model's i-th embedding.
# Unlike A and B, there is no origin column for this call type.
categorical_C = ['TAXI_ID', 'QTRHR', 'WK', 'WKYR', 'HOLIDAY']

train_C = PandasDataset(C_train, cat_cols=categorical_C, target_col='TRAVEL_TIME')
val_C = PandasDataset(C_val, cat_cols=categorical_C, target_col='TRAVEL_TIME')

# Only the training loader shuffles; note the batch size is 120 here (128 for A and B).
C_train_dataloader = DataLoader(dataset=train_C, batch_size=120, shuffle=True, num_workers=16)
C_val_dataloader = DataLoader(dataset=val_C, batch_size=120, shuffle=False, num_workers=16)

In [None]:
# One (num_embeddings, embedding_dim) pair per column of categorical_C, same order.
embedding_dim_C = [
    (443, 50),  # TAXI_ID
    (96, 48),   # QTRHR
    (7, 4),     # WK
    (52, 26),   # WKYR
    (3, 2),     # HOLIDAY
]

model_C = MyModel(embedding_dim_C)
model_C = model_C.to(device)

# Rebinds the shared `optimizer` / `loss_fn` names to model C.
optimizer = optim.Adam(params=model_C.parameters(), lr=0.007)
loss_fn = nn.MSELoss()

In [None]:
# Baseline for the loss curves: score model C in eval mode before the training
# cell runs. Each recorded value is the mean of the per-batch RMSEs.
model_C.eval()

init_train_loss = 0.0
with torch.no_grad(), tqdm(C_train_dataloader, desc="Initial Training Loss") as progress:
    for inputs, targets in progress:
        inputs, targets = inputs.to(device), targets.to(device)

        batch_rmse = torch.sqrt(loss_fn(model_C(inputs), targets)).item()

        init_train_loss += batch_rmse
        progress.set_postfix({"Train Loss": batch_rmse})

init_train_loss = init_train_loss / len(C_train_dataloader)
train_loss_list = [init_train_loss]

init_val_loss = 0.0
with torch.no_grad(), tqdm(C_val_dataloader, desc="Initial Validation Loss") as progress:
    for inputs, targets in progress:
        inputs, targets = inputs.to(device), targets.to(device)

        batch_rmse = torch.sqrt(loss_fn(model_C(inputs), targets)).item()

        init_val_loss += batch_rmse
        progress.set_postfix({"Val Loss": batch_rmse})

init_val_loss = init_val_loss / len(C_val_dataloader)
val_loss_list = [init_val_loss]

In [None]:
# Train model C for a fixed number of epochs. After each epoch the mean
# per-batch RMSE on the training and validation loaders is recorded, and the
# user is asked whether to checkpoint the current weights.
max_epochs = 16
for epoch in range(max_epochs):
    start = time.time()

    # ---- training pass (dropout active, batch-norm statistics updated) ----
    model_C.train()
    train_loss = 0.0
    with tqdm(C_train_dataloader, desc=f"Epoch {epoch+1}/{max_epochs} - Training") as progress:
        for batch_idx, (inputs, targets) in enumerate(progress):
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            loss = torch.sqrt(loss_fn(model_C(inputs), targets))
            loss.backward()
            optimizer.step()

            batch_rmse = loss.item()
            train_loss += batch_rmse
            progress.set_postfix({"Train Loss": batch_rmse})

    train_loss = train_loss / len(C_train_dataloader)

    # ---- validation pass (eval mode, no gradients) ----
    model_C.eval()
    val_loss = 0.0
    with torch.no_grad(), tqdm(C_val_dataloader, desc=f"Epoch {epoch+1}/{max_epochs} - Validation") as progress:
        for inputs, targets in progress:
            inputs, targets = inputs.to(device), targets.to(device)

            batch_rmse = torch.sqrt(loss_fn(model_C(inputs), targets)).item()

            val_loss += batch_rmse
            progress.set_postfix({"Val Loss": batch_rmse})

    val_loss = val_loss / len(C_val_dataloader)

    print(f"Epoch {epoch+1}/{max_epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f} - Time: {(time.time()-start):.2f}")
    train_loss_list.append(train_loss)
    val_loss_list.append(val_loss)

    # Interactive checkpointing: this blocks until the user answers.
    # NOTE(review): weights are written to './C.pt', while the inference cell
    # loads 'model_weights/C.pt' — confirm the file is moved or align the paths.
    save = input('Save state at current epoch?: (y/n) ')
    if save == 'y':
        torch.save(model_C.state_dict(), './C.pt')

In [None]:
# Loss curves for model C; index 0 is the pre-training baseline.
x = range(len(train_loss_list))
ax = plt.gca()
ax.plot(x, train_loss_list, label='Training Loss')
ax.plot(x, val_loss_list, label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('RMSE Loss')
ax.set_title('Loss Per Epoch - Model C')
ax.legend()
#plt.savefig('loss.png', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
class EvalPandasDataset(Dataset):
    """Inference dataset yielding ``(trip_id, categorical_features)`` pairs.

    Args:
        dataframe: frame containing a ``TRIP_ID`` column plus the feature columns.
        cat_cols: names of the integer-coded categorical columns (stored as int32).
    """

    def __init__(self, dataframe, cat_cols):
        self.categorical = torch.tensor(dataframe[cat_cols].values, dtype=torch.int32)
        self.ids = dataframe['TRIP_ID']

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.categorical)

    def __getitem__(self, index):
        """Return the trip id and feature row at position ``index``."""
        # Positional lookup: plain ``self.ids[index]`` is label-based on a Series,
        # so a filtered frame whose index is not 0..n-1 would raise KeyError or
        # pair an id with the wrong feature row.
        return self.ids.iloc[index], self.categorical[index]

In [None]:
# Build the evaluation frame and one DataLoader per CALL_TYPE.
# NOTE(review): this reads the first 10,000 rows of the *training* CSV as the
# evaluation set and writes a scratch copy to 'lol.csv' — confirm whether the
# real test file should be loaded here instead.
test = pd.read_csv('processed_train.csv')
# .copy() turns the slice into an independent frame, so the column assignment
# below does not trigger SettingWithCopyWarning or depend on view semantics.
test = test[:10000].copy()
test['TRIP_ID'] = test['TRIP_ID'].astype(str)
test.to_csv('lol.csv', index=False)

# One frame per call type, renumbered from 0.
A_data_test = test[test['CALL_TYPE'] == 'A'].reset_index()
B_data_test = test[test['CALL_TYPE'] == 'B'].reset_index()
C_data_test = test[test['CALL_TYPE'] == 'C'].reset_index()

A_test = EvalPandasDataset(A_data_test, categorical_A)
B_test = EvalPandasDataset(B_data_test, categorical_B)
C_test = EvalPandasDataset(C_data_test, categorical_C)

# shuffle=False keeps ids and predictions in frame order.
A_test_dataloader = DataLoader(A_test, batch_size=64, shuffle=False, num_workers=8)
B_test_dataloader = DataLoader(B_test, batch_size=64, shuffle=False, num_workers=8)
C_test_dataloader = DataLoader(C_test, batch_size=64, shuffle=False, num_workers=8)

In [None]:
# Display the evaluation frame built in the previous cell as a visual sanity check.
test

In [None]:
def _predict(model, weights_path, dataloader, desc):
    """Load trained weights into ``model`` and score every batch of ``dataloader``.

    Args:
        model: network already moved to ``device``.
        weights_path: file holding the ``state_dict`` to load.
        dataloader: yields ``(trip_ids, inputs)`` batches.
        desc: label shown on the progress bar.

    Returns:
        ``(trip_ids, scores)``: a list of ids and a list of 1-D prediction
        tensors, both in dataloader order.
    """
    # map_location lets weights saved on a GPU be restored on a CPU-only machine.
    model.load_state_dict(torch.load(weights_path, map_location=device))
    model.eval()
    trip_ids, scores = [], []
    with torch.no_grad(), tqdm(dataloader, desc=desc) as progress:
        for batch_ids, inputs in progress:
            outputs = model(inputs.to(device))
            trip_ids.extend(batch_ids)
            # reshape(-1) keeps a single-row batch 1-D so torch.cat accepts it.
            scores.append(outputs.reshape(-1))
    return trip_ids, scores


# Fresh networks; their weights are replaced by the saved checkpoints below.
model_A = MyModel(embedding_dim_A).to(device)
model_B = MyModel(embedding_dim_B).to(device)
model_C = MyModel(embedding_dim_C).to(device)

# Accumulate in lists and concatenate once at the end. This also works when a
# call type has no rows; the previous version only initialised `ids`/`score`
# inside the first A batch.
ids = []
score_chunks = []
for part_ids, part_scores in (
    _predict(model_A, 'model_weights/A.pt', A_test_dataloader, "A"),
    _predict(model_B, 'model_weights/B.pt', B_test_dataloader, "B"),
    _predict(model_C, 'model_weights/C.pt', C_test_dataloader, "C"),
):
    ids.extend(part_ids)
    score_chunks.extend(part_scores)

# Predictions for A, then B, then C, aligned one-to-one with `ids`.
score = torch.cat(score_chunks) if score_chunks else torch.tensor([], device=device)

In [None]:
def extract_id(value):
    """Return the integer obtained by dropping the first character of ``value``."""
    numeric_part = value[1:]
    return int(numeric_part)


# Pair every trip id with its prediction (moved to host memory first).
df = pd.DataFrame({'TRIP_ID': ids, 'TRAVEL_TIME': score.cpu()})

# Order the rows by the numeric part of TRIP_ID.
sort_order = df['TRIP_ID'].map(extract_id).argsort()
df_sorted = df.iloc[sort_order]

In [None]:
# Display the sorted predictions before they are written to disk.
df_sorted

In [None]:
# Write the sorted predictions without the DataFrame index column.
df_sorted.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Number of trainable parameters in model A.
sum(weight.numel() for weight in model_A.parameters() if weight.requires_grad)

In [None]:
# Number of trainable parameters in model B.
sum(weight.numel() for weight in model_B.parameters() if weight.requires_grad)

In [None]:
# Number of trainable parameters in model C.
sum(weight.numel() for weight in model_C.parameters() if weight.requires_grad)