In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Subset
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Select the 'high' float32 matmul precision mode for the whole process.
torch.set_float32_matmul_precision('high')

class PandasDataset(Dataset):
    """Serve ``(features, target)`` pairs taken from a DataFrame.

    The selected feature columns and the target column are copied into
    float32 tensors once, at construction time.

    Args:
        dataframe: Source table.
        cat_cols: Column labels used as the feature vector.
        target_col: Column label of the regression target.
    """

    def __init__(self, dataframe, cat_cols, target_col):
        feature_values = dataframe[cat_cols].values
        target_values = dataframe[target_col].values
        self.categorical = torch.tensor(feature_values, dtype=torch.float32)
        #self.continuous = torch.tensor(dataframe[cont_cols].values, dtype=torch.float32)
        self.targets = torch.tensor(target_values, dtype=torch.float32)

    def __len__(self):
        """Number of samples, i.e. the length of the target tensor."""
        return len(self.targets)

    def __getitem__(self, index):
        """Return the feature tensor and target stored at ``index``."""
        features = self.categorical[index]
        target = self.targets[index]
        return features, target

class MyModel(nn.Module):
    """Three-layer fully connected regressor (in_features -> 128 -> 64 -> 1).

    Each hidden layer is Linear -> ReLU -> Dropout -> BatchNorm1d. The output
    layer is Linear -> ReLU, so predictions are never negative.

    Args:
        in_features: Width of the input feature vector. Defaults to 10.
        dropout: Drop probability shared by both dropout applications.
            Defaults to 0.7.
    """

    def __init__(self, in_features=10, dropout=0.7):
        super().__init__()
        self.linear1 = nn.Linear(in_features, 128)
        self.batch_norm1 = nn.BatchNorm1d(128)
        self.linear2 = nn.Linear(128, 64)
        self.batch_norm2 = nn.BatchNorm1d(64)
        self.linear3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to a ``(batch,)`` prediction tensor."""
        x = F.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.batch_norm1(x)
        x = F.relu(self.linear2(x))
        x = self.dropout(x)
        x = self.batch_norm2(x)
        # NOTE(review): the ReLU on the single output unit clamps predictions to
        # >= 0; if its pre-activation goes negative for every sample the
        # gradient is zero. Kept as-is to preserve the trained behaviour.
        x = F.relu(self.linear3(x))
        # Remove only the trailing size-1 feature axis. A bare squeeze() would
        # also remove the batch axis for a one-row batch and return a 0-d tensor.
        return x.squeeze(-1)

# Load the training table from 'processed_train.csv' (path relative to the
# working directory). Later cells read its CALL_TYPE, TRAVEL_TIME and feature columns.
data = pd.read_csv('processed_train.csv')

In [None]:
# One-hot encode CALL_TYPE with a fixed category order (A, B, C) so the encoded
# columns are always CALL_TYPE_A / CALL_TYPE_B / CALL_TYPE_C.
one_hot = OneHotEncoder(categories=[['A', 'B', 'C']])
one_hot.fit(data[['CALL_TYPE']])
encoded_data = one_hot.transform(data[['CALL_TYPE']]).toarray()
encoded_df = pd.DataFrame(
    encoded_data,
    columns=one_hot.get_feature_names_out(['CALL_TYPE']),
)

# Put the indicator columns in front of the original ones, then discard the
# raw CALL_TYPE column they replace.
data = pd.concat([encoded_df, data], axis=1).drop(columns='CALL_TYPE')

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
# drop=True discards the shuffled original row labels instead of keeping them
# as a stray 'index' column in both frames.
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

# Feature columns passed to the network, in input order. All of them are cast
# to float32 by PandasDataset.
categorical = [
    'CALL_TYPE_A',
    'CALL_TYPE_B',
    'CALL_TYPE_C',
    'ORIGIN_CALL',
    'ORIGIN_STAND',
    'TAXI_ID',
    'QTRHR',
    'WK',
    'WKYR',
    'HOLIDAY'
]
train = PandasDataset(train_data, cat_cols=categorical, target_col='TRAVEL_TIME')
val = PandasDataset(val_data, cat_cols=categorical, target_col='TRAVEL_TIME')
# Only the training loader shuffles; validation order is kept fixed.
train_dataloader = DataLoader(train, batch_size=512, shuffle=True, num_workers=16)
val_dataloader = DataLoader(val, batch_size=512, shuffle=False, num_workers=16)

In [None]:
# Build the network and move its parameters to the selected device.
model = MyModel()
model = model.to(device)

# Adam over every model parameter with a learning rate of 0.003.
optimizer = optim.Adam(params=model.parameters(), lr=0.003)

# Mean-squared-error criterion; the loops below take its square root so the
# optimised and reported quantity is a per-batch RMSE.
loss_fn = nn.MSELoss()

In [None]:
def _mean_batch_rmse(net, dataloader, desc, postfix_key):
    """Return the mean of the per-batch RMSE values of ``net`` over ``dataloader``.

    Runs without gradient tracking and shows a tqdm bar labelled ``desc`` whose
    postfix reports the latest batch RMSE under ``postfix_key``.

    Note that every batch contributes equally (the sum is divided by the number
    of batches), so this is an average of batch RMSEs rather than the RMSE over
    all samples. It matches how the training loop reports its losses.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no batches.
    """
    running_total = 0.0
    with torch.no_grad(), tqdm(dataloader, desc=desc) as progress:
        for inputs, targets in progress:
            inputs = inputs.to(device)
            targets = targets.to(device)

            batch_rmse = torch.sqrt(loss_fn(net(inputs), targets)).item()

            running_total += batch_rmse
            progress.set_postfix({postfix_key: batch_rmse})
    return running_total / len(dataloader)


# Evaluate the untrained model so the loss curves start at "epoch 0".
model.eval()

train_loss_list = []
init_train_loss = _mean_batch_rmse(model, train_dataloader, "Initial Training Loss", "Train Loss")
train_loss_list.append(init_train_loss)

val_loss_list = []
init_val_loss = _mean_batch_rmse(model, val_dataloader, "Initial Validation Loss", "Val Loss")
val_loss_list.append(init_val_loss)

In [None]:
max_epochs = 4
for epoch in range(max_epochs):
    # ---- Training pass: optimise the per-batch RMSE ----
    model.train()
    train_loss = 0.0
    with tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{max_epochs} - Training") as progress:
        for features, labels in progress:
            features, labels = features.to(device), labels.to(device)

            optimizer.zero_grad()
            batch_rmse = torch.sqrt(loss_fn(model(features), labels))
            batch_rmse.backward()
            optimizer.step()

            batch_value = batch_rmse.item()
            train_loss += batch_value
            progress.set_postfix({"Train Loss": batch_value})

    # Average of the batch RMSEs (each batch weighted equally).
    train_loss = train_loss / len(train_dataloader)

    # ---- Validation pass: same metric, no gradient tracking ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        with tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{max_epochs} - Validation") as progress:
            for features, labels in progress:
                features, labels = features.to(device), labels.to(device)

                batch_value = torch.sqrt(loss_fn(model(features), labels)).item()

                val_loss += batch_value
                progress.set_postfix({"Val Loss": batch_value})

    val_loss = val_loss / len(val_dataloader)

    print(f"Epoch {epoch+1}/{max_epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")
    # Extend the curves that already hold the pre-training values.
    train_loss_list.append(train_loss)
    val_loss_list.append(val_loss)

In [None]:
# Both lists have one entry per evaluation: index 0 is the pre-training value,
# followed by one entry per epoch.
x = range(len(train_loss_list))
for series, label in (
    (train_loss_list, 'Training Loss'),
    (val_loss_list, 'Validation Loss'),
):
    plt.plot(x, series, label=label)

plt.xlabel('Epochs')
plt.ylabel('RMSE Loss')
plt.title('Loss Per Epoch - Model A')
plt.legend()
#plt.savefig('loss.png', dpi=600, bbox_inches='tight')
plt.show()


In [None]:
#torch.save(model.state_dict(), './A.pt')

In [None]:
class EvalPandasDataset(Dataset):
    """Serve ``(trip_id, features)`` pairs from an unlabelled DataFrame.

    Args:
        dataframe: Source table; must contain a 'TRIP_ID' column.
        cat_cols: Column labels used as the feature vector (cast to float32).
    """

    def __init__(self, dataframe, cat_cols):
        self.categorical = torch.tensor(dataframe[cat_cols].values, dtype=torch.float32)
        # Store the IDs as a plain array so __getitem__ indexes by position.
        # Indexing the Series itself is label-based and breaks (or returns the
        # wrong row) whenever the DataFrame index is not 0..n-1.
        self.ids = dataframe['TRIP_ID'].values

    def __len__(self):
        """Number of samples, i.e. the number of feature rows."""
        return len(self.categorical)

    def __getitem__(self, index):
        """Return the trip ID and the feature tensor at position ``index``."""
        return self.ids[index], self.categorical[index]

In [None]:
test_data = pd.read_csv('processed_test.csv')

# Reuse the encoder fitted on the training data: transform only, never re-fit
# a preprocessing step on the test set.
encoded_data = one_hot.transform(test_data[['CALL_TYPE']]).toarray()
encoded_df = pd.DataFrame(
    encoded_data,
    columns=one_hot.get_feature_names_out(['CALL_TYPE']),
    # Share the test frame's index so the column-wise concat lines rows up by
    # label even if that index is not the default 0..n-1.
    index=test_data.index,
)
test_data = pd.concat([encoded_df, test_data], axis=1)
test_data = test_data.drop('CALL_TYPE', axis=1)

test = EvalPandasDataset(test_data, categorical)
# shuffle=False keeps predictions in the same order as the rows of test_data.
test_dataloader = DataLoader(test, batch_size=64, shuffle=False, num_workers=8)

In [None]:
# Predict over the test set, collecting trip IDs and predictions in loader order.
model.eval()
ids = []
score_chunks = []
with torch.no_grad(), tqdm(test_dataloader, desc="Testing") as progress:
    for batch_ids, inputs in progress:
        outputs = model(inputs.to(device))

        ids.extend(batch_ids)
        # reshape(-1) keeps a one-row batch 1-D even if the model returns a
        # 0-d tensor, which torch.cat would otherwise reject.
        score_chunks.append(outputs.reshape(-1))

# Concatenate once at the end instead of re-copying the running result on
# every batch. An empty loader yields an empty prediction tensor.
score = torch.cat(score_chunks) if score_chunks else torch.empty(0, device=device)

In [None]:
def extract_id(value):
    """Return the integer formed by everything after the first character of ``value``."""
    numeric_part = value[1:]
    return int(numeric_part)


# Pair each trip ID with its prediction (moved to CPU memory first).
df = pd.DataFrame({'TRIP_ID': ids, 'TRAVEL_TIME': score.cpu()})

# Order the rows by the numeric part of the ID rather than lexicographically.
sort_positions = df['TRIP_ID'].map(extract_id).argsort()
df_sorted = df.iloc[sort_positions]

In [None]:
# Display the prediction table, ordered by the numeric part of TRIP_ID.
df_sorted

In [None]:
# Write the sorted predictions to 'submission.csv' without the DataFrame index column.
df_sorted.to_csv('submission.csv', index=False)

In [None]:
# Total number of scalar values across the parameters that receive gradients.
sum(param.numel() for param in filter(lambda t: t.requires_grad, model.parameters()))