In [1]:
import warnings

warnings.filterwarnings("ignore")

import os
from torch.utils import data
from dataset import*
from config import settings
from model import*
from train_test import *


ModuleNotFoundError: No module named 'seaborn'

## Load training data

In [2]:
# Load the training split through the project-local `load_data` helper.
# NOTE(review): `random_sample=settings.totalN` presumably limits how many rows are sampled — confirm in dataset.py.
df = load_data('train', random_sample=settings.totalN)

loaded csv file shape: (55423856, 8)


In [3]:
# Preview the first rows to sanity-check the loaded columns.
print(df.head())

   fare_amount  passenger_count      year  Sunday  Monday  Tuesday  Wednesday   
0          4.5                1  0.000000   False   False    False      False  \
1         16.9                1  0.166667    True   False    False      False   
2          5.7                2  0.333333   False   False     True      False   
3          7.7                1  0.500000   False   False    False      False   
4          5.3                1  0.166667    True   False    False      False   

   Thursday  Friday  Saturday      hour  is_holiday  distance  from_JKF   
0     False   False      True  0.739130           0  0.640487     False  \
1     False   False     False  0.695652           0  5.250670     False   
2     False   False     False  0.000000           0  0.863411     False   
3      True   False     False  0.173913           1  1.739386     False   
4     False   False     False  0.304348           0  1.242218     False   

   to_JKF  from_LGA  to_LGA  to_EWR  from_Manhattan  to_Manhat

In [4]:
# Report the (rows, columns) shape of the loaded frame.
print(df.shape)

(53838850, 20)


## Training and validation data loader

In [5]:
# Mini-batch size used by every DataLoader in this notebook.
BATCH_SIZE = 128

# Training split built from the already-loaded frame.
train_dataset = DataFolder(df=df, split='train')

# Reshuffle each epoch, keep the final partial batch, load in the main process.
train_dataloader = data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=False,
    num_workers=0,
)

In [6]:
# Validation split built from the same frame as the training split.
valid_dataset = DataFolder(df=df, split='valid')

# Fixed order (no shuffling), keep the final partial batch, load in the main process.
valid_dataloader = data.DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

## Mode selection: train or validation

In [11]:
# True: train a fresh model in the cells below; False: load the saved weights
# from `save_model_name` and the saved loss curve from `loss_filename` instead.
Mode_train = True

## Device

In [8]:
# Pick the compute device. The original training machine used GPU index 4;
# on hosts that expose fewer GPUs "cuda:4" would be an invalid device ordinal,
# so fall back to the first GPU, and to the CPU when CUDA is unavailable.
if not torch.cuda.is_available():
    device = torch.device("cpu")
elif torch.cuda.device_count() > 4:
    device = torch.device("cuda:4")
else:
    device = torch.device("cuda:0")
print(f"Using {device} device")

Using cuda:4 device


## Model selection

In [23]:
class FCNN(nn.Module):
    """Fully connected regression network: stacked Linear+ReLU layers and a 1-unit linear head.

    With the default arguments the architecture, and therefore the
    ``state_dict`` keys ``linear_relu_stack.{0,2,4,6,8}.*``, is the same as
    the previous hard-coded 4 x 512 network, so existing checkpoints still load.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of every hidden layer (default 512).
        num_hidden_layers: Number of Linear+ReLU pairs placed before the
            output layer (default 4). Must be at least 1.

    Raises:
        ValueError: If ``num_hidden_layers`` is smaller than 1.
    """

    def __init__(self, input_dim, hidden_dim=512, num_hidden_layers=4):
        super().__init__()
        if num_hidden_layers < 1:
            raise ValueError("num_hidden_layers must be >= 1")

        # Build the stack in order so module indices (0, 2, 4, ...) stay stable.
        layers = []
        in_features = input_dim
        for _ in range(num_hidden_layers):
            layers.append(nn.Linear(in_features, hidden_dim))
            layers.append(nn.ReLU())
            in_features = hidden_dim
        # Single output unit, no activation: raw regression value.
        layers.append(nn.Linear(in_features, 1))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Map ``x`` of shape ``(..., input_dim)`` to predictions of shape ``(..., 1)``."""
        return self.linear_relu_stack(x)

In [24]:
# Checkpoint path: the model's state_dict is saved to / loaded from this file.
save_model_name = './model/FCNN512t512t512.pth'
# Text file that stores the recorded training-loss list for re-plotting.
loss_filename = './loss curve/FCNN512t512t512.txt'

In [25]:
# Number of input features = size of the feature matrix along axis 1.
input_dim = np.size(train_dataset.features, 1)
print(f'feature dimension = {input_dim}')

# Build the network once; in evaluation mode also restore the saved weights.
model = FCNN(input_dim=input_dim)
if not Mode_train:
    # map_location remaps tensors that were saved from another device
    # (e.g. a GPU index missing on this host) onto the device in use here,
    # so the checkpoint loads regardless of where it was trained.
    model.load_state_dict(torch.load(save_model_name, map_location=device))
model = model.to(device)

feature dimension = 19


## Loss function and optimizer

In [26]:
# Mean-squared-error objective for the single regression output.
loss_fn = nn.MSELoss()

# Adam over all model parameters, with weight decay as regularisation.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)

## Training epoch and stop condition

In [27]:
# Set to True by the training cell once it finishes, so that re-running that
# cell keeps appending to `loss_record` instead of resetting it.
trained = False

In [None]:
if Mode_train:
    epochs = 1
    # First run in this session: start an empty history. Later runs append,
    # so the curve covers every epoch trained so far.
    # No sentinel padding is used: if the loop is interrupted, `loss_record`
    # then holds only real losses and never leaks placeholder values into
    # the plot or the saved loss file.
    if not trained:
        loss_record = []
    for epoch in range(epochs):
        train_loss = train(train_dataloader, model, loss_fn, optimizer)
        val_loss = val(valid_dataloader, model, loss_fn)
        print(f"Epoch {epoch + 1:2d}: Loss = {train_loss:.4f}")
        print(f'valid loss = {val_loss:.4f}')
        loss_record.append(train_loss)
        # Early stopping (currently disabled): stop when the loss exceeds the
        # mean of the last five recorded losses by 5%, or drops below 0.1.
        # recent = loss_record[-5:]
        # if (len(recent) == 5 and train_loss > sum(recent) / 5 * 1.05) or train_loss < 0.1:
        #     print('Early stop!')
        #     break

    trained = True
    print("Done!")

  2%|███▍                                                                                                                                                               | 7215/336493 [01:19<55:55, 98.13it/s]

## Plot training loss curve

In [None]:
import matplotlib.pyplot as plt

def plot_loss_curve(loss_list):
    """Draw ``loss_list`` as a line on the current axes and show the figure.

    Args:
        loss_list: Sequence of loss values, plotted against their index.
    """
    axes = plt.gca()
    axes.plot(loss_list)
    axes.set_title('Loss Curve')
    axes.set_xlabel('Epoch')
    axes.set_ylabel('Loss')
    plt.show()

In [None]:
if Mode_train:
    # Training ran in this session: plot the in-memory history.
    plot_loss_curve(loss_record)
else:
    print('Auto load loss curve')
    with open(loss_filename) as fh:
        # The file's first line holds the list as text, e.g. "[0.5, 0.25]".
        raw = fh.readline().strip()
    # Strip the brackets and skip empty tokens so that a trailing newline
    # or an empty list ("[]") parses cleanly instead of raising ValueError.
    loss_record = [float(tok) for tok in raw.strip('[]').split(',') if tok.strip()]
    plot_loss_curve(loss_record)

## Evaluate

In [35]:
# Evaluate the current model on the validation loader and report the loss.
# The same evaluation on the training loader is left disabled:
# final_train_loss = val(train_dataloader, model, loss_fn)
final_val_loss = val(valid_dataloader, model, loss_fn)
print(f'final valid loss = {final_val_loss:.4f}')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84124/84124 [05:10<00:00, 271.07it/s]

final valid loss = 3.6969





## Save model

In [None]:
if Mode_train:
    # Create the output folders first; saving fails if they do not exist.
    os.makedirs(os.path.dirname(save_model_name) or '.', exist_ok=True)
    os.makedirs(os.path.dirname(loss_filename) or '.', exist_ok=True)

    # Persist only the weights (state_dict), matching how they are loaded back.
    torch.save(model.state_dict(), save_model_name)

    # Store the loss history as the text form of the list, e.g. "[0.5, 0.25]".
    with open(loss_filename, 'w') as fh:
        fh.write(str(loss_record))

## Output test result

In [None]:
# Test split; no frame is passed here, unlike the train/valid splits.
test_dataset = DataFolder(split='test')

# Fixed order so predictions line up with `test_dataset.key_list`;
# keep the final partial batch and load in the main process.
test_dataloader = data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

In [None]:
# Run the model over the test loader to obtain the predictions.
predictions = test(test_dataloader, model)

# Pair every test key with its predicted fare.
df_test = pd.DataFrame(test_dataset.key_list, columns=["key"])
df_test["fare_amount"] = predictions

# Save the dataframe as a CSV file (without the index column).
df_test.to_csv("predictions.csv", index=False)