In [1]:
import warnings

warnings.filterwarnings("ignore")

import os
from torch.utils import data
from dataset import*
from config import settings
from torch.utils.tensorboard import SummaryWriter
from model import*
from train_test import *
from sklearn.preprocessing import StandardScaler, MinMaxScaler


## Load training data

In [2]:
# Per-feature-group scaler configuration handed to load_data.
# A value of None means no scaler object is supplied for that group.
transformers = {
    'year': MinMaxScaler(),    # normalize
    'weekday': None,
    'time': StandardScaler(),  # standardize
    'weather': None,
}

# load_data returns the prepared frame together with the transformers dict,
# which is rebound here and passed on to every DataFolder below.
df, transformers = load_data(
    'train',
    total_sample=1000,
    random_sample=settings.totalN,
    scaling_transformers=transformers,
)

loaded csv file shape: (1000, 8)
setting time info...
setting geo info...
counting net fare...


In [7]:
# Summary statistics of the loaded training frame (sanity check of ranges and scaling).
df.describe()

Unnamed: 0,fare_amount,passenger_count,year,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,hour,is_holiday,temperature,weathercode,distance,total_fixed_fees,net_fare
count,948.0,948.0,948.0,948.0,948.0,948.0,948.0,948.0,948.0,948.0,948.0,948.0,948.0,948.0,948.0,948.0,948.0
mean,11.593365,1.597046,0.466245,0.131857,0.155063,0.140295,0.158228,0.14557,0.130802,0.138186,0.019515,0.293249,11.698418,7.614979,2.10693,3.349684,8.243681
std,9.397605,1.218509,0.305274,0.338513,0.362156,0.347477,0.365147,0.35286,0.337361,0.345277,0.980635,0.455492,10.456101,17.771293,2.310894,0.411464,9.420634
min,3.3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.096872,0.0,-17.7,0.0,0.000166,2.5,0.1
25%,6.0,1.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.691985,0.0,3.7,0.0,0.805291,3.0,2.6
50%,8.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088508,0.0,12.15,1.0,1.359922,3.3,5.3
75%,13.3,2.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.869001,1.0,20.4,3.0,2.538643,3.5,9.9
max,66.3,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.493395,1.0,37.9,75.0,24.529855,4.3,63.8


In [8]:
# Row / column count of the training frame after loading.
print(df.shape)

(948, 24)


In [9]:
# Column names of the training frame.
df.columns

Index(['fare_amount', 'passenger_count', 'year', 'Sunday', 'Monday', 'Tuesday',
       'Wednesday', 'Thursday', 'Friday', 'Saturday', 'hour', 'is_holiday',
       'temperature', 'weathercode', 'distance', 'from_JKF', 'to_JKF',
       'from_LGA', 'to_LGA', 'to_EWR', 'from_Manhattan', 'to_Manhattan',
       'total_fixed_fees', 'net_fare'],
      dtype='object')

## Training and validation data loader

In [4]:
# Mini-batch size shared by the train, validation and test loaders.
BATCH_SIZE = 64

# Training split: reshuffled every epoch, loaded by 4 worker processes,
# and the final partial batch is kept.
train_dataset = DataFolder(df=df, split='train', transformers=transformers)
train_dataloader = data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    drop_last=False,
)

In [None]:
# Validation split: fixed order, loaded in the main process,
# and the final partial batch is kept.
valid_dataset = DataFolder(df=df, split='valid', transformers=transformers)
valid_dataloader = data.DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    drop_last=False,
)

## Mode selection: train or validate

In [None]:
# True: build a fresh model, train it and save it.
# False: load the saved weights and the saved loss curve instead.
Mode_train = True

## Device

In [None]:
# Pick the compute device. GPU index 2 is preferred, but it is only used when
# that many GPUs are actually visible; otherwise fall back to GPU 0 so the
# notebook also runs on single-GPU machines, and to the CPU without CUDA.
preferred_gpu = 2
if torch.cuda.is_available():
    gpu_index = preferred_gpu if torch.cuda.device_count() > preferred_gpu else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")
print(f"Using {device} device")

Using cpu device


## Model selection

In [None]:
# Experiment tag; it names the checkpoint file, the loss-curve file and the
# TensorBoard log directory.
model_name = 'FCNN32 with BN'
save_model_name = f'./model/{model_name}.pth'
loss_filename = f'./loss curve/{model_name}.txt'

In [None]:
# Peek at the feature matrix the training dataset exposes to the model.
# NOTE(review): the recorded output shows dtype=object and an unscaled year
# column - confirm this is what the model is meant to receive.
train_dataset.features

array([[2, 2015, 0, ..., True, 4.3, 6.7],
       [6, 2014, 0, ..., True, 3.0, 2.0],
       [1, 2012, 0, ..., True, 4.0, 5.699999999999999],
       ...,
       [1, 2014, 0, ..., False, 3.5, 12.0],
       [2, 2010, 0, ..., True, 3.5, 2.2],
       [2, 2014, 0, ..., True, 4.0, 6.0]], dtype=object)

In [None]:
# Build the network with an input layer sized to the training feature matrix.
# In evaluation mode (Mode_train is False) the weights saved by an earlier
# training run are restored before the model is moved to the target device.
input_dim = np.size(train_dataset.features, 1)
print(f'feature dimension = {input_dim}')
model = FCNN(input_dim=input_dim)
if not Mode_train:
    # map_location remaps tensors that were saved on a different device
    # (e.g. a GPU that is not present here), so the checkpoint also loads
    # on CPU-only machines.
    model.load_state_dict(torch.load(save_model_name, map_location=device))
model = model.to(device)
print(model)

feature dimension = 23
FCNN(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=23, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=32, bias=True)
    (3): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): ReLU()
    (5): Linear(in_features=32, out_features=32, bias=True)
    (6): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): ReLU()
    (8): Linear(in_features=32, out_features=1, bias=True)
  )
)


## Loss function and optimizer

In [None]:
# Mean-squared-error objective, optimized with Adam plus L2 weight decay.
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)
# Alternative optimizer kept for experiments:
# optimizer2 = torch.optim.SGD(model.parameters(), lr=1e-2, weight_decay=0, momentum=0.9)

## Training epoch and stop condition

In [None]:
# Set to True by the training cell once it completes; while False, the
# training cell starts a fresh loss history.
trained = False

In [None]:
# TensorBoard writer; logs go to a directory named after the experiment tag.
writer = SummaryWriter('./logs/' + model_name)

In [None]:
# Train for a fixed number of epochs. Each epoch logs the training and
# validation loss to TensorBoard, prints them, and appends the training loss
# to `loss_record`, which the plotting and saving cells consume.
if Mode_train:
    epochs = 50
    if not trained:
        # First run of this cell: start an empty history. No placeholder
        # values are stored, so an interrupted run never leaves fake losses
        # behind in `loss_record`.
        loss_record = []
    for epoch in range(epochs):
        train_loss = train(train_dataloader, model, loss_fn, optimizer, writer=writer, record_batches=200)
        val_loss = val(valid_dataloader, model, loss_fn)
        writer.add_scalar("loss/training", train_loss, epoch+1)
        writer.add_scalar("loss/validation", val_loss, epoch+1)
        print(f"Epoch {epoch + 1:2d}: Loss = {train_loss:.4f}") 
        print(f'valid loss = {val_loss:.4f}')
        loss_record.append(train_loss)
        # Optional early stop (disabled): stop when the loss exceeds the mean
        # of the last five epochs by 5 %, or once it drops below 0.1.
        # if (len(loss_record) >= 5 and train_loss > sum(loss_record[-5:])/5*1.05) or train_loss < 0.1:
        #     print('Early stop!')
        #     break

    # Mark the model as trained so re-running this cell continues the
    # existing history instead of resetting it.
    trained = True
    print("Done!")

100%|██████████| 15/15 [00:23<00:00,  1.54s/it]


KeyboardInterrupt: 

## Plot training loss curve

In [None]:
import matplotlib.pyplot as plt

def plot_loss_curve(loss_list):
    """Draw `loss_list` as a line plot titled 'Loss Curve' and show it.

    Args:
        loss_list: sequence of loss values, plotted against their index;
            the x axis is labelled 'Epoch' and the y axis 'Loss'.
    """
    # Decorate the current axes first, then draw the curve on them.
    plt.title('Loss Curve')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.plot(loss_list)
    plt.show()

In [None]:
# Plot the loss history: straight from memory after training, otherwise
# reloaded from the text file written by the save cell.
if Mode_train:
    plot_loss_curve(loss_record)
else:
    print('Auto load loss curve')
    with open(loss_filename) as fh:
        # The file holds the str() of a Python list, e.g. "[1.0, 2.0]".
        # Strip surrounding whitespace and brackets, then skip empty tokens,
        # so a trailing newline or an empty list "[]" does not break parsing.
        raw = fh.readline().strip()
        tokens = raw.strip('[]').split(',')
        loss_record = [float(tok) for tok in tokens if tok.strip()]
    plot_loss_curve(loss_record)

## Evaluate

In [None]:
# Final loss of the current model on the validation loader.
# (The matching evaluation on the training loader is kept disabled below.)
#final_train_loss = val(train_dataloader, model, loss_fn)
final_val_loss = val(valid_dataloader, model, loss_fn)
print(f'final valid loss = {final_val_loss:.4f}')

100%|██████████| 1/1 [00:00<00:00, 584.41it/s]

final valid loss = 1.2435





## Save model

In [None]:
# Persist the trained weights and the loss history (training mode only).
if Mode_train:
    # Create the target folders first: neither torch.save nor open() creates
    # missing directories, so saving would otherwise fail on a fresh checkout.
    for out_path in (save_model_name, loss_filename):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    torch.save(model.state_dict(), save_model_name)
    with open(loss_filename, 'w') as fh:
        # Single-line text form of the list, e.g. "[1.0, 2.0]"; the
        # loss-curve cell parses this format when Mode_train is False.
        fh.write(str(loss_record))

## Output test result

In [5]:
# Test split: built without a dataframe argument, reusing the transformers
# returned by load_data. Order is kept fixed and the final partial batch is
# kept.
test_dataset = DataFolder(transformers=transformers, split='test')
test_dataloader = data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    drop_last=False,
)

loaded test csv file shape: (9914, 7)
setting time info...
setting geo info...
counting fixed fee...


In [6]:
# Output frame for the predictions, starting with the key of every test row.
df_test = pd.DataFrame(test_dataset.key_list, columns=["key"])

In [7]:
# Sanity check: count and uniqueness of the test keys.
df_test.describe()

Unnamed: 0,key
count,9914
unique,9914
top,2015-01-27 13:08:24.0000002
freq,1


In [None]:


# Run the model on the test loader and write the predictions to a CSV file.
# NOTE(review): the recorded run failed here with a 64x22 vs 23x32 shape
# error, i.e. the test features have one column fewer than the features the
# model was built for - the train/test feature sets need to be aligned in
# DataFolder before this cell can succeed.
predictions = test(test_dataloader, model)

# Add the fixed fees to the model output to obtain the fare amount.
# NOTE(review): df_test is created with only a "key" column in this notebook;
# confirm where "total_fixed_fees" is supposed to be added before this line.
df_test["fare_amount"] = predictions+df_test["total_fixed_fees"]
# Save the dataframe as a CSV file
df_test.to_csv("predictions.csv", index=False)

  0%|          | 0/155 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x22 and 23x32)