In [1]:
import warnings

warnings.filterwarnings("ignore")

import os
from torch.utils import data
from dataset import*
from config import settings
from torch.utils.tensorboard import SummaryWriter
from model import*
from train_test import *
from sklearn.preprocessing import StandardScaler, MinMaxScaler


## load training data

In [None]:
transformers = {
        'year': MinMaxScaler(), # Normalize
        'weekday': None,
        'time': None, # Standardlize
        'weather': None
    }
df, transformers = load_data('train', total_sample=None, random_sample=settings.totalN, scaling_transformers=transformers)

loaded csv file shape: (55423856, 8)
setting time info...


In [54]:
df.describe()

Unnamed: 0,fare_amount,passenger_count,year,weekday,hour,is_holiday,temperature,weathercode,distance,night,...,downtown_pickup_distance,downtown_dropoff_distance,jfk_pickup_distance,jfk_dropoff_distance,ewr_pickup_distance,ewr_dropoff_distance,lgr_pickup_distance,lgr_dropoff_distance,total_fixed_fees,net_fare
count,52475110.0,52475110.0,52475110.0,52475110.0,52475110.0,52475110.0,52475110.0,52475110.0,52475110.0,52475110.0,...,52475110.0,52475110.0,52475110.0,52475110.0,52475110.0,52475110.0,52475110.0,52475110.0,52475110.0,52475110.0
mean,11.47387,1.692577,0.4574819,3.041045,13.48527,0.3038481,12.04517,8.867734,2.121438,0.1963741,...,0.07158603,0.07460905,0.3003075,0.3000146,0.2620599,0.2636354,0.1333274,0.1337482,3.384471,8.089403
std,9.510953,1.307598,0.3109721,1.948616,6.514131,0.4599178,10.47299,19.58844,2.339556,0.3972547,...,0.05073947,0.04973799,0.04554721,0.04180465,0.04914464,0.05046139,0.03895721,0.03961764,0.7001533,9.426395
min,2.51,1.0,0.0,0.0,0.0,0.0,-20.0,0.0,4.442711e-05,0.0,...,4e-07,5.2e-06,2.45555e-05,1.24445e-05,0.000832,0.00143,3.7e-05,6.6e-05,2.5,0.01
25%,6.1,1.0,0.1666667,1.0,9.0,0.0,3.9,0.0,0.8195025,0.0,...,0.03868979,0.0415556,0.2964441,0.2955281,0.230357,0.231512,0.113516,0.113105,3.0,2.7
50%,8.5,1.0,0.5,3.0,14.0,0.0,12.5,1.0,1.377249,0.0,...,0.0652172,0.0677152,0.3059571,0.3058321,0.2576212,0.259063,0.130465,0.129237,3.5,5.1
75%,12.9,2.0,0.6666667,5.0,19.0,1.0,20.5,3.0,2.481446,0.0,...,0.0918332,0.0976982,0.3174131,0.3175491,0.28444,0.288707,0.156411,0.155634,3.5,9.5
max,165.75,6.0,1.0,6.0,23.0,1.0,38.8,75.0,79.74493,1.0,...,2.674394,2.691996,2.520228,2.53783,2.867172,2.884774,2.482172,2.499774,21.3,149.95


In [None]:
print(df.shape)

In [7]:
df.columns

Index(['fare_amount', 'passenger_count', 'year', 'Sunday', 'Monday', 'Tuesday',
       'Wednesday', 'Thursday', 'Friday', 'Saturday', 'weekday', 'hour',
       'is_holiday', 'temperature', 'weathercode', 'distance', 'night',
       'late_night', 'from_JKF', 'to_JKF', 'from_LGA', 'to_LGA', 'to_EWR',
       'from_Manhattan', 'to_Manhattan', 'direction', 'latdiff', 'londiff',
       'euclidean', 'manhattan', 'downtown_pickup_distance',
       'downtown_dropoff_distance', 'jfk_pickup_distance',
       'jfk_dropoff_distance', 'ewr_pickup_distance', 'ewr_dropoff_distance',
       'lgr_pickup_distance', 'lgr_dropoff_distance', 'total_fixed_fees',
       'net_fare'],
      dtype='object')

## Training and validation data loader

In [8]:
BATCH_SIZE = 128
train_dataset = DataFolder(split='train', df=df, transformers=transformers)
train_dataloader = data.DataLoader(dataset=train_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  drop_last=False,
                                  num_workers=4)

In [9]:
valid_dataset = DataFolder(split='valid', df=df, transformers=transformers)
valid_dataloader = data.DataLoader(dataset=valid_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=False,
                                  drop_last=False,
                                  num_workers=0)

## Mode select val or train

In [10]:
Mode_train = True

## Device

In [11]:
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cuda:2 device


## Model select

In [12]:
model_name = 'FCNN64_0524_03'
save_model_name = './model/'+ model_name +'.pth'
loss_filename = './loss curve/' + model_name + '.txt'

In [53]:
train_dataset.df.describe()

Unnamed: 0,fare_amount,passenger_count,year,weekday,hour,is_holiday,temperature,weathercode,distance,night,...,downtown_pickup_distance,downtown_dropoff_distance,jfk_pickup_distance,jfk_dropoff_distance,ewr_pickup_distance,ewr_dropoff_distance,lgr_pickup_distance,lgr_dropoff_distance,total_fixed_fees,net_fare
count,49851350.0,49851350.0,49851350.0,49851350.0,49851350.0,49851350.0,49851350.0,49851350.0,49851350.0,49851350.0,...,49851350.0,49851350.0,49851350.0,49851350.0,49851350.0,49851350.0,49851350.0,49851350.0,49851350.0,49851350.0
mean,11.47422,1.692666,0.4574878,3.041145,13.48495,0.3038748,12.04442,8.868145,2.121516,0.1963495,...,0.07158792,0.07460821,0.3003051,0.3000143,0.2620616,0.2636339,0.1333275,0.1337484,3.384476,8.089744
std,9.511529,1.307671,0.3109711,1.948608,6.514162,0.4599292,10.47314,19.58917,2.339824,0.3972359,...,0.05074647,0.04973034,0.04555417,0.04179804,0.04915215,0.05045599,0.03896132,0.03961169,0.7004598,9.426847
min,2.51,1.0,0.0,0.0,0.0,0.0,-20.0,0.0,4.442711e-05,0.0,...,4e-07,5.2e-06,2.45555e-05,1.24445e-05,0.000832,0.00143,3.7e-05,6.6e-05,2.5,0.01
25%,6.1,1.0,0.1666667,1.0,9.0,0.0,3.9,0.0,0.8194663,0.0,...,0.0386882,0.0415556,0.2964431,0.2955273,0.230355,0.231512,0.113515,0.113105,3.0,2.7
50%,8.5,1.0,0.5,3.0,14.0,0.0,12.5,1.0,1.377266,0.0,...,0.0652172,0.0677142,0.3059561,0.3058321,0.257622,0.259062,0.130466,0.129238,3.5,5.1
75%,12.9,2.0,0.6666667,5.0,19.0,1.0,20.5,3.0,2.481454,0.0,...,0.0918352,0.09769934,0.3174131,0.3175481,0.284442,0.288708,0.156413,0.155634,3.5,9.5
max,165.75,6.0,1.0,6.0,23.0,1.0,38.8,75.0,79.74493,1.0,...,2.674394,2.691996,2.520228,2.53783,2.867172,2.884774,2.482172,2.499774,21.3,149.95


In [14]:
input_dim = np.size(train_dataset.features, 1)
print(f'feature dimension = {input_dim}')
if Mode_train:
    model = FCNN(input_dim=input_dim).to(device) 
else:
    model = FCNN(input_dim=input_dim)
    model.load_state_dict(torch.load(save_model_name))
    model = model.to(device)
print(model)

feature dimension = 38
FCNN(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=38, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
)


## Loss function and optimizer

In [15]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
# optimizer2 = torch.optim.SGD(model.parameters(), lr=1e-2, weight_decay=0, momentum=0.9)

## Training epoch and stop condition

In [16]:
trained = False

In [17]:
writer = SummaryWriter('./logs/' + model_name)

2023-05-25 00:05:03.729663: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.


In [18]:
if Mode_train:
    epochs = 1
    if not trained:
        loss_record = [1e6, 1e6, 1e6, 1e6, 1e6]
    for epoch in range(epochs):
        train_loss = train(train_dataloader, model, loss_fn, optimizer, writer=writer, record_batches=200)
        val_loss = val(valid_dataloader, model, loss_fn)
        writer.add_scalar("loss/training", train_loss, epoch+1)
        writer.add_scalar("loss/validation", val_loss, epoch+1)
        print(f"Epoch {epoch + 1:2d}: Loss = {train_loss:.4f}") 
        print(f'valid loss = {val_loss:.4f}')
        loss_record.append(train_loss)
        # if train_loss > sum(loss_record[-5:])/5*1.05 or train_loss < 0.1:
        #     print('Early stop!')
        #     break

    if not trained:        
        del loss_record[0:5]
    trained = True    
    print("Done!")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 389464/389464 [33:05<00:00, 196.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20499/20499 [01:18<00:00, 260.00it/s]

Epoch  1: Loss = 3.2855
valid loss = 3.2075
Done!





## Plot training loss curve

In [None]:
import matplotlib.pyplot as plt

def plot_loss_curve(loss_list):
    plt.plot(loss_list)
    plt.title('Loss Curve')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.show()

In [None]:
if Mode_train:
    plot_loss_curve(loss_record)
else:
    print('Auto load loss curve')
    with open(loss_filename) as fh:
        s = fh.readline()
        L = s[1:-1].split(', ')
        loss_record = [float(x) for x in L]        
    plot_loss_curve(loss_record)

## Evaluate

In [19]:
#final_train_loss = val(train_dataloader, model, loss_fn)
final_val_loss = val(valid_dataloader, model, loss_fn)
print(f'final valid loss = {final_val_loss:.4f}')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20499/20499 [01:18<00:00, 260.04it/s]

final valid loss = 3.2075





## Save model

In [20]:
if Mode_train:
    torch.save(model.state_dict(), save_model_name)
    with open(loss_filename, 'w') as fh:
        fh.writelines(str(loss_record))   

## Output test result

In [55]:
test_dataset = DataFolder(split='test', transformers=transformers)
test_dataset.df.replace(-np.inf, -5, inplace=True)
test_dataloader = data.DataLoader(dataset=test_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=False,
                                  drop_last=False,
                                  num_workers=0)

loaded test csv file shape: (9914, 7)
setting time info...
setting geo info...
counting fixed fee...


In [56]:
df_test = pd.DataFrame(test_dataset.key_list, columns=["key"])

In [57]:
test_dataset.df.describe()

Unnamed: 0,passenger_count,year,weekday,hour,is_holiday,temperature,weathercode,distance,night,late_night,...,manhattan,downtown_pickup_distance,downtown_dropoff_distance,jfk_pickup_distance,jfk_dropoff_distance,ewr_pickup_distance,ewr_dropoff_distance,lgr_pickup_distance,lgr_dropoff_distance,total_fixed_fees
count,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,...,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0
mean,1.671273,0.469303,2.852834,13.46742,0.280916,12.900978,9.147065,2.13464,0.200424,0.400746,...,-3.5055,0.072781,0.076354,0.300563,0.300144,0.263208,0.265217,0.13459,0.133983,3.421192
std,1.278747,0.300558,1.994451,6.868584,0.449469,9.928859,20.238913,2.468319,0.400338,0.490074,...,0.911244,0.06545,0.05975,0.05586,0.048643,0.063896,0.060605,0.051362,0.048436,0.675622
min,1.0,0.0,0.0,0.0,0.0,-19.1,0.0,0.0,0.0,0.0,...,-10.724468,0.000607,0.000776,0.000151,0.000129,0.161984,0.002922,0.003212,0.000271,2.5
25%,1.0,0.166667,1.0,8.0,0.0,5.5,0.0,0.806712,0.0,0.0,...,-4.074331,0.037943,0.04297,0.296282,0.295326,0.22975,0.232364,0.113397,0.11286,3.0
50%,1.0,0.5,3.0,15.0,0.0,13.9,1.0,1.377836,0.0,0.0,...,-3.536605,0.064762,0.068687,0.306,0.305828,0.256905,0.25977,0.131216,0.128524,3.5
75%,2.0,0.833333,5.0,19.0,1.0,20.6,3.0,2.513634,0.0,1.0,...,-2.938185,0.092463,0.100171,0.317276,0.317847,0.284851,0.290071,0.157255,0.155339,3.5
max,6.0,1.0,6.0,23.0,1.0,38.3,75.0,62.13466,1.0,1.0,...,0.392018,2.015245,1.997942,1.861079,1.843776,2.208023,2.19072,1.823023,1.80572,21.0


In [58]:
predictions = test(test_dataloader, model)

df_test["fare_amount"] = predictions
# df_test = df_test.fillna(0)
df_test["fare_amount"] += test_dataset.df["total_fixed_fees"]
df_test.loc[df_test['fare_amount'] < 2.8, 'fare_amount'] = 2.8
print(df_test.isnull().sum().sum())
#df_test = df_test.fillna(10)
# 將dataframe保存為CSV文件
df_test.to_csv("predictions.csv", index=False)
print('output complete!')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [00:00<00:00, 265.79it/s]


85
output complete!


In [52]:
df_test.describe()

Unnamed: 0,fare_amount
count,9829.0
mean,11.373303
std,8.742736
min,3.566613
25%,6.366226
50%,8.53318
75%,12.549
max,110.725224


In [50]:
print(test_dataset.df.values[279])

['2010-09-05 22:31:32.0000002' 1 0.16666666666668561 False False False
 False False True False 6 22 1 24.3 0.0 0.0 0 1 False False False False
 False False False 0.0 0.0 0.0 -10.0 -10.0 0.11262739999999383
 0.11262739999999383 0.4067941110999911 0.4067941110999911
 0.22339500000000356 0.22339500000000356 0.19318299999999056
 0.19318299999999056 3.5]
