### 1. Import all the necessary libraries

In [1]:
import torch
import torchvision
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.optim as optim
import category_encoders as ce
import math
%matplotlib inline

In [2]:
pip install category_encoders

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### 2. Loading the Dataset

In [3]:
df_tr = pd.read_csv("../data/train.csv")
df_tr.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


### 3. Get Computed Time from POLYLINE

Our goal is to predict the travel-time of the taxi, which can be derived from the POLYLINE length.

Recall:

```
The travel time of the trip (the prediction target of this project) is defined as the (number of points-1) x 15 seconds. 
For example, a trip with 101 data points in POLYLINE has a length of (101-1) * 15 = 1500 seconds. Some trips have missing data points in POLYLINE, indicated by MISSING_DATA column, and it is part of the challenge how you utilize this knowledge.
```


In [4]:
# Over every single 
def polyline_to_trip_duration(polyline):
  return max(polyline.count("[") - 1, 0) * 15

# This code creates a new column, "LEN", in our dataframe. The value is
# the (polyline_length - 1) * 15
df_tr["LEN"] = df_tr["POLYLINE"].apply(polyline_to_trip_duration)

### 4. Test and Extract the features: (Original Call + HR + WK + MON + TAXI_ID)

In [5]:
# Verify our guesses of the patterns of TAXI_ID such that all the IDs are in the form of 
# 20000xxx by substracting all the numbers by 20000000 and check if they are between the 
# range [0,1000).
def TAXI_ID_pattern_checker(x):
    # Test if the only last 3 digits of the TRIP_ID exhibit a pattern
    for idx in range(len(x)):
        if (x[idx]-20000000) < 0 or (x[idx]-20000000) >= 1000:
            return False
    return True

if TAXI_ID_pattern_checker(df_tr["TAXI_ID"]):
    print("Pattern is found!")

# Note that the only last three digits of the TAXI_ID are nonzero.
def parse_TAXI_ID(x):
    return (x % pow(10,3)) 

df_tr["Unique_TAXI_ID"] = df_tr["TAXI_ID"].apply(parse_TAXI_ID)

Pattern is found!


In [6]:
from datetime import datetime
def parse_time(x):
  # We are using python's builtin datetime library
  # https://docs.python.org/3/library/datetime.html#datetime.date.fromtimestamp

  # Each x is essentially a 1 row, 1 column pandas Series
  dt = datetime.fromtimestamp(x["TIMESTAMP"])
  return dt.year, dt.month, dt.day, dt.hour, dt.weekday(), dt.minute

# Because we are assigning multiple values at a time, we need to "expand" our computed (year, month, day, hour, weekday) tuples on 
# the column axis, or axis 1
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html

df_tr[["YR", "MON", "DAY", "HR", "WK", "MIN"]] = df_tr[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

In [7]:
# Trying to find relationship between "ORIGIN_CALL" and "LEN"
type_A = df_tr[df_tr["CALL_TYPE"]=="A"]

mean, std = type_A["LEN"].mean(), type_A["LEN"].std()
# "Choose all data, where the trip length is less than 3 standard deviations away from the mean"
# This is to remove outliers. Otherwise, our plots would look very squished (since there are some
# VERRRRRY long taxi trips in the dataset)
df_trimmed_A = type_A[type_A["LEN"] < mean + outlier_threshold * std]
# df_trimmed_A = df_trimmed_A[df_trimmed_A?["ORIGIN_CALL"]
# plt.scatter(df_trimmed_A["ORIGIN_CALL"], df_trimmed_A["LEN"],s=5, alpha=0.5)
# plt.xlabel("ORIGIN_CALL")
# plt.ylabel("LEN")
# plt.title("Relationship between ORIGIN_CALL and LEN")
# plt.show()

print("The correlation coefficient between ORIGIN CALL and LEN is {}".format(df_trimmed_A["ORIGIN_CALL"].corr(df_trimmed_A["LEN"])))
print("The correlation coefficient between HR and LEN is {}".format(df_trimmed_A["HR"].corr(df_trimmed_A["LEN"])))
print("The correlation coefficient between WK and LEN is {}".format(df_trimmed_A["WK"].corr(df_trimmed_A["LEN"])))
print("The correlation coefficient between MON and LEN is {}".format(df_trimmed_A["MON"].corr(df_trimmed_A["LEN"])))
print("The correlation coefficient between TAXI_ID and LEN is {}".format(df_trimmed_A["Unique_TAXI_ID"].corr(df_trimmed_A["LEN"])))

NameError: name 'outlier_threshold' is not defined

### 5. Data Encoding

In [15]:
outlier_threshold = 3

type_A = df_tr[df_tr["CALL_TYPE"]=="A"]

mean, std = type_A["LEN"].mean(), type_A["LEN"].std()
# "Choose all data, where the trip length is less than 3 standard deviations away from the mean"
# This is to remove outliers. Otherwise, our plots would look very squished (since there are some
# VERRRRRY long taxi trips in the dataset)
df_trimmed_A = type_A[type_A["LEN"] < mean + outlier_threshold * std]

# print(type_A["ORIGIN_CALL"].mean())
# print(type_A["ORIGIN_CALL"].std())

all_features = df_trimmed_A[[]]

# Create an instance of BinaryEncoder
binary_encoder_origin = ce.BinaryEncoder(cols=['ORIGIN_CALL'])
binary_encoder_hr = ce.BinaryEncoder(cols=['HR'])
binary_encoder_wk = ce.BinaryEncoder(cols=['WK'])

# Apply binary encoding to the 'ORIGIN_CALL' column
encoded_data_origin = binary_encoder_origin.fit_transform(df_trimmed_A['ORIGIN_CALL'])
encoded_data_hr = binary_encoder_hr.fit_transform(df_trimmed_A['HR'])
encoded_data_wk = binary_encoder_wk.fit_transform(df_trimmed_A['WK'])

# Convert all features into string so that we can apply pd.get_dummies on it
# all_features['WK'] = all_features['WK'].astype(str)
# all_features['HR'] = all_features['HR'].astype(str)


# numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# all_features[numeric_features] = all_features[numeric_features].apply(
#     lambda x: (x - x.mean()) / (x.std()))

# all_features = pd.get_dummies(all_features)

# Concatenate the encoded data with the original DataFrame
all_features = pd.concat([all_features, encoded_data_origin], axis=1)
all_features = pd.concat([all_features, encoded_data_hr], axis=1)
all_features = pd.concat([all_features, encoded_data_wk], axis=1)
# all_features = pd.concat([all_features, encoded_data_mon], axis=1)


print(f'预处理之后数据形状: {all_features.shape}')
print(all_features)

cols = list(all_features.columns.values)
print(cols)


预处理之后数据形状: (360422, 24)
         ORIGIN_CALL_0  ORIGIN_CALL_1  ORIGIN_CALL_2  ORIGIN_CALL_3  \
12                   0              0              0              0   
17                   0              0              0              0   
21                   0              0              0              0   
24                   0              0              0              0   
26                   0              0              0              0   
...                ...            ...            ...            ...   
1710651              0              0              0              0   
1710653              0              0              1              0   
1710656              0              0              0              1   
1710658              0              0              0              0   
1710659              0              1              1              1   

         ORIGIN_CALL_4  ORIGIN_CALL_5  ORIGIN_CALL_6  ORIGIN_CALL_7  \
12                   0              0              0

In [16]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split 

label = df_trimmed_A["LEN"]
train_data = all_features

train_features, val_features, train_labels, val_labels = train_test_split(
    train_data, label, test_size=0.2, random_state=42)

# 训练集
train_features = torch.tensor(train_features.values, dtype=torch.float)
# 验证集
val_features = torch.tensor(val_features.values, dtype=torch.float)

train_labels = torch.tensor(train_labels.values, dtype=torch.float)
train_labels = train_labels.unsqueeze(1) 


val_labels = torch.tensor(val_labels.values, dtype=torch.float)
val_labels = val_labels.unsqueeze(1)


print(f'训练集数据: {train_features.shape}')
print(f'训练集label: {train_labels.shape}')
print(f'验证集数据: {val_features.shape}')
print(f'验证集label: {val_labels.shape}')


训练集数据: torch.Size([288337, 24])
训练集label: torch.Size([288337, 1])
验证集数据: torch.Size([72085, 24])
验证集label: torch.Size([72085, 1])


In [17]:
class myDataset:
    def __init__(self, data, label):
        self.data = data
        self.label = label
        
    def __len__(self):
        return len(self.label)
    
    def __getitem__(self, idx):
        return self.data[idx, :], self.label[idx]

train_dataset = myDataset(train_features, train_labels)
val_dataset = myDataset(val_features, val_labels)

# 变为迭代器
train_iter = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=64, shuffle=True, num_workers=4)
val_iter = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=64, shuffle=False, num_workers=4)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

### 6. Conversion of Dataset to Dataloader

In [36]:
import torch.nn as nn
import torch.nn.functional as F

# 初始化权重
def _weight_init(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.constant_(m.bias, 0)
    elif isinstance(m, nn.Conv2d):
        nn.init.xavier_uniform_(m.weight)
    elif isinstance(m, nn.BatchNorm1d):
        nn.init.constant_(m.weight, 1)
        nn.init.constant_(m.bias, 0)
# 网络
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(24, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, 64)  # New layer: fc5 -> fc6
        self.fc6 = nn.Linear(64, 32)   # New layer: fc6 -> fc7
        self.fc7 = nn.Linear(32, 1)    # New layer: fc7 -> output
        self.apply(_weight_init)
        self.apply(_weight_init) # 初始化参数
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.relu(self.fc6(x))
        x = self.fc7(x)
        return x

# 使用rmse作为自定义得分函数，这也是比赛的判定标准
def custom_score(y_true, y_pred):
#     rmse = mean_squared_error(np.log1p(y_true), np.log1p(y_pred), squared=False)
    return math.sqrt(np.mean((np.array(y_pred)-np.array(y_true))*(np.array(y_pred)-np.array(y_true))))

net = Net()
criterion = torch.nn.MSELoss() # 损失函数为MSE
net = net.to(device) # 将网络和损失函数转化为GPU或CPU
criterion = criterion.to(device)
optimizer = torch.optim.Adam(params=net.parameters(), lr=0.001, weight_decay=0)

### 7. Model training and validation

In [37]:
# 这是训练函数，分为train和val
# train时前向传播后向更新参数
# val时只计算损失函数
def train(net, data_iter, phase, criterion, optimizer=None):
    y_true = []
    y_pred = []
    mean_loss = []
    is_grad = True if phase == 'train' else False
    with torch.set_grad_enabled(is_grad):
        net.train()
        for step, (X, y) in enumerate(data_iter):
            X = X.to(device)
            y = y.to(device)
            out = net(X)
            loss = criterion(out, y) # 计算损失
            mean_loss.append(loss.item())
            
            if phase == 'train':
                optimizer.zero_grad() # optimizer 0
                loss.backward() # back propragation
                optimizer.step() # update the paramters

            # 将每一个step的结果加入列表，最后统一生产这一个epoch的指标  
            # 添加预测值和真实类标签
            y_pred.extend(out.detach().cpu().squeeze().numpy().tolist())
            y_true.extend(y.detach().cpu().squeeze().numpy().tolist())

    # 全量样本的rmse和平均loss
    rmse = custom_score(y_true, y_pred)
    mean_loss = np.mean(mean_loss)
    # 保留4位小数
    rmse = np.round(rmse, 4)
    mean_loss = np.round(mean_loss, 4)
    return mean_loss, rmse

In [38]:
from tqdm import tqdm
from datetime import datetime
from colorama import Fore, Back

epochs = 100
loss_list_A = []
print(f'{datetime.now()} 开始训练结束...')
for epoch in tqdm(range(epochs)):
    train_mean_loss, train_score = train(net=net, 
                                         data_iter=train_iter, 
                                         phase='train', 
                                         criterion=criterion, 
                                         optimizer=optimizer)
    
    val_mean_loss, val_score = train(net=net, 
                                     data_iter=train_iter, 
                                     phase='val', 
                                     criterion=criterion, 
                                     optimizer=None)
    print(Fore.CYAN + Back.BLACK, end='')
    tqdm.write(f'Epoch: {epoch} Train loss: {train_mean_loss} Val loss: {val_mean_loss}', end=' ')
    tqdm.write(f'Train score: {train_score} Val score: {val_score}')
    loss_list_A.append(train_score)

print(f'{datetime.now()} 训练结束...')

2023-05-24 05:57:05.177620 开始训练结束...


  5%|▌         | 1/20 [00:38<12:13, 38.61s/it]

[36m[40mEpoch: 0 Train loss: 137393.7854 Val loss: 132943.3197 Train score: 370.6716 Val score: 364.6035


 10%|█         | 2/20 [01:16<11:22, 37.94s/it]

[36m[40mEpoch: 1 Train loss: 132129.3673 Val loss: 130229.5046 Train score: 363.5059 Val score: 360.8925


 15%|█▌        | 3/20 [01:56<11:02, 38.99s/it]

[36m[40mEpoch: 2 Train loss: 130983.4608 Val loss: 128785.4369 Train score: 361.9143 Val score: 358.8689


 20%|██        | 4/20 [02:36<10:32, 39.55s/it]

[36m[40mEpoch: 3 Train loss: 129942.8082 Val loss: 128151.618 Train score: 360.486 Val score: 357.9833


 25%|██▌       | 5/20 [03:17<09:57, 39.82s/it]

[36m[40mEpoch: 4 Train loss: 128951.4759 Val loss: 127249.6522 Train score: 359.0986 Val score: 356.7325


 30%|███       | 6/20 [03:57<09:20, 40.05s/it]

[36m[40mEpoch: 5 Train loss: 128164.2291 Val loss: 126584.4988 Train score: 358.0107 Val score: 355.7879


 35%|███▌      | 7/20 [04:37<08:40, 40.06s/it]

[36m[40mEpoch: 6 Train loss: 127434.1917 Val loss: 126147.3719 Train score: 356.9815 Val score: 355.1816


 40%|████      | 8/20 [05:17<08:01, 40.13s/it]

[36m[40mEpoch: 7 Train loss: 126704.3725 Val loss: 125946.9352 Train score: 355.9467 Val score: 354.8919


 45%|████▌     | 9/20 [05:57<07:20, 40.08s/it]

[36m[40mEpoch: 8 Train loss: 125977.3948 Val loss: 124348.1749 Train score: 354.9458 Val score: 352.6089


 50%|█████     | 10/20 [06:41<06:50, 41.09s/it]

[36m[40mEpoch: 9 Train loss: 125190.7924 Val loss: 124419.4175 Train score: 353.8288 Val score: 352.7377


 55%|█████▌    | 11/20 [07:25<06:17, 41.96s/it]

[36m[40mEpoch: 10 Train loss: 124504.149 Val loss: 122404.9915 Train score: 352.8554 Val score: 349.8551


 60%|██████    | 12/20 [08:10<05:44, 43.06s/it]

[36m[40mEpoch: 11 Train loss: 123798.5433 Val loss: 122760.2858 Train score: 351.8407 Val score: 350.3853


 65%|██████▌   | 13/20 [08:55<05:05, 43.69s/it]

[36m[40mEpoch: 12 Train loss: 123021.6545 Val loss: 122499.2305 Train score: 350.7522 Val score: 349.9998


 70%|███████   | 14/20 [09:40<04:23, 43.93s/it]

[36m[40mEpoch: 13 Train loss: 122219.515 Val loss: 121847.6542 Train score: 349.6163 Val score: 349.0863


 75%|███████▌  | 15/20 [10:24<03:40, 44.13s/it]

[36m[40mEpoch: 14 Train loss: 121377.6543 Val loss: 119274.6597 Train score: 348.4106 Val score: 345.3718


 80%|████████  | 16/20 [11:09<02:57, 44.31s/it]

[36m[40mEpoch: 15 Train loss: 120537.5716 Val loss: 118370.337 Train score: 347.1951 Val score: 344.0599


 85%|████████▌ | 17/20 [11:54<02:13, 44.40s/it]

[36m[40mEpoch: 16 Train loss: 119853.1826 Val loss: 119143.2205 Train score: 346.2112 Val score: 345.1843


 90%|█████████ | 18/20 [12:38<01:28, 44.27s/it]

[36m[40mEpoch: 17 Train loss: 119074.9343 Val loss: 116685.913 Train score: 345.0842 Val score: 341.5795


 95%|█████████▌| 19/20 [13:22<00:44, 44.19s/it]

[36m[40mEpoch: 18 Train loss: 118274.8149 Val loss: 117469.8591 Train score: 343.9178 Val score: 342.7558


100%|██████████| 20/20 [14:06<00:00, 42.31s/it]

[36m[40mEpoch: 19 Train loss: 117425.5637 Val loss: 115109.56 Train score: 342.6404 Val score: 339.2509
2023-05-24 06:11:11.403460 训练结束...





In [None]:
with open('loss_list_A.txt', 'w') as file:
    file.write(','.join(str(element) for element in loss_list_A))

### 8. Test set validation

In [41]:
df_tr_test = pd.read_csv("../test/test.csv")
df_tr_test.head()

df_tr_test["LEN"] = df_tr_test["POLYLINE"].apply(polyline_to_trip_duration)
df_tr_test[["YR", "MON", "DAY", "HR", "WK", "MIN"]] = df_tr_test[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

df_tr_test = df_tr_test[df_tr_test["CALL_TYPE"]=="A"]

all_features_test = df_tr_test[[]]

# Create an instance of BinaryEncoder
binary_encoder_origin_test = ce.BinaryEncoder(cols=['ORIGIN_CALL'])
binary_encoder_hr_test = ce.BinaryEncoder(cols=['HR'])
binary_encoder_wk_test = ce.BinaryEncoder(cols=['WK'])
# binary_encoder_mon_test = ce.BinaryEncoder(cols=['MON'])


# Apply binary encoding to the 'ORIGIN_CALL' column
encoded_data_origin_test = binary_encoder_origin_test.fit_transform(df_tr_test['ORIGIN_CALL'])
encoded_data_hr_test = binary_encoder_hr_test.fit_transform(df_tr_test['HR'])
encoded_data_wk_test = binary_encoder_wk_test.fit_transform(df_tr_test['WK'])
# encoded_data_mon_test = binary_encoder_mon_test.fit_transform(df_tr_test['MON'])

# Concatenate the encoded data with the original DataFrame
all_features_test = pd.concat([all_features_test, encoded_data_origin_test], axis=1)
all_features_test = pd.concat([all_features_test, encoded_data_hr_test], axis=1)
all_features_test = pd.concat([all_features_test, encoded_data_wk_test], axis=1)
# all_features_test = pd.concat([all_features_test, encoded_data_mon_test], axis=1)

missing_columns = set(all_features.columns)-set(all_features_test.columns)

for column in missing_columns:
    all_features_test[column] = 0

# print(f'预处理之后数据形状: {all_features_test.shape}')
# print(all_features_test)

cols = list(all_features_test.columns.values)
# print(cols)

prediction = net(torch.tensor(all_features_test.values, dtype=torch.float).to(device))

print("The RMSE loss against test set is:")
custom_score(prediction.tolist(),df_tr_test["LEN"].tolist())

The RMSE loss against test set is:


703.9716069295529

### 9. Model Saving

In [42]:
# print("Optimizer's state_dict:")
# for var_name in optimizer.state_dict():
#     print(var_name, "\t", optimizer.state_dict()[var_name])
# val_features = val_features.to(device)
torch.save(net.state_dict(), '../model/modelA.pth')