In [224]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import joblib
import os
os.environ["LOKY_MAX_CPU_COUNT"] = "4"

In [225]:
train_file_path = 'train_data.csv'
train_data = pd.read_csv(train_file_path)

# train_data['total_minutes'] = train_data['sin_hour'] * 60 + train_data['Minute']
# train_data['sin_hour'] = np.sin(2 * np.pi * train_data['sin_hour'] / 24)


print(train_data)

     sin_hour  stopOrder  nextStopOrder  time_diff  weekDay  Minute
0           6          5              6        770        3      10
1           5          2              3        738        4      59
2           6          5              6        737        2      10
3           6          5              6        678        4      11
4           6          6              7        629        5      17
..        ...        ...            ...        ...      ...     ...
312         5          2              3         81        3      58
313         6          7              8         79        7      31
314         6          2              3         77        2       0
315         6          6              7         64        3      17
316         6          6              7         63        3      16

[317 rows x 6 columns]


In [226]:
X = train_data[['sin_hour', 'stopOrder', 'nextStopOrder','Minute','weekDay']]
y = train_data['time_diff']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

print(X_train)
print(y_train)
# print(train_data['time_diff'].describe())


     sin_hour  stopOrder  nextStopOrder  Minute  weekDay
201         6          7              8      27        2
209         6          4              5      10        2
213         6          6              7      29        3
128         6          8              9      26        6
300         5          2              3      59        2
..        ...        ...            ...     ...      ...
66          6          6              7      16        6
53          6          3              4       2        2
79          6          6              7      14        3
280         6          7              8      23        4
8           6          3              4       1        2

[253 rows x 5 columns]
201    148
209    139
213    136
128    328
300     92
      ... 
66     463
53     478
79     428
280    103
8      553
Name: time_diff, Length: 253, dtype: int64


In [227]:
lgb_model = lgb.LGBMRegressor(
    n_estimators=100,
    max_depth=30,
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    colsample_bytree=1.0,
    learning_rate=0.05,
    # objective='regression_l1'
)

lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse'
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50
[LightGBM] [Info] Number of data points in the train set: 253, number of used features: 4
[LightGBM] [Info] Start training from score 285.913043


In [228]:
y_pred = lgb_model.predict(X_test)

print(X_test)


results_df = pd.DataFrame({
    'Actual': y_test,     # Actual target values
    'Predicted': y_pred    # Predicted values
})

# Optionally, reset the index for better readability
results_df = results_df.reset_index(drop=True)

# Display the first 10 rows
print(results_df.head(20))
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print(f"MSE:{mse}, RMSE: {rmse}, MAE: {mae}, R²: {r2}")

     sin_hour  stopOrder  nextStopOrder  Minute  weekDay
162         6          5              6      12        2
203         6          7              8      24        6
78          6          6              7      16        4
1           5          2              3      59        4
139         6          5              6      12        2
..        ...        ...            ...     ...      ...
75          6          6              7      17        2
295         5          2              3      59        2
241         6          7              8      21        6
104         6          8              9      27        3
64          6          3              4       1        2

[64 rows x 5 columns]
    Actual   Predicted
0      239  287.073359
1      148  161.832116
2      431  437.383464
3      738  103.130929
4      290  287.073359
5      114  421.136236
6      100  137.298283
7      300  361.117192
8      186  170.595451
9      520  496.166848
10     243  344.933827
11     560  506.2