In [2]:
import pandas as pd
import numpy as np

# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# XGBoost
import xgboost as xgb

# LightGBM
import lightgbm as lgb

# CatBoost
from catboost import CatBoostRegressor

# For encoding categorical variables
from sklearn.preprocessing import LabelEncoder

In [34]:
# Load the dataset
df = pd.read_csv('train.csv')

# Display the first few rows
print(df.head())

   id        date  bus_route_id in_out  station_code  station_name  latitude  \
0   0  2019-09-01       4270000     시외           344         제주썬호텔  33.48990   
1   1  2019-09-01       4270000     시외           357          한라병원  33.48944   
2   2  2019-09-01       4270000     시외           432          정존마을  33.48181   
3   3  2019-09-01       4270000     시내          1579  제주국제공항(600번)  33.50577   
4   4  2019-09-01       4270000     시내          1646      중문관광단지입구  33.25579   

   longitude  6~7_ride  7~8_ride  ...  9~10_ride  10~11_ride  11~12_ride  \
0  126.49373       0.0       1.0  ...        5.0         2.0         6.0   
1  126.48508       1.0       4.0  ...        2.0         5.0         6.0   
2  126.47352       1.0       1.0  ...        2.0         0.0         0.0   
3  126.49252       0.0      17.0  ...       26.0        14.0        16.0   
4  126.41260       0.0       0.0  ...        0.0         0.0         0.0   

   6~7_takeoff  7~8_takeoff  8~9_takeoff  9~10_takeoff  10~11_

In [35]:
# Convert 'date' to datetime
df_encoded['date'] = pd.to_datetime(df_encoded['date'])

# Check the date range
print(f"Minimum date: {df_encoded['date'].min()}")
print(f"Maximum date: {df_encoded['date'].max()}")


Minimum date: 2019-09-01 00:00:00
Maximum date: 2019-09-30 00:00:00


In [36]:
# Check for missing values
print(df.isnull().sum())

# For simplicity, we'll fill numerical missing values with 0 and categorical with mode
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Fill numerical columns with 0
df[numerical_cols] = df[numerical_cols].fillna(0)

# Fill categorical columns with mode
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

id               0
date             0
bus_route_id     0
in_out           0
station_code     0
station_name     0
latitude         0
longitude        0
6~7_ride         0
7~8_ride         0
8~9_ride         0
9~10_ride        0
10~11_ride       0
11~12_ride       0
6~7_takeoff      0
7~8_takeoff      0
8~9_takeoff      0
9~10_takeoff     0
10~11_takeoff    0
11~12_takeoff    0
18~20_ride       0
dtype: int64


In [37]:
# List of categorical features
categorical_features = ['in_out', 'station_name', 'date']

# Encode 'in_out' and 'station_name' using LabelEncoder
le_in_out = LabelEncoder()
le_station_name = LabelEncoder()
le_date = LabelEncoder()

df['in_out_enc'] = le_in_out.fit_transform(df['in_out'])
df['station_name_enc'] = le_station_name.fit_transform(df['station_name'])
df['date_enc'] = le_date.fit_transform(df['date'])

# Drop original categorical columns (we'll keep 'date' for time-based splitting)
df_encoded = df.drop(['in_out', 'station_name'], axis=1)

In [38]:
# Convert 'date' to datetime
df_encoded['date'] = pd.to_datetime(df_encoded['date'])

# Check the date range
print(f"Minimum date: {df_encoded['date'].min()}")
print(f"Maximum date: {df_encoded['date'].max()}")
# Adjust the date for splitting
split_date = '2019-09-21'

train_df = df_encoded[df_encoded['date'] < split_date]
test_df = df_encoded[df_encoded['date'] >= split_date]


Minimum date: 2019-09-01 00:00:00
Maximum date: 2019-09-30 00:00:00


In [39]:
df

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,...,6~7_takeoff,7~8_takeoff,8~9_takeoff,9~10_takeoff,10~11_takeoff,11~12_takeoff,18~20_ride,in_out_enc,station_name_enc,date_enc
0,0,2019-09-01,4270000,시외,344,제주썬호텔,33.48990,126.49373,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1481,0
1,1,2019-09-01,4270000,시외,357,한라병원,33.48944,126.48508,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1,1822,0
2,2,2019-09-01,4270000,시외,432,정존마을,33.48181,126.47352,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1,1406,0
3,3,2019-09-01,4270000,시내,1579,제주국제공항(600번),33.50577,126.49252,0.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,53.0,0,1431,0
4,4,2019-09-01,4270000,시내,1646,중문관광단지입구,33.25579,126.41260,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1575,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415418,415418,2019-09-30,32820000,시내,1129,한림환승정류장(한림리),33.41437,126.26336,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1843,29
415419,415419,2019-09-30,32820000,시내,1564,제주시외버스터미널,33.49946,126.51479,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1475,29
415420,415420,2019-09-30,32820000,시내,2322,해병부대,33.23100,126.26273,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1878,29
415421,415421,2019-09-30,32820000,시내,3291,애월환승정류장(애월리),33.46483,126.31870,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1093,29


In [40]:
# Define target variables (all ride counts)
target = [col for col in df.columns if '_ride' in col]+[col for col in df.columns if '_takeoff' in col]

# Define features
features = [
    'bus_route_id',
    'station_code',
    'latitude',
    'longitude',
    'in_out_enc',
    'station_name_enc',
]

# Convert 'date' to datetime
df_encoded['date'] = pd.to_datetime(df_encoded['date'])


In [41]:
# Convert 'date' to datetime
df_encoded['date'] = pd.to_datetime(df_encoded['date'])

# Sort by date
df_encoded = df_encoded.sort_values('date')

split_date = '2019-09-21'

train_df = df_encoded[df_encoded['date'] < split_date]
test_df = df_encoded[df_encoded['date'] >= split_date]

# Separate features and target
X_train = train_df[features]
y_train = train_df[target]

X_test = test_df[features]
y_test = test_df[target]


In [42]:
X_train

Unnamed: 0,bus_route_id,station_code,latitude,longitude,in_out_enc,station_name_enc
0,4270000,344,33.48990,126.49373,1,1481
7688,28790000,2459,33.23128,126.29752,0,640
7689,28790000,2468,33.24652,126.33507,0,1934
7690,28790000,2476,33.25053,126.30433,0,360
7691,28820000,2,33.49527,126.45618,0,1167
...,...,...,...,...,...,...
266924,24130000,3435,33.50958,126.50850,0,1242
266930,24180000,154,33.50155,126.51995,0,1239
266923,24130000,3365,33.51627,126.50832,0,1226
266929,24180000,151,33.49979,126.51769,0,1808


In [43]:
# Train XGBoost model
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42
)

xgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_xgb = xgb_model.predict(X_test)

# Evaluation metrics
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

print("\nXGBoost Model Evaluation:")
print(f"Mean Absolute Error: {mae_xgb:.2f}")
print(f"Root Mean Squared Error: {rmse_xgb:.2f}")



XGBoost Model Evaluation:
Mean Absolute Error: 0.66
Root Mean Squared Error: 1.59


In [47]:
from sklearn.multioutput import MultiOutputRegressor
# Initialize LightGBM regressor
lgb_base = lgb.LGBMRegressor(
    objective='regression',
    n_estimators=100,
    num_leaves=31,
    learning_rate=0.1,
    random_state=42
)

# Wrap with MultiOutputRegressor
lgb_model = MultiOutputRegressor(lgb_base)

# Train the model
lgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_lgb = lgb_model.predict(X_test)

# Evaluation metrics
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
rmse_lgb = np.sqrt(mean_squared_error(y_test, y_pred_lgb))

print("\nLightGBM Multi-Output Model Evaluation:")
print(f"Mean Absolute Error: {mae_lgb:.2f}")
print(f"Root Mean Squared Error: {rmse_lgb:.2f}")



LightGBM Multi-Output Model Evaluation:
Mean Absolute Error: 0.67
Root Mean Squared Error: 1.58


In [49]:
# Identify categorical feature indices
cat_features_indices = [features.index('in_out_enc'), features.index('station_name_enc')]

# Initialize CatBoost regressor
cat_model = CatBoostRegressor(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    eval_metric='MultiRMSE',
    random_seed=42,
    verbose=10,
    loss_function='MultiRMSE'  # Multi-target regression
)

# Train the model
cat_model.fit(
    X_train,
    y_train,
    cat_features=cat_features_indices,
    eval_set=(X_test, y_test),
    early_stopping_rounds=10
)

# Predict and evaluate
y_pred_cat = cat_model.predict(X_test)

# Evaluation metrics
mae_cat = mean_absolute_error(y_test, y_pred_cat)
rmse_cat = np.sqrt(mean_squared_error(y_test, y_pred_cat))

print("\nCatBoost Multi-Output Model Evaluation:")
print(f"Mean Absolute Error: {mae_cat:.2f}")
print(f"Root Mean Squared Error: {rmse_cat:.2f}")


0:	learn: 7.4487619	test: 7.3643059	best: 7.3643059 (0)	total: 198ms	remaining: 19.6s
10:	learn: 6.9849359	test: 6.8995177	best: 6.8995177 (10)	total: 1.84s	remaining: 14.9s
20:	learn: 6.8038429	test: 6.7188224	best: 6.7188224 (20)	total: 3.45s	remaining: 13s
30:	learn: 6.6906578	test: 6.6025336	best: 6.6025336 (30)	total: 5.09s	remaining: 11.3s
40:	learn: 6.6232225	test: 6.5400027	best: 6.5400027 (40)	total: 6.71s	remaining: 9.66s
50:	learn: 6.5284640	test: 6.4572491	best: 6.4572491 (50)	total: 8.32s	remaining: 8s
60:	learn: 6.4717790	test: 6.4038275	best: 6.4038275 (60)	total: 9.95s	remaining: 6.36s
70:	learn: 6.4271160	test: 6.3657118	best: 6.3657118 (70)	total: 11.6s	remaining: 4.73s
80:	learn: 6.3887815	test: 6.3308025	best: 6.3308025 (80)	total: 13.2s	remaining: 3.1s
90:	learn: 6.3572794	test: 6.3024037	best: 6.3024037 (90)	total: 14.9s	remaining: 1.47s
99:	learn: 6.3311656	test: 6.2786990	best: 6.2786990 (99)	total: 16.3s	remaining: 0us

bestTest = 6.27869905
bestIteration = 99
