In [1]:
!pip install lightgbm



In [2]:
# Load the uploaded dataset
import pandas as pd
import numpy as np

import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

df = pd.read_csv('/content/tour_packages_lightgbm.csv')
df.head()

Unnamed: 0,destination,duration_days,num_people,hotel_star_rating,transport_type,season,guide_included,activities_count,price
0,Dubai,14,1,5,Flight,Off-season,Yes,5,39510
1,London,9,2,4,Train,Off-season,Yes,5,31093
2,Bali,11,2,3,Bus,Peak,No,2,31199
3,London,10,4,5,Train,Off-season,No,4,35445
4,London,14,2,3,Bus,Off-season,Yes,2,31964


In [3]:
df.shape

(100, 9)

In [4]:
# 1. Encode categorical variables using one-hot encoding
df_encoded = pd.get_dummies(df, columns=['destination', 'transport_type', 'season', 'guide_included'], drop_first=True)
df_encoded

Unnamed: 0,duration_days,num_people,hotel_star_rating,activities_count,price,destination_Dubai,destination_Goa,destination_London,destination_Paris,transport_type_Flight,transport_type_Train,season_Off-season,season_Peak,guide_included_Yes
0,14,1,5,5,39510,True,False,False,False,True,False,True,False,True
1,9,2,4,5,31093,False,False,True,False,False,True,True,False,True
2,11,2,3,2,31199,False,False,False,False,False,False,False,True,False
3,10,4,5,4,35445,False,False,True,False,False,True,True,False,False
4,14,2,3,2,31964,False,False,True,False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,7,3,3,1,21919,False,False,False,False,False,True,True,False,False
96,3,1,3,3,19648,False,False,True,False,True,False,False,True,True
97,10,1,5,1,27534,False,True,False,False,False,False,True,False,True
98,3,3,4,5,27352,False,True,False,False,True,False,False,True,False


In [5]:
# 2. Split into features and target
X = df_encoded.drop("price", axis=1)
y = df_encoded["price"]

In [6]:
# 3. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# 4. Create LightGBM datasets
# LightGBM has its own optimized data structure, lgb.Dataset,
# which is specifically designed for faster and more efficient training.
# lgb.Dataset stores data in a compressed binary format, which is much lighter than pandas DataFrames.
# It train models with less RAM usage, especially on large datasets.

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# reference = train_data: During training, LightGBM builds histograms of features for efficient splitting.
# When validating, LightGBM needs to use the same histogram structure.
# By setting reference=train_data, it borrows that structure instead of rebuilding it.
# It’s not mandatory but it is recommended especially when:
#    You're using categorical features
#    You're training with large datasets.
#    You want maximum efficiency.


In [8]:
# 5. Set parameters
params = {
    'objective': 'regression', # What kind of problem? (regression/classification)
    'metric': 'rmse', # How to evaluate performance (e.g., RMSE, MAE, AUC)
    'learning_rate': 0.05, # Controls step size during training (small = better accuracy, slower)
    'num_leaves': 31, # Controls model complexity (more leaves = more flexible)
    'verbose': -1
}

In [15]:
# 6. Train model
#model = lgb.train(params, train_data, num_boost_round=200, valid_sets=[valid_data], early_stopping_rounds=10, verbose_eval=False)
model = lgb.train(
    params,
    train_data, # This is your training dataset
    num_boost_round=200, # This means LightGBM will train up to 200 trees (boosting rounds).
                         # However, with early stopping, training might stop before 200 rounds if no improvement is seen.
    valid_sets=[valid_data], # A list of validation datasets that LightGBM will evaluate after each boosting round.
    callbacks=[     # This list includes functions that control dynamic training behavior like logging and early stopping.
        early_stopping(stopping_rounds=10), # If the validation score doesn’t improve for 10 consecutive rounds, training stops early.
                                            # Prevents overfitting and saves time.
                                            # The model will roll back to the best iteration
        log_evaluation(period=0) # Controls how often LightGBM prints evaluation logs.
                                 # period=0 → disables logging completely.
                                 # set period=10 to print every 10 rounds
    ]
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 2522.57


In [16]:
# 7. Predict
y_pred = model.predict(X_test)

In [17]:
# 8. Evaluate
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
#rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

evaluation_results = {
    "RMSE": rmse,
    "MAE": mae,
    "R² Score": r2
}

evaluation_results

{'RMSE': np.float64(2522.567945160272),
 'MAE': 2176.343805796941,
 'R² Score': 0.7770992321253174}