In [None]:
# Install lightgbm if not already installed
pip install --upgrade lightgbm

In [37]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import numpy as np


In [20]:
df = pd.read_csv('./Data/combined_expedia_results.csv')

# Basic info
print(f"Total rows in combined dataset: {len(df)}")
print("\nMissing values per column:\n", df.isnull().sum())

# Summary of numerical columns
print("\nNumerical column stats:\n", df.describe())

# Check unique values in categorical columns
categorical_cols = ['TTT', 'Neighborhood', 'Breakfast', 'Free Cancellation', 'search_group']
for col in categorical_cols:
    print(f"\nUnique values in {col}: {df[col].unique()}")


Total rows in combined dataset: 44518

Missing values per column:
 Snapshot Date         0
TTT                   0
LOS                   0
Hotel Name            0
Price                 0
Rating                0
Reviews               0
Neighborhood          0
Breakfast             0
Free Cancellation     0
Rooms Left            0
Check-in              0
Check-out             0
search_group          0
hotel_name_encoded    0
dtype: int64

Numerical column stats:
                 TTT           LOS    Hotel Name         Price        Rating  \
count  44518.000000  44518.000000  44518.000000  44518.000000  44518.000000   
mean      15.442989      2.983018    202.279527    964.970731      2.715037   
std        8.617443      1.410342    118.064074    528.421080      1.402482   
min        1.000000      1.000000      0.000000    106.000000     -1.000000   
25%        8.000000      2.000000    112.000000    528.000000      2.000000   
50%       15.000000      3.000000    188.000000    916.00000

In [21]:
# Handle missing or placeholder values (-1)
df['Rating'] = df['Rating'].replace(-1, df['Rating'].median())  # Replace -1 with median rating
df['Neighborhood'] = df['Neighborhood'].replace(-1, df['Neighborhood'].mode()[0])  # Replace -1 with most common neighborhood
df['Rooms Left'] = df['Rooms Left'].replace(-1, 0)  # Assume -1 means "No rooms left"

# Normalize Price, Reviews, and Rooms Left
scaler = MinMaxScaler()
df[['Price', 'Reviews', 'Rooms Left']] = scaler.fit_transform(df[['Price', 'Reviews', 'Rooms Left']])

print("Data cleaning and normalization complete!")
df.head()


Data cleaning and normalization complete!


Unnamed: 0,Snapshot Date,TTT,LOS,Hotel Name,Price,Rating,Reviews,Neighborhood,Breakfast,Free Cancellation,Rooms Left,Check-in,Check-out,search_group,hotel_name_encoded
0,2025-03-14,1,1,56,0.0968,3.0,0.379182,14,0,0,0.005139,2025-03-15,2025-03-16,0,56
1,2025-03-14,1,1,234,0.095585,1.0,0.169821,16,1,0,0.0,2025-03-15,2025-03-16,1,234
2,2025-03-14,1,1,261,0.214257,2.0,0.474823,14,0,0,0.0,2025-03-15,2025-03-16,2,261
3,2025-03-14,1,1,283,0.065614,4.0,0.169314,14,1,0,0.005139,2025-03-15,2025-03-16,3,283
4,2025-03-14,1,1,197,0.060348,2.0,0.411626,14,1,0,0.0,2025-03-15,2025-03-16,4,197


In [22]:
# Split train (even TTTs) and test (odd TTTs)
train_df = df[df['TTT'] % 2 == 0].copy()
test_df = df[df['TTT'] % 2 == 1].copy()

# Shuffle the test set to simulate a realistic ranking challenge
test_df_shuffled = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

Train size: 22247, Test size: 22271


In [42]:
# Define features for ranking
features = ['Price', 'Rating', 'Reviews', 'Neighborhood', 'Breakfast', 'Free Cancellation', 'Rooms Left']

# Define group key for ranking (Snapshot Date, TTT, LOS)
train_df['group_key'] = train_df['Snapshot Date'].astype(str) + "_" + train_df['TTT'].astype(str) + "_" + train_df['LOS'].astype(str)
test_df['group_key'] = test_df['Snapshot Date'].astype(str) + "_" + test_df['TTT'].astype(str) + "_" + test_df['LOS'].astype(str)

# 🚀 Fix: Assign natural ranking labels without MinMaxScaler 🚀
def assign_ranking_labels(df):
    df['ranking_position'] = df.groupby('group_key')['Price'].rank(method="dense", ascending=True) - 1
    return df

train_df = assign_ranking_labels(train_df)
test_df = assign_ranking_labels(test_df)

# Convert ranking position to integer
train_df['ranking_position'] = train_df['ranking_position'].astype(int)
test_df['ranking_position'] = test_df['ranking_position'].astype(int)

# 🚀 Debugging: Check label distribution
print("Max ranking per group (train):\n", train_df.groupby('group_key')['ranking_position'].max().describe())
print("Max ranking per group (test):\n", test_df.groupby('group_key')['ranking_position'].max().describe())

# Shuffle training data while preserving groups
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Extract training data
X_train = train_df[features]
y_train = train_df['ranking_position']

X_test = test_df[features]
y_test = test_df['ranking_position']

# Get number of hotels per group (for ranking loss calculation)
train_groups = train_df.groupby('group_key').size().tolist()
test_groups = test_df.groupby('group_key').size().tolist()

# Debugging: Check if train_groups and test_groups match expected values
print("Max hotels per group in train:", max(train_groups))
print("Max hotels per group in test:", max(test_groups))

# LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train, group=train_groups)
valid_data = lgb.Dataset(X_test, label=y_test, group=test_groups, reference=train_data)

# 🚀 Force LightGBM to accept natural ranking range 🚀
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'boosting_type': 'gbdt',
    'verbose': -1,  # Silence logs
}

# Train the ranking model
ranker = lgb.train(
    params, 
    train_data, 
    valid_sets=[valid_data],  # Validation set
    num_boost_round=100
)

print("Ranking Model Trained Successfully!")

Max ranking per group (train):
 count    225.000000
mean      82.328889
std        6.658662
min       58.000000
25%       80.000000
50%       84.000000
75%       87.000000
max       94.000000
Name: ranking_position, dtype: float64
Max ranking per group (test):
 count    225.000000
mean      82.182222
std        6.757116
min       61.000000
25%       79.000000
50%       84.000000
75%       87.000000
max       94.000000
Name: ranking_position, dtype: float64
Max hotels per group in train: 100
Max hotels per group in test: 100


LightGBMError: Label 65 is not less than the number of label mappings (31)

In [40]:
# Predict ranking scores for test data
test_df['predicted_score'] = ranker.predict(X_test)

# Rank hotels within each group based on predicted scores
test_df['predicted_rank'] = test_df.groupby('group_key')['predicted_score'].rank(method="first", ascending=False).astype(int)

# 🚀 Debugging: Check ranking distribution
print("Predicted ranking per group (test):\n", test_df.groupby('group_key')['predicted_rank'].max().describe())

# Sort test set by original ranking position for evaluation
test_df = test_df.sort_values(by=['group_key', 'ranking_position']).reset_index(drop=True)

# Sort test set by predicted ranking for comparison
test_df_sorted = test_df.sort_values(by=['group_key', 'predicted_rank']).reset_index(drop=True)


Predicted ranking per group (test):
 count    225.000000
mean      98.982222
std        3.274583
min       76.000000
25%      100.000000
50%      100.000000
75%      100.000000
max      100.000000
Name: predicted_rank, dtype: float64


In [41]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Compute ranking errors
mse = mean_squared_error(test_df['ranking_position'], test_df['predicted_rank'])
mae = mean_absolute_error(test_df['ranking_position'], test_df['predicted_rank'])

print(f"Ranking Error (MSE): {mse:.4f}")
print(f"Ranking Error (MAE): {mae:.4f}")


Ranking Error (MSE): 2474.8267
Ranking Error (MAE): 41.3242
