In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import pi

# ML and preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


In [2]:
pd.set_option('display.max_rows', None)    # Show all rows
pd.set_option('display.max_columns', None) # Show all columns
pd.set_option('display.width', None)       # No limit on width
pd.set_option('display.max_colwidth', None) # Show full column contents.


In [3]:
# Data set loading.

transaction_df = pd.read_csv("datasets/transactions.csv")
train_df = pd.read_csv("datasets/train.csv")
test_df = pd.read_csv("datasets/test.csv")
holiday_df = pd.read_csv("datasets/holiday.csv")


In [4]:
transaction_df.head(), train_df.head(), test_df.head()


(          doj         doi  srcid  destid    srcid_region   destid_region  \
 0  2023-03-01  2023-01-30     45      46       Karnataka      Tamil Nadu   
 1  2023-03-01  2023-01-30     46      45      Tamil Nadu       Karnataka   
 2  2023-03-01  2023-01-30     45      47       Karnataka  Andhra Pradesh   
 3  2023-03-01  2023-01-30     47      45  Andhra Pradesh       Karnataka   
 4  2023-03-01  2023-01-30     46       9      Tamil Nadu      Tamil Nadu   
 
   srcid_tier destid_tier  cumsum_seatcount  cumsum_searchcount  dbd  
 0     Tier 1      Tier 1               8.0                76.0   30  
 1     Tier 1      Tier 1               8.0                70.0   30  
 2     Tier 1      Tier 1               4.0               142.0   30  
 3     Tier 1      Tier 1               0.0                68.0   30  
 4     Tier 1       Tier2               9.0               162.0   30  ,
           doj  srcid  destid  final_seatcount
 0  2023-03-01     45      46           2838.0
 1  2023-03-01 

In [5]:
transaction_df.shape, train_df.shape, test_df.shape

((2266100, 11), (67200, 4), (5900, 4))

In [6]:
train_merged = pd.merge(
    train_df,
    transaction_df,
    on=['doj', 'srcid', 'destid'],
    how='left'
)



In [7]:
train_merged['doj'] = pd.to_datetime(train_merged['doj'], format='%Y-%m-%d')
holiday_df['Date'] = pd.to_datetime(holiday_df['Date'], format='%Y-%m-%d')



In [8]:
# Holiday data addition for train data

# Create two separate columns: is_national_holiday and is_regional_holiday
train_merged['is_national_holiday'] = 0
train_merged['is_regional_holiday'] = 0

# Mark national holidays: for all rows where doj matches a national holiday date
national_dates = set(holiday_df.loc[holiday_df['Holiday_Type'].str.lower() == 'national', 'Date'])
train_merged.loc[train_merged['doj'].isin(national_dates), 'is_national_holiday'] = 1

# Mark regional holidays: for rows where (doj, destid_region) matches a regional holiday
regional_holidays = holiday_df.loc[holiday_df['Holiday_Type'].str.lower() != 'national', ['Date', 'destid_region']]
regional_holidays = regional_holidays.drop_duplicates()

# Merge to mark regional holidays
train_merged = train_merged.merge(
    regional_holidays.rename(columns={'Date': 'doj', 'destid_region': 'holiday_region'}),
    how='left',
    left_on=['doj', 'destid_region'],
    right_on=['doj', 'holiday_region']
)
train_merged['is_regional_holiday'] = train_merged['holiday_region'].notnull().astype(int)
train_merged.drop(columns=['holiday_region'], inplace=True)


In [9]:
test_merged = pd.merge(
    test_df,
    transaction_df,
    on=['doj', 'srcid', 'destid'],
    how='left'
)

In [10]:
test_merged['doj'] = pd.to_datetime(test_merged['doj'], format = '%Y-%m-%d')

In [11]:
# holiday date addition for test data

# Create two separate columns: is_national_holiday and is_regional_holiday
test_merged['is_national_holiday'] = 0
test_merged['is_regional_holiday'] = 0

# Mark national holidays: for all rows where doj matches a national holiday date
national_dates = set(holiday_df.loc[holiday_df['Holiday_Type'].str.lower() == 'national', 'Date'])
test_merged.loc[test_merged['doj'].isin(national_dates), 'is_national_holiday'] = 1

# Mark regional holidays: for rows where (doj, destid_region) matches a regional holiday
regional_holidays = holiday_df.loc[holiday_df['Holiday_Type'].str.lower() != 'national', ['Date', 'destid_region']]
regional_holidays = regional_holidays.drop_duplicates()

# Merge to mark regional holidays
test_merged = test_merged.merge(
    regional_holidays.rename(columns={'Date': 'doj', 'destid_region': 'holiday_region'}),
    how='left',
    left_on=['doj', 'destid_region'],
    right_on=['doj', 'holiday_region']
)
test_merged['is_regional_holiday'] = test_merged['holiday_region'].notnull().astype(int)
test_merged.drop(columns=['holiday_region'], inplace=True)

In [12]:

def extract_datetime_features(df):
    # Save original columns order
    orig_cols = list(df.columns)

    # Compute new features
    new_features = {
        'doj_dayofweek': df['doj'].dt.dayofweek,  # Monday=0, Sunday=6
        'doj_month': df['doj'].dt.month,
        'doj_weekofyear': df['doj'].dt.isocalendar().week.astype(int),
        'doj_dayofmonth': df['doj'].dt.day,
        'doj_is_start_of_year': ((df['doj'].dt.month == 1) & (df['doj'].dt.day <= 7)).astype(int),
        'doj_is_end_of_year': ((df['doj'].dt.month == 12) & (df['doj'].dt.day >= 25)).astype(int)
    }

    # Add new features after existing columns
    for feat, values in new_features.items():
        df[feat] = values

    # Reorder columns: original columns first, then new features in the order defined above
    df = df[orig_cols + list(new_features.keys())]

    return df

train_merged = extract_datetime_features(train_merged)
test_merged = extract_datetime_features(test_merged)


In [13]:
# Function: Using DBD data to extract features.

def dbd_feature_extract(df):

    # Daily new seat calculations.

    df['daily_new_seat'] = (
        df.groupby(['doj', 'srcid', 'destid'])['cumsum_seatcount']
        .diff()
    )

    df['daily_new_search'] = (
        df.groupby(['doj', 'srcid', 'destid'])['cumsum_searchcount']
        .diff()
    )

    # Lag features

    df['lag1_seat_dbd'] = df.groupby(['doj', 'srcid', 'destid'])['daily_new_seat'].shift(1).fillna(0)
    df['lag7_seat_dbd'] = df.groupby(['doj', 'srcid', 'destid'])['daily_new_seat'].shift(7).fillna(0)

    df['lag1_search_dbd'] = df.groupby(['doj', 'srcid', 'destid'])['daily_new_search'].shift(1).fillna(0)
    df['lag7_search_dbd'] = df.groupby(['doj', 'srcid', 'destid'])['daily_new_search'].shift(7).fillna(0)
    
    

    return df

train_merged_dbd = dbd_feature_extract(train_merged)
test_merged_dbd = dbd_feature_extract(test_merged)


In [14]:
train_merged_dbd = train_merged_dbd[train_merged_dbd['dbd'] == 15]
test_merged_dbd = test_merged_dbd[test_merged_dbd['dbd'] == 15]

In [15]:
# Fourier features

def ff(df,agg_df):

    df['days_since_start'] = (df['doj'] - agg_df['doj'].min()).dt.days

    # for test data, use min doj as the train's data minimum doj
    # Weekly periodicity: sin/cos terms for 7-day cycle

    df['sin_week'] = np.sin(2 * pi * df['days_since_start'] / 7)
    df['cos_week'] = np.cos(2 * pi * df['days_since_start'] / 7)

    return df

train_final = ff(train_merged_dbd, train_merged)
test_final = ff(test_merged_dbd, train_merged)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['days_since_start'] = (df['doj'] - agg_df['doj'].min()).dt.days
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sin_week'] = np.sin(2 * pi * df['days_since_start'] / 7)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cos_week'] = np.cos(2 * pi * df['days_since_start'] / 7)
A value is tryin

In [16]:
train_final.columns, test_final.columns

(Index(['doj', 'srcid', 'destid', 'final_seatcount', 'doi', 'srcid_region',
        'destid_region', 'srcid_tier', 'destid_tier', 'cumsum_seatcount',
        'cumsum_searchcount', 'dbd', 'is_national_holiday',
        'is_regional_holiday', 'doj_dayofweek', 'doj_month', 'doj_weekofyear',
        'doj_dayofmonth', 'doj_is_start_of_year', 'doj_is_end_of_year',
        'daily_new_seat', 'daily_new_search', 'lag1_seat_dbd', 'lag7_seat_dbd',
        'lag1_search_dbd', 'lag7_search_dbd', 'days_since_start', 'sin_week',
        'cos_week'],
       dtype='object'),
 Index(['route_key', 'doj', 'srcid', 'destid', 'doi', 'srcid_region',
        'destid_region', 'srcid_tier', 'destid_tier', 'cumsum_seatcount',
        'cumsum_searchcount', 'dbd', 'is_national_holiday',
        'is_regional_holiday', 'doj_dayofweek', 'doj_month', 'doj_weekofyear',
        'doj_dayofmonth', 'doj_is_start_of_year', 'doj_is_end_of_year',
        'daily_new_seat', 'daily_new_search', 'lag1_seat_dbd', 'lag7_seat_dbd',
 

In [17]:
# Remove specified features from train_final
features_to_remove = [
    'doj', 'doi', 'srcid_region',
    'destid_region', 'daily_new_seat', 'daily_new_search','dbd',
    
    # 'lag1_seat_dbd', 'lag7_seat_dbd', 'lag1_search_dbd', 'lag7_search_dbd', 'days_since_start', 'sin_week', 'cos_week'
]
train_final = train_final.drop(columns=[col for col in features_to_remove if col in train_final.columns])

        

In [18]:
features_to_remove_test = [
    'doj', 'doi', 'srcid_region',
    'destid_region', 'daily_new_seat', 'daily_new_search','route_key','dbd'

    # 'lag1_seat_dbd', 'lag7_seat_dbd', 'lag1_search_dbd', 'lag7_search_dbd', 'days_since_start', 'sin_week', 'cos_week','lag1_seat_dbd'
]
test_final = test_final.drop(columns=[col for col in features_to_remove_test if col in test_final.columns])

In [19]:
from sklearn.preprocessing import OneHotEncoder

# Features to one-hot encode
onehot_features = ['srcid', 'destid', 'srcid_tier', 'destid_tier']

# Use 'sparse_output' instead of 'sparse' for newer scikit-learn versions
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on train_final
encoder.fit(train_final[onehot_features])

# Transform train and test
train_encoded = encoder.transform(train_final[onehot_features])
test_encoded = encoder.transform(test_final[onehot_features])

# Get new column names
encoded_cols = encoder.get_feature_names_out(onehot_features)

# Convert to DataFrame
import pandas as pd
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_cols, index=train_final.index)
test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_cols, index=test_final.index)

# Drop original columns and concat encoded
train_final = train_final.drop(columns=onehot_features)
test_final = test_final.drop(columns=onehot_features)

train_final = pd.concat([train_final, train_encoded_df], axis=1)
test_final = pd.concat([test_final, test_encoded_df], axis=1)


In [20]:
# Training phase

In [21]:
X = train_final.drop(columns=['final_seatcount'])
y = train_final['final_seatcount']


In [45]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
# Standardize for linear models
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [47]:
# --------------------------
# 📘 Model dictionary
# --------------------------
models = {
    # 'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    # 'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
    'LightGBM': LGBMRegressor(n_estimators=100, random_state=42),
    'CatBoost': CatBoostRegressor(n_estimators=100, random_state=42, verbose=0)
}

# --------------------------
# 🔁 Model Training + Evaluation
# --------------------------
results = []
model_names = []  # List to store model names after each iteration
model_predictions = {}  # Dictionary to store predictions for each model

for name, model in models.items():
    print(f"Training {name}...")

    
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    
    rmse = np.sqrt(mean_squared_error(y_test, preds))

    results.append({
        'Model_name': name,
        'RMSE': float(np.round(rmse, 2))
    })

    test_preds = model.predict(test_final)
    model_names.append(name)  # Store the model name as a variable
    model_predictions[name] = test_preds  # Store predictions for each model

# --------------------------
# 🏁 Display Results
# --------------------------
results_df = pd.DataFrame(results).sort_values(by='RMSE')
print("\n🔍 Model Comparison (sorted by RMSE):\n")
print(results_df.to_string(index=False))



Training XGBoost...
Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2227
[LightGBM] [Info] Number of data points in the train set: 53760, number of used features: 115
[LightGBM] [Info] Start training from score 2003.632533
Training CatBoost...

🔍 Model Comparison (sorted by RMSE):

Model_name   RMSE
  CatBoost 398.68
   XGBoost 400.86
  LightGBM 420.98


In [25]:
# make predictions for test data

In [48]:
test_pred_cb = model_predictions['CatBoost']
test_pred_cb

array([4212.26288505, 2000.89123365, 1103.62965954, ..., 2033.32001011,
       1550.387004  , 2455.67541932])

In [36]:

submission = test_merged_dbd[['route_key']].copy()
submission['final_seatcount'] = test_pred_cb
submission.to_csv("final_submission_catboost.csv", index=False)


In [28]:
# Other approach

In [29]:
# X = train_final.drop(columns=['final_seatcount'])
# y = train_final['final_seatcount']


In [30]:


# import pandas as pd
# import numpy as np

# from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
# from sklearn.metrics import make_scorer, mean_squared_error

# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor



# # --------------------------
# # 2. Time-Series Split
# # --------------------------
# tscv = TimeSeriesSplit(n_splits=3)

# # Custom RMSE scorer (negated for sklearn compatibility)
# rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
#                           greater_is_better=False)

# # --------------------------
# # 3. Define Parameter Grids
# # --------------------------
# param_grids = {
#     'XGBoost': {
#         'n_estimators': [100, 300, 500],
#         'learning_rate': [0.01, 0.05, 0.1],
#         'max_depth': [4, 6, 8, 10],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'reg_alpha': [0, 0.1, 1],
#         'reg_lambda': [1, 5, 10]
#     },
#     'LightGBM': {
#         'n_estimators': [100, 300, 500],
#         'learning_rate': [0.01, 0.05, 0.1],
#         'num_leaves': [31, 50, 100],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'reg_alpha': [0, 0.1, 1],
#         'reg_lambda': [1, 5, 10]
#     },
#     'CatBoost': {
#         'iterations': [100, 300, 500],
#         'learning_rate': [0.01, 0.05, 0.1],
#         'depth': [4, 6, 8, 10],
#         'l2_leaf_reg': [1, 3, 5, 7],
#         'border_count': [32, 64, 128]
#     }
# }

# # --------------------------
# # 4. Initialize Models
# # --------------------------
# models = {
#     'XGBoost': XGBRegressor(random_state=42, verbosity=0),
#     'LightGBM': LGBMRegressor(random_state=42),
#     'CatBoost': CatBoostRegressor(random_state=42, verbose=0)
# }

# # --------------------------
# # 5. Hyperparameter Tuning
# # --------------------------
# best_estimators = {}
# for name, model in models.items():
#     print(f"==> Tuning {name}")
#     param_dist = param_grids[name]
#     search = RandomizedSearchCV(
#         estimator=model,
#         param_distributions=param_dist,
#         n_iter=50,
#         scoring=rmse_scorer,
#         cv=tscv,
#         random_state=42,
#         n_jobs=-1,
#         verbose=1
#     )
#     search.fit(X, y)
#     best_estimators[name] = {
#         'best_model': search.best_estimator_,
#         'best_params': search.best_params_,
#         'best_rmse': -search.best_score_
#     }

# # --------------------------
# # 6. Display Best Results
# # --------------------------
# results = pd.DataFrame([
#     {
#         'Model': name,
#         'Best_RMSE': info['best_rmse'],
#         'Best_Params': info['best_params']
#     }
#     for name, info in best_estimators.items()
# ]).sort_values('Best_RMSE')

# results
