In [135]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [136]:
# Load the labelled training data; the path is relative to the working directory.
train = pd.read_csv('Train.csv')
# Preview the first five rows.
train.head()

Unnamed: 0,Trip ID,User ID,Destination,Departure Date,Return Date,Travel Type,Transportation Mode,Hotel Rating,Total Cost ($),Customer_Satisfaction
0,TRIP001,USER6303,Sydney,28-07-2024,14-09-2024,Cultural,Bus,4,2297.38,1
1,TRIP002,USER1398,Tokyo,04-01-2025,20-02-2025,Leisure,Flight,3,4171.03,1
2,TRIP003,USER8452,New York,14-09-2024,06-03-2025,Business,Cruise,4,695.11,3
3,TRIP004,USER5289,Sydney,06-07-2024,14-08-2024,Adventure,Flight,2,1371.81,3
4,TRIP005,USER9783,New York,26-08-2024,25-11-2024,Cultural,Bus,2,691.52,7


In [137]:
# (rows, columns) of the training frame.
train.shape

(19800, 10)

In [138]:
# Number of training rows that repeat an earlier row in every column.
train.duplicated().sum()

0

In [139]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
train.describe()

Unnamed: 0,Hotel Rating,Total Cost ($),Customer_Satisfaction
count,19800.0,19800.0,19800.0
mean,3.000202,2739.43811,5.511212
std,1.407734,1301.720853,2.879006
min,1.0,500.04,1.0
25%,2.0,1606.9325,3.0
50%,3.0,2730.6,6.0
75%,4.0,3862.11,8.0
max,5.0,4999.58,10.0


In [140]:
# Missing-value count per column of the training frame.
train.isnull().sum()

Trip ID                  0
User ID                  0
Destination              0
Departure Date           0
Return Date              0
Travel Type              0
Transportation Mode      0
Hotel Rating             0
Total Cost ($)           0
Customer_Satisfaction    0
dtype: int64

In [141]:
# Load the test data; the path is relative to the working directory.
test = pd.read_csv('Test.csv')
# Preview the first five rows.
test.head()

Unnamed: 0,Trip ID,User ID,Destination,Departure Date,Return Date,Travel Type,Transportation Mode,Hotel Rating,Total Cost ($),Customer_Satisfaction
0,TRIP10651,USER1429,Tokyo,24-11-2024,26-12-2024,Family,Flight,2,2773.07,
1,TRIP02042,USER8789,Rome,03-12-2024,21-02-2025,Cultural,Train,1,3809.85,
2,TRIP08669,USER1090,Barcelona,04-07-2024,11-02-2025,Leisure,Flight,5,2648.33,
3,TRIP01115,USER5405,Paris,17-06-2024,19-08-2024,Family,Flight,3,3802.0,
4,TRIP13903,USER9119,Dubai,18-11-2024,04-01-2025,Cultural,Train,2,2982.42,


In [142]:
# (rows, columns) of the test frame.
test.shape

(200, 10)

In [143]:
# Number of test rows that repeat an earlier row in every column.
test.duplicated().sum()

0

In [144]:
# Missing-value count per column of the test frame.
test.isnull().sum()

Trip ID                    0
User ID                    0
Destination                0
Departure Date             0
Return Date                0
Travel Type                0
Transportation Mode        0
Hotel Rating               0
Total Cost ($)             0
Customer_Satisfaction    200
dtype: int64

In [145]:
# How often each distinct target value occurs in the training data.
train['Customer_Satisfaction'].value_counts()

Customer_Satisfaction
10    2050
7     2022
1     1997
4     1988
3     1971
6     1971
5     1960
8     1953
2     1948
9     1940
Name: count, dtype: int64

In [146]:
# Stack train on top of test so the feature engineering below is applied to both
# in a single pass.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it the test
# rows keep their own 0-based labels, which duplicate labels already used by the
# train rows and make any label-based lookup or alignment ambiguous.
combined = pd.concat([train, test], axis=0, ignore_index=True)
combined.head()

Unnamed: 0,Trip ID,User ID,Destination,Departure Date,Return Date,Travel Type,Transportation Mode,Hotel Rating,Total Cost ($),Customer_Satisfaction
0,TRIP001,USER6303,Sydney,28-07-2024,14-09-2024,Cultural,Bus,4,2297.38,1.0
1,TRIP002,USER1398,Tokyo,04-01-2025,20-02-2025,Leisure,Flight,3,4171.03,1.0
2,TRIP003,USER8452,New York,14-09-2024,06-03-2025,Business,Cruise,4,695.11,3.0
3,TRIP004,USER5289,Sydney,06-07-2024,14-08-2024,Adventure,Flight,2,1371.81,3.0
4,TRIP005,USER9783,New York,26-08-2024,25-11-2024,Cultural,Bus,2,691.52,7.0


In [147]:
# (rows, columns) of the stacked train + test frame.
combined.shape

(20000, 10)

In [148]:
# Number of distinct values in 'Trip ID'.
combined['Trip ID'].nunique()

20000

In [149]:
# Number of distinct values in 'User ID'.
combined['User ID'].nunique()

8041

In [150]:
# Number of distinct values in 'Destination'.
combined['Destination'].nunique()

10

In [151]:
# Number of distinct values in 'Travel Type'.
combined['Travel Type'].nunique()

6

In [152]:
# Number of distinct values in 'Transportation Mode'.
combined['Transportation Mode'].nunique()

5

In [153]:
# Discard the 'Trip ID' and 'User ID' columns before modelling.
combined = combined.drop(columns=['Trip ID', 'User ID'])

In [154]:
# Both date columns are day-month-year strings; parse them into datetimes.
for date_col in ('Departure Date', 'Return Date'):
    combined[date_col] = pd.to_datetime(combined[date_col], format='%d-%m-%Y')

In [155]:
# Trip length in whole days: return date minus departure date.
combined['Trip Duration'] = (
    combined['Return Date'].sub(combined['Departure Date']).dt.days
)

In [156]:
# The raw dates are no longer needed once 'Trip Duration' has been derived.
combined = combined.drop(columns=['Departure Date', 'Return Date'])

In [162]:
# Replace each categorical column with integer codes.
# One fitted encoder is kept per column so the codes can later be mapped back to
# the original labels with encoders[col].inverse_transform(...). Previously a
# single encoder was created but never used, and the per-column encoders were
# discarded as soon as they had been applied.
categorical_cols = ['Destination', 'Travel Type', 'Transportation Mode']
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    combined[col] = encoders[col].fit_transform(combined[col])

In [164]:
# Preview the frame after encoding the categorical columns.
combined.head()

Unnamed: 0,Destination,Travel Type,Transportation Mode,Hotel Rating,Total Cost ($),Customer_Satisfaction,Trip Duration
0,8,3,0,4,2297.38,1.0,48
1,9,5,3,3,4171.03,1.0,47
2,5,2,2,4,695.11,3.0,173
3,8,0,3,2,1371.81,3.0,39
4,5,3,0,2,691.52,7.0,91


In [166]:
# Split the stacked frame back into its train and test parts by position
# (train rows were concatenated first, test rows after them).
# The boundary comes from len(train) rather than a hard-coded row count, so the
# split stays correct if the input files change size. .copy() makes each part an
# independent frame rather than a slice of `combined`.
n_train = len(train)
newtrain = combined.iloc[:n_train].copy()
newtest = combined.iloc[n_train:].copy()

In [168]:
# (rows, columns) of the processed training part.
newtrain.shape

(19800, 7)

In [170]:
# (rows, columns) of the processed test part.
newtest.shape

(200, 7)

In [172]:
# Preview the processed training rows.
newtrain.head()

Unnamed: 0,Destination,Travel Type,Transportation Mode,Hotel Rating,Total Cost ($),Customer_Satisfaction,Trip Duration
0,8,3,0,4,2297.38,1.0,48
1,9,5,3,3,4171.03,1.0,47
2,5,2,2,4,695.11,3.0,173
3,8,0,3,2,1371.81,3.0,39
4,5,3,0,2,691.52,7.0,91


In [197]:
# Preview the processed test rows.
newtest.head()

Unnamed: 0,Destination,Travel Type,Transportation Mode,Hotel Rating,Total Cost ($),Customer_Satisfaction,Trip Duration
0,9,4,3,2,2773.07,,32
1,7,3,4,1,3809.85,,80
2,1,5,3,5,2648.33,,222
3,6,4,3,3,3802.0,,63
4,2,3,4,2,2982.42,,47


In [176]:
# Target vector and feature matrix for the labelled rows.
y = newtrain['Customer_Satisfaction']
X = newtrain.drop('Customer_Satisfaction', axis='columns')

In [178]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [180]:
# Two baseline regressors that share the same size and seed settings.
shared_kwargs = dict(n_estimators=100, random_state=42)
rf_model = RandomForestRegressor(**shared_kwargs)
xgb_model = XGBRegressor(**shared_kwargs)

In [182]:
# Train the random forest on the training split, then score the validation split.
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_val)

In [183]:
# Train XGBoost on the training split, then score the validation split.
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_val)

In [184]:
# Validation RMSE for each model: square root of the mean squared error.
rf_rmse, xgb_rmse = (
    np.sqrt(mean_squared_error(y_val, preds))
    for preds in (rf_preds, xgb_preds)
)

In [185]:
# Report both validation scores, one per line.
print(
    f"Random Forest RMSE: {rf_rmse}",
    f"XGBoost RMSE: {xgb_rmse}",
    sep="\n",
)

Random Forest RMSE: 2.974288574643107
XGBoost RMSE: 3.0338525930367863


In [186]:
# Hyper-parameter candidates for XGBoost: 2 * 3 * 3 * 2 = 36 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
)

# 3-fold cross-validated search scored by negated RMSE; n_jobs=-1 runs the fits
# in parallel.
grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [187]:
# The search was scored with negated RMSE, so flip the sign to get the RMSE back.
best_rmse = -grid_search.best_score_
best_params = grid_search.best_params_

print(
    f"Best XGBoost Parameters: {best_params}",
    f"Best XGBoost RMSE: {best_rmse}",
    sep="\n",
)


Best XGBoost Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best XGBoost RMSE: 2.878489804235111


In [199]:
# Inputs for the final fit: every labelled row for training, the unlabelled rows
# for prediction.
# NOTE: this rebinds y_train, which until now held the training-split labels
# produced by train_test_split.
y_train = newtrain['Customer_Satisfaction']
x_train = newtrain.drop(columns=['Customer_Satisfaction'])
x_test = newtest.drop(columns=['Customer_Satisfaction'])

In [209]:
# Final model, built from the parameters the grid search selected instead of a
# hand-copied set of values that can drift from the search result, and with the
# same random_state=42 the searched estimator used, so the refit matches the
# configuration that was cross-validated.
xgb = XGBRegressor(**best_params, random_state=42)

In [211]:
# Fit on all labelled rows, then predict the test rows.
xgb.fit(x_train, y_train)
y_pred = xgb.predict(x_test)

In [221]:
# Wrap the predictions in a single-column frame for export.
solution = pd.DataFrame({'Customer_Satisfaction': y_pred})
solution.head()

Unnamed: 0,Customer_Satisfaction
0,5.47682
1,5.53133
2,5.515862
3,5.544132
4,5.450007


In [223]:
# Write the predictions to Solution.csv (relative path) without the index column.
solution.to_csv('Solution.csv', index = False)