In [5]:
!pip install pandas numpy scikit-learn scipy




In [6]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.sparse.linalg import svds

# Load the dataset
hotel_data = pd.read_csv('Hotel_Goibibo_NoNull.csv')
hotel_data.head()

Unnamed: 0,property_name,state,city,area,address,hotel_star_rating,property_type,hotel_brand,hotel_description,hotel_facilities,...,room_type,room_count,point_of_interest,rating,additional_rating,review_count_by_category,site_review_count,guest_recommendation,latitude,longitude
0,Baragarh Regency,Himachal Pradesh,Manali,Others,"15th Mile, N.H.21,Manali, District Kullu,Himac...",2,Resort,no brand,The standard check-in time is 12:00 PM and the...,Doctor on Call|Dry Cleaning|Laundry Service Av...,...,Deluxe Room,17,Hadimba Temple|Naggar Village|Himalayan Nyingm...,4.0,Service Quality::3.9|Amenities::3.7|Food and D...,positive reviews::74|critical reviews::13|revi...,87.0,85.0,32.139387,77.15466
1,Asian Suites A- 585,Haryana,Gurgaon,Sushant Lok,"A-585, Sushant Lok-1 ,Near Iffco Chowk Metro S...",0,Guest House,no brand,The standard check-in time is 12:00 PM and the...,Airport Transfer Available / Surcharge|Banquet...,...,Deluxe Room With Free WIFI,18,Sushant Lok|Sahara Mall|Amity International Sc...,4.5,Service Quality::4.7|Amenities::4.7|Food and D...,positive reviews::8|critical reviews::0|review...,8.0,87.0,28.472097,77.072546
2,Bevvan Resort,Goa,Goa,Calangute Area,"Cobra Vaddo,Calungate Baga Road, Bardez, Calan...",0,Resort,no brand,The standard check-in time is 12:00 PM and the...,Swimming Pool|Bar / Lounge |Laundry Service Av...,...,Standard Room,15,"Anjuna Beach|Calangute Beach|Titos lane, baga|...",2.5,Service Quality::2.5|Amenities::2.5|Food and D...,positive reviews::1|critical reviews::1|review...,2.0,50.0,15.548398,73.757634
3,Apple Inn Cottage,Himachal Pradesh,Manali,Village Simsa,Simsa,2,Cottage,no brand,The standard check-in time is 12:00 PM and the...,Doctor on Call|Dry Cleaning|Laundry Service Av...,...,Deluxe Room,24,The Mall|Tibetian Monastery|Vashisht Hot Baths...,5.0,Service Quality::5.0|Amenities::5.0|Food and D...,positive reviews::1|critical reviews::0|review...,1.0,100.0,32.223603,77.1859
4,Anmol Hotel Pvt.Ltd,Delhi,Delhi,Paharganj,"8180 Street No.-6,Arakashan Road,Paharganj",2,Hotel,no brand,The standard check-in time is 12:00 PM and the...,Internet Access - Surcharge|Laundry Service Av...,...,Standard Room Non AC,20,Gaffar Market|YMCA Institute|Agrasen Boali|Cha...,2.8,Service Quality::2.7|Amenities::2.6|Food and D...,positive reviews::56|critical reviews::65|revi...,121.0,63.0,28.646777,77.212735


In [7]:
# Features for the model
features = ['room_facilities', 'review_count_by_category', 'guest_recommendation', 'rating', 'hotel_facilities']


In [8]:
# Encode categorical data
label_encoders = {}
for feature in features:
    le = LabelEncoder()
    hotel_data[feature] = le.fit_transform(hotel_data[feature])
    label_encoders[feature] = le


In [None]:
# Standardizing the features
scaler = StandardScaler()
hotel_data[features] = scaler.fit_transform(hotel_data[features])

In [9]:
# Splitting the data into features (X) and target (y)
X = hotel_data[features]
y = hotel_data['rating']

# Train-Test Split (for validation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# K-Nearest Neighbors Regressor
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)


In [11]:
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)


In [12]:
# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [13]:
# Support Vector Regressor (SVR)
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)
svr.fit(X_train, y_train)

In [14]:
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

# Convert the user-item matrix to a sparse matrix
user_item_matrix = hotel_data.pivot_table(index='property_name', columns='city', values='rating').fillna(0)
user_item_sparse_matrix = csr_matrix(user_item_matrix.values)

# Perform SVD on the sparse matrix
U, sigma, Vt = svds(user_item_sparse_matrix, k=50)
sigma = np.diag(sigma)

# Predicted Ratings for Matrix Factorization
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=user_item_matrix.columns, index=user_item_matrix.index)

# The predicted_ratings_df now contains the reconstructed matrix
# You can use this matrix for making predictions


In [15]:
# Predicted Ratings for Matrix Factorization
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=user_item_matrix.columns, index=user_item_matrix.index)


In [16]:
# Take user input for hotel rating, city, and amenities
user_input_rating = float(input("Enter the minimum hotel rating (0-5): "))
user_input_city = input("Enter the city: ").strip().lower()
user_input_amenities = input("Enter amenities (comma-separated): ").strip().lower()

# Filter data based on user input
filtered_data = hotel_data[(hotel_data['rating'] >= user_input_rating) & 
                           (hotel_data['city'].str.lower() == user_input_city)]

# Debugging: Check the number of records matching each filter step
print(f"Total records in dataset: {len(hotel_data)}")
print(f"Records after rating filter: {len(hotel_data[hotel_data['rating'] >= user_input_rating])}")
print(f"Records after city filter: {len(hotel_data[hotel_data['city'].str.lower() == user_input_city])}")

# If no records match, suggest broadening the search criteria
if filtered_data.empty:
    print("\nNo hotels found matching the criteria.")
    print("Suggestions to broaden your search:")
    print("1. Try a lower minimum rating.")
    print("2. Check for alternative city names (e.g., nearby cities).")
    print("3. Reduce the number of required amenities.")
else:
    # Predict ratings using trained models
    knn_preds = knn.predict(filtered_data[features])
    lr_preds = lr.predict(filtered_data[features])
    rf_preds = rf.predict(filtered_data[features])
    svr_preds = svr.predict(filtered_data[features])

    # Combine predictions from different models (weighted average or simple mean)
    filtered_data['predicted_rating'] = (knn_preds + lr_preds + rf_preds + svr_preds) / 4

    # Get top 5 hotels based on the predicted rating
    top_5_hotels = filtered_data.sort_values(by='predicted_rating', ascending=False).head(5)

    # Display the recommendations in a table format
    top_5_hotels = top_5_hotels[['property_name', 'address', 'rating', 'city']]
    print("\nTop 5 Hotel Recommendations:")
    print(top_5_hotels)


Total records in dataset: 3756
Records after rating filter: 3756
Records after city filter: 45

Top 5 Hotel Recommendations:
                   property_name  \
2294  Treebo Globe International   
2235                  The Sonnet   
3371  VISITEL - A BOUTIQUE HOTEL   
1663                  De Sovrani   
2338           Treebo Ivy Suites   

                                                address  rating     city  
2294  opp Sangam Beauty Parlour, Near Fire Station, ...      33  Kolkata  
2235  Block: DD, Plot No: 8, Sector  1, Salt Lake, K...      31  Kolkata  
3371  78, Canal Circular Road, Besides  Apollo Glene...      39  Kolkata  
1663  DD 21, Saltlake City Sector I, near to City ce...      32  Kolkata  
2338  Address-17/a, Ekdalia Place, Near ballygaunge ...      31  Kolkata  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['predicted_rating'] = (knn_preds + lr_preds + rf_preds + svr_preds) / 4


In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Evaluate each model
def evaluate_model(model_name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    r2_percentage = r2 * 100
    
    print(f"{model_name} Performance:")
    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R-squared: {r2:.4f} ({r2_percentage:.2f}%)\n")

# Assuming you have test data (X_test and y_test)
# Make predictions using each model
knn_preds = knn.predict(X_test)
lr_preds = lr.predict(X_test)
rf_preds = rf.predict(X_test)
svr_preds = svr.predict(X_test)

# Evaluate each model
evaluate_model("KNN", y_test, knn_preds)
evaluate_model("Linear Regression", y_test, lr_preds)
evaluate_model("Random Forest Regressor", y_test, rf_preds)
evaluate_model("Support Vector Regressor (SVR)", y_test, svr_preds)

# If you have the predicted ratings from Matrix Factorization:
# evaluate_model("Matrix Factorization", y_test, matrix_factorization_preds)

# Combine predictions (optional)
final_preds = (knn_preds + lr_preds + rf_preds + svr_preds) / 4
evaluate_model("Combined Model", y_test, final_preds)


KNN Performance:
MAE: 4.4072
MSE: 50.3007
RMSE: 7.0923
R-squared: 0.7611 (76.11%)

Linear Regression Performance:
MAE: 0.0000
MSE: 0.0000
RMSE: 0.0000
R-squared: 1.0000 (100.00%)

Random Forest Regressor Performance:
MAE: 0.0021
MSE: 0.0004
RMSE: 0.0206
R-squared: 1.0000 (100.00%)

Support Vector Regressor (SVR) Performance:
MAE: 5.6578
MSE: 72.6209
RMSE: 8.5218
R-squared: 0.6551 (65.51%)

Combined Model Performance:
MAE: 2.3652
MSE: 13.0960
RMSE: 3.6188
R-squared: 0.9378 (93.78%)

