In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from geopy.distance import geodesic
from sklearn.neighbors import NearestNeighbors

KeyboardInterrupt: 

In [None]:
# Load the raw hotel listing from the local CSV export; the file is decoded as ISO-8859-1.
# NOTE(review): this is an absolute, machine-specific path — adjust it when running elsewhere.
hotels_csv_path = r'C:\Users\eng_bigdata\Desktop\huiling\hotels.csv'
df = pd.read_csv(hotels_csv_path, encoding='ISO-8859-1')

# 1. Data Cleaning & Preprocessing

## 1.1 Basic Data Understanding

In [None]:
# Inspect the raw column names; most of them carry a leading space (see the output below).
df.columns

Index(['countyCode', ' countyName', ' cityCode', ' cityName', ' HotelCode',
       ' HotelName', ' HotelRating', ' Address', ' Attractions',
       ' Description', ' FaxNumber', ' HotelFacilities', ' Map',
       ' PhoneNumber', ' PinCode', ' HotelWebsiteUrl'],
      dtype='object')

In [None]:
# Size of the raw dataset as (rows, columns).
df.shape

(1010033, 16)

## 1.2 Geospatial Data Cleaning

In [None]:
# Strip leading/trailing whitespace from every column name (raw headers look like ' HotelName').
df.columns = df.columns.str.strip()

# 'Map' holds "latitude|longitude" strings. n=1 caps the split at two parts, so a malformed
# value containing an extra '|' can never produce more columns than the two being assigned;
# such a value leaves a non-numeric remainder that is coerced to NaN and dropped below.
df[['latitude', 'longitude']] = df['Map'].str.split('|', n=1, expand=True)

# Convert latitude and longitude from string to float; unparseable values become NaN.
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

# The original 'Map' column is no longer needed once it has been split.
df = df.drop(columns=['Map'])

# Keep only rows where both coordinates parsed. The explicit .copy() makes df an independent
# frame, so column assignments in later cells do not operate on a filtered slice
# (the situation pandas reports as SettingWithCopyWarning).
# NOTE(review): numeric but out-of-range values (e.g. |latitude| > 90) are still kept —
# confirm whether they need to be filtered before computing distances.
has_coordinates = df['latitude'].notna() & df['longitude'].notna()
df = df[has_coordinates].copy()


In [None]:
# Size of the dataset after rows without usable coordinates were removed.
df.shape

(1009101, 17)

## 1.3 Handle Missing Values

In [None]:
# Number of missing values in each column.
df.isna().sum()

countyCode            912
countyName              0
cityCode                0
cityName                0
HotelCode               0
HotelName               0
HotelRating             0
Address               101
Attractions        524475
Description         46871
FaxNumber          559594
HotelFacilities     49502
PhoneNumber        326461
PinCode             30383
HotelWebsiteUrl    249240
latitude                0
longitude               0
dtype: int64

In [None]:
# Drop unneeded columns. Rebinding df (instead of inplace=True) avoids mutating a filtered
# slice in place, and errors='ignore' keeps this cell safe to re-run after the columns are gone.
df = df.drop(columns=['FaxNumber', 'PinCode'], errors='ignore')

# Fill missing text fields with '' so later string operations (strip, concatenation) work
# on every row.
text_cols = ['Address', 'Attractions', 'Description', 'HotelFacilities']
df[text_cols] = df[text_cols].fillna('')

# Handle countyCode: label missing codes explicitly instead of leaving NaN.
# NOTE(review): countyName has no missing values while countyCode does — possibly a literal
# code such as "NA" was parsed as missing by read_csv; verify against the raw file.
df['countyCode'] = df['countyCode'].fillna('unknown')


In [None]:
# Filter hotels that have zero useful content (i.e. all 4 text fields empty).
# info_score = number of text fields that contain something other than whitespace.
# Computed column-wise with vectorised .str operations rather than a Python-level
# row-by-row apply, which is very slow on a frame of this size.
df['info_score'] = df[text_cols].apply(lambda col: col.str.strip().ne('')).sum(axis=1)

# Keep hotels with at least 1 non-empty text feature. .copy() makes the result an
# independent frame so later cells can add columns without writing into a slice.
df = df[df['info_score'] >= 1].copy()

In [None]:
# Size of the dataset after dropping hotels with no text content.
df.shape

(1009007, 16)

In [None]:
# Ordinal encoding of the star-rating labels: 'OneStar' -> 1 ... 'FiveStar' -> 5.
rating_labels = ['OneStar', 'TwoStar', 'ThreeStar', 'FourStar', 'FiveStar']
rating_order = {label: stars for stars, label in enumerate(rating_labels, start=1)}
rating_order['All'] = 0  # optional catch-all

# Numeric companion to 'HotelRating' for easier sorting; labels missing from
# rating_order end up as NaN.
rating_values = df['HotelRating'].map(rating_order)
df['RatingValue'] = rating_values


# 2. Recommendation System Modelling

## 2.1 Location-Based Filtering (with optional radius)

In [None]:
from geopy.distance import geodesic

# Function to filter by city and find nearby hotels (based on latitude/longitude)
def filter_by_location(city_name, radius_km=10):
    """Return the hotels of a city that lie within ``radius_km`` of a reference hotel.

    The reference point is the first row found for the city in the global ``df``
    (not a computed city centre).

    Parameters
    ----------
    city_name : str
        City to look up in ``df['cityName']``. Matching is case-insensitive,
        consistent with ``recommend_similar_hotels_priority``.
    radius_km : float, default 10
        Maximum geodesic distance, in kilometres, from the reference hotel.

    Returns
    -------
    pandas.DataFrame
        The matching rows plus a ``distance_km`` column. Empty (but still carrying
        ``distance_km``) when no hotel is listed for ``city_name``.
    """
    # Work on a copy so adding 'distance_km' never writes into a slice of the global df.
    city_mask = df['cityName'].str.lower() == city_name.lower()
    city_hotels = df[city_mask].copy()

    # Unknown city: return an empty result instead of failing on iloc[0].
    if city_hotels.empty:
        city_hotels['distance_km'] = 0.0
        return city_hotels

    # The first hotel listed for the city acts as the reference point.
    base_coord = (city_hotels['latitude'].iloc[0], city_hotels['longitude'].iloc[0])

    # Geodesic distance from the reference point to every hotel in the city.
    # Iterating over the two coordinate columns avoids building a Series per row.
    city_hotels['distance_km'] = [
        geodesic(base_coord, coord).km
        for coord in zip(city_hotels['latitude'], city_hotels['longitude'])
    ]

    # Filter by radius
    return city_hotels[city_hotels['distance_km'] <= radius_km]


## 2.2 Content-Based Filtering (TF-IDF + NearestNeighbors)

In [None]:
# Build one text document per hotel for content-based filtering:
# facilities, attractions and description joined by single spaces.
df['combined_features'] = df['HotelFacilities'].str.cat(
    [df['Attractions'], df['Description']], sep=' '
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Turn each hotel's combined text into a TF-IDF vector: English stop words are
# removed and the vocabulary is capped at 10,000 terms.
hotel_documents = df['combined_features']
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(hotel_documents)

In [None]:
# Fit the NearestNeighbors model
# Brute-force search with the cosine metric over the TF-IDF rows; queries return
# 10 neighbours unless another count is requested.
# NOTE(review): `model` is not referenced by the recommendation function or the
# save cells visible in this notebook — confirm whether it is still needed.
model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10)
model.fit(tfidf_matrix)

## 2.3 Recommend Similar Hotels (Location, Rating, Facilities)

In [None]:
def recommend_similar_hotels_priority(hotel_name, city_name, radius_km=10, top_n=5):
    """Recommend hotels in ``city_name`` that resemble ``hotel_name``.

    Candidates come from the global ``df``. Hotels sharing the selected hotel's
    rating are preferred; if fewer than ``top_n`` exist, the remaining slots are
    filled with the city's other hotels, highest ``RatingValue`` first. Each
    candidate gets a ``Similarity`` score: the cosine similarity between its
    TF-IDF vector and the selected hotel's, both from the global ``vectorizer``.

    Parameters
    ----------
    hotel_name : str
        Exact ``HotelName`` of the reference hotel. Hotel names are not unique
        across cities, so a match inside ``city_name`` is preferred; otherwise
        the first match anywhere in ``df`` is used.
    city_name : str
        City to recommend in (case-insensitive match on ``cityName``).
    radius_km : float, default 10
        Accepted for interface compatibility; currently not used by the function.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows with an added ``Similarity`` column, sorted by
        ``RatingValue`` then ``Similarity`` (both descending). Empty when the
        city has no candidate other than the selected hotel.

    Raises
    ------
    IndexError
        If no hotel named ``hotel_name`` exists in ``df``.
    """
    # Step 1: Filter by city (case-insensitive)
    city_hotels = df[df['cityName'].str.lower() == city_name.lower()]

    # Resolve the reference hotel, preferring a same-named hotel in the requested city
    # over the first same-named hotel anywhere in the dataset.
    name_matches = city_hotels[city_hotels['HotelName'] == hotel_name]
    if name_matches.empty:
        name_matches = df[df['HotelName'] == hotel_name]
    if name_matches.empty:
        raise IndexError(f"No hotel named {hotel_name!r} found in the dataset")
    selected_hotel = name_matches.iloc[0]
    selected_rating = selected_hotel['HotelRating']

    # Exclude the reference hotel up front so the counts below refer to real candidates;
    # otherwise the reference hotel occupies one of the top_n slots and is removed afterwards.
    candidates = city_hotels[city_hotels['HotelName'] != hotel_name]

    # Step 2: Prioritize same rating hotels
    same_rating_hotels = candidates[candidates['HotelRating'] == selected_rating]

    # Step 3: Fall back to the city's other hotels when there are too few same-rating ones
    if len(same_rating_hotels) < top_n:
        other_hotels = candidates[candidates['HotelRating'] != selected_rating]
        other_hotels = other_hotels.sort_values(by='RatingValue', ascending=False)
        fallback_hotels = pd.concat([same_rating_hotels, other_hotels]).head(top_n)
    else:
        fallback_hotels = same_rating_hotels

    fallback_hotels = fallback_hotels.copy()

    # Nothing to score (unknown city, or the city only lists the reference hotel):
    # return an empty result rather than passing zero documents to the vectorizer.
    if fallback_hotels.empty:
        fallback_hotels['Similarity'] = 0.0
        return fallback_hotels

    # Step 4: Compute TF-IDF similarity within these hotels
    tfidf_features = vectorizer.transform(fallback_hotels['combined_features'])
    selected_vector = vectorizer.transform([selected_hotel['combined_features']])
    fallback_hotels['Similarity'] = cosine_similarity(selected_vector, tfidf_features).flatten()

    # Step 5: Sort by rating and similarity
    # NOTE(review): in the fallback case this ranks higher-rated hotels above hotels that
    # share the selected hotel's rating — confirm this ordering is intended.
    results = fallback_hotels.sort_values(by=['RatingValue', 'Similarity'], ascending=[False, False])

    return results.head(top_n)


In [None]:
import joblib

# Persist the fitted TF-IDF vectorizer with joblib.
vectorizer_path = r'C:\Users\eng_bigdata\Desktop\huiling\tfidf_vectorizer.pkl'
joblib.dump(vectorizer, vectorizer_path)

# Persist the cleaned hotel DataFrame as a pandas pickle.
hotel_data_path = r'C:\Users\eng_bigdata\Desktop\huiling\hotel_data.pkl'
df.to_pickle(hotel_data_path)


In [None]:
# Persist the TF-IDF matrix; joblib.dump returns the list of written file names,
# which is what the cell displays.
tfidf_matrix_path = r'C:\Users\eng_bigdata\Desktop\huiling\tfidf_matrix.pkl'
joblib.dump(tfidf_matrix, tfidf_matrix_path)

['C:\\Users\\eng_bigdata\\Desktop\\huiling\\tfidf_matrix.pkl']