In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [2]:
# Load dataset
df = pd.read_csv('D:\Restaurant Recommendation Project\data\clean_data.csv')
df.head()

Unnamed: 0,name,online_order,book_table,rate,location,cuisines,cost,reviews_list,city,Mean Rating,normalized_cost,normalized_rating,text
0,Jalsa,True,True,4.1,Banashankari,"North Indian, Mughlai, Chinese",800.0,rated 5 0 rated n great ambience looking nice ...,Banashankari,4.118182,0.841939,0.747801,north indian mughlai chinese banashankari rate...
1,Spice Elephant,True,False,4.1,Banashankari,"Chinese, North Indian, Thai",800.0,rated 5 0 rated n great ambience looking nice ...,Banashankari,4.1,0.841939,0.741935,chinese north indian thai banashankari rated 5...
2,San Churro Cafe,True,False,3.8,Banashankari,"Cafe, Mexican, Italian",800.0,rated 5 0 rated n great ambience looking nice ...,Banashankari,3.8,0.841939,0.645161,cafe mexican italian banashankari rated 5 0 ra...
3,Addhuri Udupi Bhojana,False,False,3.7,Banashankari,"South Indian, North Indian",300.0,rated 5 0 rated n great ambience looking nice ...,Banashankari,3.7,0.315068,0.612903,south indian north indian banashankari rated 5...
4,Grand Village,False,False,3.8,Basavanagudi,"North Indian, Rajasthani",600.0,rated 5 0 rated n great ambience looking nice ...,Banashankari,3.8,0.631191,0.645161,north indian rajasthani basavanagudi rated 5 0...


In [16]:
import re

# Function to clean unwanted characters in a string
def clean_string(text):
    # Remove any non-ASCII characters
    # Replacing non-ASCII characters with an empty string
    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    # Optionally, replace multiple spaces or unwanted spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

# Apply the function to the 'name' column to clean it
df['name'] = df['name'].apply(lambda x: clean_string(x) if isinstance(x, str) else x)

# Check the cleaned 'name' column
print(df['name'].head())


0                    Jalsa
1           Spice Elephant
2          San Churro Cafe
3    Addhuri Udupi Bhojana
4            Grand Village
Name: name, dtype: object


### "Due to memory constraints, I utilized 40% of the dataset."


In [17]:
# Randomly sample 60% of the data 
df_sample = df.sample(frac=0.4, random_state=42)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Transform the 'text' column into a matrix of TF-IDF features
tfidf_matrix = vectorizer.fit_transform(df_sample['text'])

# Compute the cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Example: Check the similarity of the first restaurant with others
print(cosine_sim[0])


[1.         0.95472335 0.95254151 ... 0.94348103 0.93333741 0.94502416]


In [25]:
def get_content_based_recommendations(restaurant_name, top_n=5):
    # Get the index of the restaurant that matches the restaurant_name
    idx = df_sample[df_sample['name'] == restaurant_name].index[0]
    
    # Get the pairwise similarity scores of all restaurants with that restaurant
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the restaurants based on similarity scores (highest first)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Exclude the input restaurant by starting from the second element (to avoid self-similarity)
    sim_scores = sim_scores[1:]
    
    # List to store recommended restaurants
    recommended_restaurants = []
    
    # Get the top N most similar restaurants (from the sorted list)
    for i in sim_scores:
        restaurant = df_sample['name'].iloc[i[0]]
        
        # Ensure the restaurant is not the input restaurant and not already in the recommendations list
        if restaurant != restaurant_name and restaurant not in recommended_restaurants:
            recommended_restaurants.append(restaurant)
        
        # Stop once we have the exact top N recommendations
        if len(recommended_restaurants) == top_n:
            break
    
    return recommended_restaurants

# Test the recommendation function 
recommended = get_content_based_recommendations('Jalsa', top_n=5)
print("Recommended Restaurants:", recommended)


Recommended Restaurants: ["Royce' Chocolate", 'Yauatcha PTisserie', 'Sugar & Spice - Taj Mg Road Bengaluru', 'Ice N Spice', 'Nithyothsav - Hotel Ramanashree']


In [26]:
import pickle

# Save the function separately
with open('models/get_content_based_recommendations.pkl', 'wb') as f:
    pickle.dump(get_content_based_recommendations, f)

# Save the DataFrame (df_sample) separately
with open('models/df_sample.pkl', 'wb') as f:
    pickle.dump(df_sample, f)

# Save the cosine similarity matrix (cosine_sim) separately
with open('models/cosine_sim.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)

print("Function and dependencies saved successfully!")


Function and dependencies saved successfully!


In [27]:
# Load the function
with open('models/get_content_based_recommendations.pkl', 'rb') as f:
    get_content_based_recommendations = pickle.load(f)

# Load the DataFrame (df_sample)
with open('models/df_sample.pkl', 'rb') as f:
    df_sample = pickle.load(f)

# Load the cosine similarity matrix (cosine_sim)
with open('models/cosine_sim.pkl', 'rb') as f:
    cosine_sim = pickle.load(f)

# Test the recommendation function (example: for 'Jalsa')
recommended = get_content_based_recommendations('Jalsa', top_n=5)
print("Recommended Restaurants:", recommended)


Recommended Restaurants: ["Royce' Chocolate", 'Yauatcha PTisserie', 'Sugar & Spice - Taj Mg Road Bengaluru', 'Ice N Spice', 'Nithyothsav - Hotel Ramanashree']
