In [711]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer

In [712]:
df_attractions=pd.read_csv("Attractions.csv")
df_attractions.head()

Unnamed: 0,ID,Name,Description,Open Timings,Location,Category,Rating,Review Count
0,1,Toronto Island Park,"Toronto's outdoor haven: bike trails, picnics ...",10:00 AM - 5:00 PM,Toronto,"Islands, Parks",4.5,8564
1,2,Canadian Warplane Heritage Museum,Canadian Warplane Heritage Museum features an ...,9:00 AM- 5:00PM,Hamilton,Speciality Museums,4.5,1119
2,3,Jungle Cat World Wildlife Park,Jungle Cat World Wildlife Park is a wildlife s...,10:00 AM- 5:00 PM,Orono,"Nature and Wildlife Areas, Zoos",4.5,1081
3,4,The Grotto,The Grotto is a natural sea cave located withi...,,Tobermory,"Points of Interest, Landmarks",4.5,1261
4,5,Canada Aviation and Space museum,Located on a former military air base just 5 k...,10:00 AM - 5:00 PM,Ottawa,Speciality Museums,4.5,1454


In [713]:
df_user_preferences=pd.read_csv("User Preferences.csv")
df_user_preferences.head()

Unnamed: 0,User ID,Preferred Categories,Preferred Locations,Past Visit History,Rating and Review History
0,1001,"Islands, Parks, Waterfalls","Toronto, Niagara Falls","Toronto Islands, Niagara falls, Albion Falls","Toronto Islands: 4.5 stars, Niagara Falls: 5 s..."
1,1002,"Wineries and Vineyards, Gardens","Niagara-on-the-Lake, Prince Edward County","Peller Estates Winery, Butchart Gardens","Peller Estates: 4.7 stars, Butchart Gardens: 4..."
2,1003,"National Parks, Hiking Trails","Algonquin Park, Bruce Peninsula National Park","Algonquin Park, Bruce Peninsula National Park","Algonquin Park: 5 stars, Bruce Peninsula Natio..."
3,1004,"Nature and Wildlife Areas, Zoos","Toronto, Hamilton","Toronto Zoo, Royal Botanical Gardens","Toronto Zoo: 4.6 stars, Royal Botanical Garden..."
4,1005,"Landmarks, Points of Interest","Ottawa, Kingston","Parliament Hill, Thousand Islands","Parliament Hill: 4.8 stars, Thousand Islands: ..."


## EDA


In [714]:
# df_expanded = df.assign(Category=df['Category'].str.split(', ')).explode('Category')
# df_expanded

In [715]:
# import matplotlib.pyplot as plt
# category_counts = df_expanded['Category'].value_counts()
# plt.figure(figsize=(10, 6))
# category_counts.plot(kind='bar')
# plt.title('Number of Values in Each Category')
# plt.xlabel('Category')
# plt.ylabel('Count')
# plt.xticks(rotation=45, ha='right')
# plt.tight_layout()
# plt.show()

## Data Cleaning

In [716]:
#describe attractions dataframe
df_attractions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413 entries, 0 to 412
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            413 non-null    int64  
 1   Name          413 non-null    object 
 2   Description   413 non-null    object 
 3   Open Timings  240 non-null    object 
 4   Location      413 non-null    object 
 5   Category      413 non-null    object 
 6   Rating        413 non-null    float64
 7   Review Count  413 non-null    int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 25.9+ KB


In [717]:
#Checking for null values
df_attractions.isnull().sum()

ID                0
Name              0
Description       0
Open Timings    173
Location          0
Category          0
Rating            0
Review Count      0
dtype: int64

In [718]:
#describing user preferences dataframe
df_user_preferences.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   User ID                    50 non-null     int64 
 1   Preferred Categories       50 non-null     object
 2   Preferred Locations        50 non-null     object
 3   Past Visit History         50 non-null     object
 4   Rating and Review History  50 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.1+ KB


In [719]:
#checking for any null_values
df_user_preferences.isnull().sum()

User ID                      0
Preferred Categories         0
Preferred Locations          0
Past Visit History           0
Rating and Review History    0
dtype: int64

In [720]:
#preprocess Attractions data 
#Dropping irrevelant columns
columns_to_drop=['Open Timings']

df_attractions=df_attractions.drop(columns=columns_to_drop)
df_attractions

Unnamed: 0,ID,Name,Description,Location,Category,Rating,Review Count
0,1,Toronto Island Park,"Toronto's outdoor haven: bike trails, picnics ...",Toronto,"Islands, Parks",4.5,8564
1,2,Canadian Warplane Heritage Museum,Canadian Warplane Heritage Museum features an ...,Hamilton,Speciality Museums,4.5,1119
2,3,Jungle Cat World Wildlife Park,Jungle Cat World Wildlife Park is a wildlife s...,Orono,"Nature and Wildlife Areas, Zoos",4.5,1081
3,4,The Grotto,The Grotto is a natural sea cave located withi...,Tobermory,"Points of Interest, Landmarks",4.5,1261
4,5,Canada Aviation and Space museum,Located on a former military air base just 5 k...,Ottawa,Speciality Museums,4.5,1454
...,...,...,...,...,...,...,...
408,409,Red Apple Rides,Book an AMAZING guided bike tour or simply ren...,Port Dover,"Historical and Heritage Tours, Bike Tours, Sig...",5.0,32
409,410,Ottawa Biplane Adventures,We offer a bird's-eye view of Ottawa's most be...,Ottawa,Air Tours,4.5,37
410,411,Tours of the 6,Step into Toronto with this 2.5 hour walking t...,Toronto,"City Tours, Walking Tours",5.0,25
411,412,Deep Roots Adventure,Deep Roots Adventure is an Outdoor Adventure b...,Harcourt,"Multi-day Tours, Self-Guided Tours and Rentals...",5.0,16


In [721]:
# #Preprocess categorical variables using one hot encoding
# attractions_features=pd.get_dummies(df_attractions, columns=['Category','Location'])
# attractions_features.head()

In [722]:
# Convert text to lowercase
df_attractions['Description'] = df_attractions['Description'].str.lower()

# Remove punctuation
df_attractions['Description'] = df_attractions['Description'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

# Tokenize the text
df_attractions['Description'] = df_attractions['Description'].apply(word_tokenize)

# Remove stopwords
stop_words = set(stopwords.words('english'))
df_attractions['Description'] = df_attractions['Description'].apply(lambda x: [word for word in x if word not in stop_words])
df_attractions['Description'].head()

0    [torontos, outdoor, bike, trails, picnics, sky...
1    [canadian, warplane, heritage, museum, feature...
2    [jungle, cat, world, wildlife, park, wildlife,...
3    [grotto, natural, sea, cave, located, within, ...
4    [located, former, military, air, base, 5, kilo...
Name: Description, dtype: object

In [723]:
# Convert tokenized lists back into strings
df_attractions['Description'] = df_attractions['Description'].apply(lambda x: ' '.join(x))
# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the description column to TF-IDF vectors
tfidf_vectors = tfidf_vectorizer.fit_transform(df_attractions['Description'])

# Convert TF-IDF vectors to a DataFrame
tfidf_df = pd.DataFrame(tfidf_vectors.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Display the TF-IDF DataFrame
tfidf_df

Unnamed: 0,10,100,1000,10000,100000,10000squarefoot,100meter,100meterhigh,101,105,...,zebu,zimarts,zimbabwean,zip,zipline,zipping,zircon,zone,zoo,été
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
409,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
410,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [724]:
from scipy.sparse import hstack
# Split categories and convert them into lists of strings
categories = df_attractions['Category'].str.split(', ')

# Initialize multi-label binarizer for categories
mlb = MultiLabelBinarizer()

# Fit and transform the split categories to one-hot encoded matrix
category_matrix = mlb.fit_transform(categories)

# Initialize one-hot encoder for locations
encoder = OneHotEncoder()

# Fit and transform the location column to one-hot encoded matrix
location_matrix = encoder.fit_transform(df_attractions[['Location']])

# Concatenate TF-IDF matrix, category matrix, and location matrix into a single features matrix
features_matrix = hstack([tfidf_df, category_matrix, location_matrix])
print(features_matrix)

# Compute cosine similarity between attractions based on the features matrix
similarity_scores = cosine_similarity(features_matrix)
#print(similarity_scores)


  (0, 346)	0.24052230704764604
  (0, 473)	0.21239505489217708
  (0, 537)	0.20072119049598577
  (0, 601)	0.40144238099197155
  (0, 891)	0.25697569480692367
  (0, 1478)	0.25697569480692367
  (0, 1607)	0.25697569480692367
  (0, 1695)	0.1396871628219615
  (0, 2133)	0.25697569480692367
  (0, 3164)	0.1344097307924616
  (0, 3233)	0.2543531747943407
  (0, 3333)	0.21239505489217708
  (0, 3391)	0.19166623779515918
  (0, 4082)	0.2288484426514547
  (0, 4308)	0.25697569480692367
  (0, 4495)	0.18101974592412562
  (0, 4500)	0.13247702994455282
  (0, 4531)	0.13063221012113493
  (0, 4686)	0.14575501811244704
  (0, 4763)	0.20072119049598577
  (1, 287)	0.24786746550458016
  (1, 300)	0.30447528382724776
  (1, 810)	0.2063225797191607
  (1, 1036)	0.21928225783295277
  (1, 1514)	0.27216966578579393
  :	:
  (388, 5251)	1.0
  (389, 5132)	1.0
  (390, 5199)	1.0
  (391, 5193)	1.0
  (392, 5194)	1.0
  (393, 5247)	1.0
  (394, 5112)	1.0
  (395, 5176)	1.0
  (396, 5176)	1.0
  (397, 5174)	1.0
  (398, 5176)	1.0
  (399, 5

In [725]:
def recommend(category, location):
    try:
        # Find attractions with the specified category and location
        attractions_indices = np.where((df_attractions['Category'].str.contains(category)) & (df_attractions['Location'] == location))[0]
        print(attractions_indices)
        #print(similarity_scores[attractions_indices])
        # Sort attractions based on cosine similarity scores
        
        sorted_indices = np.argsort(similarity_scores[attractions_indices], axis=1)
        

        #print(sorted_indices)
        # Flatten the sorted_indices array
        flattened_indices = sorted_indices.flatten()

        # Find the intersection of attractions_indices and flattened_indices
        intersection_indices = np.intersect1d(attractions_indices, flattened_indices)

        # Create a list to store the top recommended attraction indices
        top_indices = []

        # Iterate through the intersection indices to find valid attractions
        for idx in intersection_indices:
            top_indices.append(idx)
            if len(top_indices) == 5:
                break

        # Extract the top recommended attractions from the DataFrame
        top_recommendations = df_attractions.iloc[top_indices]

        # Display the top recommended attractions
        print("Top 5 recommendations for category:", category, "and location:", location)
        print(top_recommendations)

        
    except IndexError:
        print("No attractions found for the specified category and location")


In [726]:
recommend('Parks', 'Toronto')


TypeError: unsupported operand type(s) for &: 'tuple' and 'str'

## Feature Engineering


## Model

## Evaluation