In [74]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Bag-of-words encoder used to turn each location's tag string into token counts.
# The vocabulary is capped at 5000 terms. NOTE(review): the stop-word list holds
# only the empty string, which presumably never matches a real token, so in
# effect no stop words are removed -- confirm this is intended.
vectorizer = CountVectorizer(
    max_features=5000,
    stop_words=[""],
)

In [75]:
# Load the location/tag table from a CSV file located next to the notebook.
data_df = pd.read_csv("./Data2.csv")

In [76]:
# Show the loaded table; as the cell's last expression it is rendered as the cell output.
data_df

Unnamed: 0,id,location,tags
0,1,Campus Hub,Commonwealth chinese rice noodles shrimp fried...
1,2,Chaleur Restaurant,Statistics jollof rice banku okro soup pepper ...
2,3,Commonwealth Hall,chinese rice noodles shrimp fried grilled chic...
3,4,Statistics Department,Chaleur Restaurant jollof rice banku okro soup...
4,5,Royal Mic Jean,N-Block balls Botany sausage indomie meat pie...
5,6,Botany Department,gizzard N-Block sausage indomie meat pie cor...
6,7,N-Block,fried black pepper fufu omotuo palmnut groundn...
7,8,Volta Hall,kenkey sardine fish tilapia green black pepper...
8,9,Legon Hall,waakye chicken sardine rice jollof fried meat ...
9,10,Alpha food,bush canteen banku omotuo fufu akple konkonte


In [77]:
# Summarise the frame: number of rows, column names, non-null counts and dtypes.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        40 non-null     int64 
 1   location  40 non-null     object
 2   tags      40 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.1+ KB


In [78]:
# Count the missing values in each column (a zero means the column is complete).
data_df.isna().sum()

id          0
location    0
tags        0
dtype: int64

In [52]:
def remove_spaces(text):
    """Return ``text`` with surplus spaces removed.

    Runs of consecutive spaces are collapsed to a single space and leading or
    trailing spaces are dropped, so the words of a tag string end up separated
    by exactly one space.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The cleaned string; an empty or all-space input yields ``""``.
    """
    # Splitting on a single space turns each extra space into an empty-string
    # piece; dropping those pieces and re-joining collapses the runs.
    return " ".join(piece for piece in text.split(" ") if piece)
    

<p>Tokenization is the process of breaking down a text into smaller units called tokens. These tokens can be words, phrases, symbols, or other elements of the text that are relevant for a particular NLP task. Tokenization is a crucial step in text processing and analysis, as it allows the individual components of the text to be analyzed and manipulated.</p>

<p>For example, in the sentence "The quick brown fox jumps over the lazy dog.", the tokens would be ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]. These tokens can then be used as the basic building blocks for various NLP tasks such as text classification and sentiment analysis.</p>

<p>CountVectorizer is a tool in scikit-learn for converting a collection of text documents to a matrix of token counts. It tokenizes the input text into individual words (tokens) and then calculates the frequency of each token in each document. The resulting matrix is commonly referred to as a "bag of words" representation of the text data, and can then be used as input for various machine learning algorithms. CountVectorizer has options for lowercasing, removing stop words, and extracting n-grams, which allow further customization of the tokenization process.
</p>

In [79]:
# Preview the first five rows before vectorizing the "tags" column.
data_df.head()

Unnamed: 0,id,location,tags
0,1,Campus Hub,Commonwealth chinese rice noodles shrimp fried...
1,2,Chaleur Restaurant,Statistics jollof rice banku okro soup pepper ...
2,3,Commonwealth Hall,chinese rice noodles shrimp fried grilled chic...
3,4,Statistics Department,Chaleur Restaurant jollof rice banku okro soup...
4,5,Royal Mic Jean,N-Block balls Botany sausage indomie meat pie...


In [80]:
# Fit the vectorizer on the "tags" column and convert the resulting token-count
# matrix into a dense array with one row per location.
vectors = (
    vectorizer
    .fit_transform(data_df["tags"])
    .toarray()
)


In [81]:
# Inspect the token-count vector of the first row.
vectors[0]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0], dtype=int64)

In [82]:
# Cosine similarity between every pair of rows of `vectors`;
# similarity[i] holds the scores of row i against all rows.
similarity = cosine_similarity(vectors)

In [83]:
# Scores of row 2 against every row. Despite the name these are similarities
# (higher means more alike), not distances.
distances = similarity[2]

# Pair each row position with its score and rank by descending score. The slice
# skips the top-ranked entry (presumably row 2 itself -- verify) and keeps the
# following nine.
restaurant_list = sorted(
    enumerate(distances),
    key=lambda pair: pair[1],
    reverse=True,
)[1:10]

In [84]:
# Show the ranked (row position, similarity score) pairs.
restaurant_list

[(0, 0.7273929674533078),
 (38, 0.5238095238095238),
 (39, 0.511766315719159),
 (36, 0.4652421051992354),
 (37, 0.44543540318737407),
 (8, 0.3704792868174742),
 (20, 0.3289758474798845),
 (21, 0.24209101306752098),
 (13, 0.23145502494313785)]

In [59]:
# Print the name of each ranked location. The indices in `restaurant_list` are
# row positions (they come from enumerate), so look the names up positionally
# with .iloc instead of by index label.
for index, sim in restaurant_list:
    print(data_df["location"].iloc[index])

Commonwealth Hall
Sha-shill
ElizabethSey
Village Canteen
Akuafo Hall
Legon Hall
Meluvs
Easy eats Pork chops
Pleasure palace


In [60]:
def recommend(location, top_n=9):
    """Recommend the locations whose tags are most similar to ``location``.

    The module-level ``vectorizer`` is re-fitted on ``data_df["tags"]`` on every
    call, the pairwise cosine similarity of the resulting count vectors is
    computed, and the other rows are ranked by their similarity to the queried
    row.

    Parameters
    ----------
    location : str
        Exact value to look up in ``data_df["location"]``. If several rows
        carry the same name, the first one is used as the query.
    top_n : int, optional
        Maximum number of recommendations to return. Defaults to 9, the number
        this function has always returned.

    Returns
    -------
    list or str
        Location names ordered from most to least similar, never including the
        queried row itself. If ``location`` is not present in the data, the
        string ``"Location can not be found"`` is returned instead.

    Notes
    -----
    Only the failed lookup is turned into the "not found" message; any other
    error (for example a missing ``tags`` column) propagates to the caller
    rather than being reported as a missing location.
    """
    names = data_df["location"].tolist()

    # Work with row positions throughout: the similarity matrix is indexed by
    # position, which only coincides with index labels for a default index.
    try:
        location_pos = names.index(location)
    except ValueError:
        return "Location can not be found"

    vectors = vectorizer.fit_transform(data_df["tags"]).toarray()
    similarity = cosine_similarity(vectors)
    distances = similarity[location_pos]

    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Exclude the queried row explicitly instead of assuming it is ranked first;
    # a row with identical tags ties with it and could otherwise displace it.
    others = [names[pos] for pos, _score in ranked if pos != location_pos]
    return others[:top_n]

In [61]:
# Example query: print the recommendations for "Night Market".
print(recommend("Night Market"))

["Fingerlickin'", 'Daavi feli', 'Eden Breakfast', 'Easy eats Pork chops', 'Favoured One indomie', 'Sha-shill', 'ElizabethSey', 'Hajia Special Koko', 'Commonwealth Hall']
