<a id='menu'></a>
# Menu

[1. Data Processing](#data_proc)

[2. Geolocalisation](#geo_loc)

[3. Popularity](#pop)

[4. Content Based](#content)

[5. Collaborative Filtering](#cf)

- [5.1 Model Based SVD](#cf_model)
    
- [5.2 Memory Based Memory](#cf_memory)
    

# Libraries

**Machine Learning**

In [1]:
import pandas as pd
import numpy as np
import nltk
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

**Word2Vec**

In [2]:
from nltk.tokenize import word_tokenize
import gensim 
from gensim.models import Word2Vec
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hongphuc95/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Visualization**

In [3]:
import folium
import html

In [4]:
import time

**Path**

In [5]:
data_path = "/home/hongphuc95/notebookteam/dataset/"

In [6]:
api_path = "/home/hongphuc95/notebookteam/api/"

In [2]:
import sys
pathModulesES = '../sauceforyall/'
sys.path.append(pathModulesES)
from yelpquery import YelpQuery
from pandasticsearch import Select
ye = YelpQuery()

In [3]:
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

<a id='data_proc'></a>
# 1. Load data

[Back to menu](#menu)

In [9]:
business_df = pd.read_json(data_path + "business.json", lines=True)

In [10]:
business_df = business_df.dropna(subset=["categories"])

In [11]:
#review_df = pd.read_json(data_path + "cleaned/review_cleaned_2016_2019.json", lines=True)

## 1.1 Useful functions

In [12]:
def business_details(business_df, review_df):
    business_df = business_df[["business_id", "name", "categories", "address", "city", "state", "latitude", "longitude", "stars", "review_count"]]
    new_df = pd.merge(business_df, review_df, how="inner", on="business_id")
    return new_df

In [13]:
def show_map(loc, radius=2000, popularity=None, contentbased=None, cf=None):
    lat = loc["latitude"]
    long = loc["longitude"]
    mp = folium.Map(location=[lat, long], zoom_start=12, radius=200000, fill_color='#3186cc', line_color='#3186cc')
    
    if popularity is not None:
        for _, r in popularity.iterrows():
            folium.Marker(
                        location =[r.latitude, r.longitude], 
                        popup = html.escape(
                            r["name"]) + '<br>' + 
                            'Stars: ' + str(r.stars) + '<br>' + 
                            'Reviews: ' + str(r.review_count) + '<br>' + 
                            'Categories: ' + str(r.categories) + '<br>',    
                        icon = folium.Icon(color='blue')).add_to(mp)

    if contentbased is not None:
        for _, r in contentbased.iterrows():
            folium.Marker(
                        location =[r.latitude, r.longitude], 
                        popup = html.escape(
                            r["name"]) + '<br>' + 
                            'Stars: ' + str(r.stars) + '<br>' + 
                            'Reviews: ' + str(r.review_count) + '<br>' + 
                            'Categories: ' + str(r.categories) + '<br>',    
                        icon = folium.Icon(color='orange')).add_to(mp)
            
    if cf is not None:
        for _, r in cf.iterrows():
            folium.Marker(
                        location =[r.latitude, r.longitude], 
                        popup = html.escape(
                            r["name"]) + '<br>' + 
                            'Stars: ' + str(r.stars) + '<br>' + 
                            'Reviews: ' + str(r.review_count) + '<br>' + 
                            'Categories: ' + str(r.categories) + '<br>',    
                        icon = folium.Icon(color='red')).add_to(mp)
            
    return mp

<a id='geo_loc'></a>
# 2. Geolocalisation

[Back to menu](#menu)

In [14]:
from geopy.exc import GeocoderTimedOut, GeocoderServiceError, GeocoderUnavailable
from geopy.geocoders import Nominatim
from math import radians, cos, sin, asin, sqrt
import requests


class Geolocation:
    token = "5e599797dbecfc222d30063da4b86640"
    send_url = "http://api.ipstack.com/check?access_key=" + token

    def __init__(self, business_df):
        self.business_df = business_df
        self.location = {}
        self.distance_df = pd.DataFrame()
        self.geolocator = Nominatim(user_agent="Data_Dive_Prod")
        
    def normalize(self, df):
        dataNorm = ((df - df.min()) / (df.max() - df.min()))
        return dataNorm

    def reset(self):
        self.location = {}
        self.distance_df = pd.DataFrame()

    def haversine_distance(self, lat1, lon1, lat2, lon2):
        """
        Calculate the great circle distance between two points
        on the earth (specified in decimal degrees)
        """
        # convert decimal degrees to radians
        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

        # haversine formule
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        c = 2 * asin(sqrt(a))
        r = 6371  # Radius of earth in kilometers.
        res = c * r
        return np.round(res, 4)

    def show_current_location(self):
        logger.info("Current location: (Lat: %s, Long: %s)" % (self.location["latitude"], self.location["longitude"]))

    def get_current_location(self):
        return self.location

    def get_business_nearby(self):
        return self.distance_df

    def get_info_coordinate(self, coordinate):
        gps = str(coordinate["latitude"]) + ',' + str(coordinate["longitude"])
        location = self.geolocator.reverse(gps)
        self.location["city"] = location.raw["address"]["city"]
        self.location["country"] = location.raw["address"]["country"]
        self.show_current_location()

    def get_coordinate_address(self, address):
        location = None
        try:
            location = self.geolocator.geocode(address)
        except (GeocoderTimedOut, GeocoderServiceError, GeocoderUnavailable):
            time.sleep(1)
            try:
                location = self.geolocator.geocode(address)
            except (
                    GeocoderTimedOut, GeocoderServiceError,
                    GeocoderUnavailable):
                logger.info('GeocoderServiceError occored')
                return None, None

        logger.info(location)
        if location:
            self.location["latitude"] = location.latitude
            self.location["longitude"] = location.longitude
            coordinate = {"latitude": location.latitude,
                          "longitude": location.longitude}
            self.get_info_coordinate(coordinate=coordinate)

    def get_coordinate_ip_address(self):
        geo_req = requests.get(self.send_url)
        response = geo_req.json()
        self.location["latitude"] = float(response["latitude"])
        self.location["longitude"] = float(response["longitude"])
        self.location["city"] = response["city"]
        self.location["country"] = response["country_name"]
        self.show_current_location()

    def get_neighbors_recommend(self, lookup="", engine=True, rec_range=5):

        if engine:
            if not lookup:
                self.get_coordinate_ip_address()
            else:
                self.get_coordinate_address(lookup)
        else:
            self.location = lookup
            self.get_info_coordinate(coordinate=lookup)

        self.distance_df = pd.DataFrame()
        self.distance_df = self.business_df[["business_id", "longitude", "latitude"]]
        self.distance_df["distance"] = self.distance_df.apply(
            lambda x: self.haversine_distance(self.location["latitude"], self.location["longitude"], x["latitude"],
                                              x["longitude"]),
            axis=1)
        self.distance_df = self.distance_df.sort_values(ascending=True, by="distance").reset_index(drop=True)
        self.distance_df = self.distance_df[self.distance_df["distance"] <= rec_range][["business_id", "distance"]]

        # Normalization
        self.distance_df["geo_score"] = self.normalize(self.distance_df["distance"])
        self.distance_df["geo_score"] = 1 - self.distance_df["geo_score"]

In [15]:
class EngineGeo:

    def __init__(self, business_df):
        self.geoloc = Geolocation(business_df=business_df)
        
    def reset(self):
        self.geoloc.reset()

    def get_business_nearby(self):
        return self.geoloc.get_business_nearby()

    def get_current_location(self):
        return self.geoloc.get_current_location()

    def recommend(self, lookup="", engine=True, rec_range=5):
        self.geoloc.get_neighbors_recommend(lookup=lookup, engine=engine, rec_range=rec_range)

In [16]:
engine_geo = EngineGeo(business_df)

In [17]:
#engine_geo.get_coordinate_address("Las Vegas")

In [18]:
engine_geo.recommend(lookup="Las Vegas", engine=True)

INFO:__main__:Las Vegas, Clark County, Nevada, United States of America
INFO:__main__:Current location: (Lat: 36.1672559, Long: -115.1485163)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
INFO:numexpr.utils:NumExpr defaulting to 4 threads.


In [19]:
engine_geo.get_current_location()

{'latitude': 36.1672559,
 'longitude': -115.1485163,
 'city': 'Las Vegas',
 'country': 'United States of America'}

In [20]:
business_nearby = engine_geo.get_business_nearby()

In [21]:
business_nearby

Unnamed: 0,business_id,distance,geo_score
0,3fdtp-bzoE4ZgTakkcEBzQ,0.0218,1.000000
1,Vyadl8RsxaFaAFjm98lNTQ,0.0218,1.000000
2,VhazKK6zbHrdJaN-bmeHbQ,0.0361,0.997127
3,j5pWQfzFuJdYUXb-vKHgyA,0.1157,0.981137
4,jmketuCDahSV1-47orzdMg,0.1164,0.980997
...,...,...,...
5197,B19KKE75ZxqoR3EDuvA1qw,4.9921,0.001567
5198,cPtybPHvMvZygjLkgvM4GA,4.9930,0.001386
5199,FhqoXrpfw5ji_Qoh4GcJMg,4.9944,0.001105
5200,PZ3qgjUMg8akZaP0CssYNA,4.9997,0.000040


<a id='pop'></a>
# 3. Popularity Based

[Back to menu](#menu)

## 3.1 Methods

In [28]:
class Popularity:

    def __init__(self, business_df, ye):
        self.business_df = business_df
        self.ye = ye

    def get_model_name(self):
        return self.MODEL_NAME
    
    def normalize(self, df):
        dataNorm = ((df - df.min()) / (df.max() - df.min()))
        return dataNorm

    def recommend(self, top_n=50, filters={}, geo_w=0.5, pop_w=0.5):
        mask = np.array([])
        recommendations_df = pd.DataFrame()
        nearby_df = pd.DataFrame()
        if filters:
            if "nearby" in filters and not filters["nearby"].empty:
                nearby_df = filters["nearby"]
                nearby_ids = nearby_df["business_id"].values
                if mask.size == 0:
                    mask = nearby_ids
                else:
                    mask = np.intersect1d(mask, nearby_ids)

            if "categories" in filters and filters["categories"]:
                mask_cat = self.business_df[self.business_df["categories"].str.contains(filters["categories"])][
                    "business_id"].values
                mask = np.intersect1d(mask, mask_cat)

            # review_df = self.review_df[self.review_df["business_id"].isin(mask)]
            if mask.size > 0:
                mustArray = [
                    self.ye.bodyMultivalueTerm("business_id.keyword", np.unique(mask).tolist()),
                    self.ye.bodyRange("date", gteValue="2016-01-01", lteValue="2018-12-31")
                ]
                review_fisrt_chunk = self.ye.getComplexeQuery(index="yelp-review*",
                                                              mustArray=mustArray, filterArray=[],
                                                              exclude=["text", "@timestamp", "@version", "cool",
                                                                       "useful",
                                                                       "funny"], size=2000)

                print("Total reviews retrieved: %d" % (review_fisrt_chunk["hits"]["total"]["value"]))
                review_df = self.ye.getResultScrolling(review_fisrt_chunk)

                recommendations_df = review_df.groupby("business_id")["stars"] \
                    .agg(["sum", "count"]) \
                    .reset_index()

                recommendations_df["ratings_avg"] = (recommendations_df["sum"] / recommendations_df["count"])
                recommendations_df = recommendations_df \
                    .sort_values(ascending=False, by="ratings_avg") \
                
                #Normalization of popularity score
                recommendations_df["score"] = self.normalize(recommendations_df["ratings_avg"])
                

                if not nearby_df.empty:
                    recommendations_df = pd.merge(left=recommendations_df, right=nearby_df, how="inner",
                                                  on="business_id")
                    recommendations_df = recommendations_df.rename(
                        columns={"score": "pop_score"})

                    recommendations_df["geo_score"] = recommendations_df["geo_score"] * geo_w
                    recommendations_df["pop_score"] = recommendations_df["pop_score"] * pop_w
                    recommendations_df["score"] = recommendations_df["geo_score"] + recommendations_df["pop_score"]

                    recommendations_df = recommendations_df.sort_values(ascending=False, by="score")

                recommendations_df = recommendations_df.head(top_n)

        return recommendations_df

In [29]:
class EnginePopularity:

    def __init__(self, business_df, ye):
        self.pop = Popularity(business_df=business_df, ye=ye)

    def predict(self, top_n=50, filters={}):
        return self.pop.recommend(top_n=top_n, filters=filters)

## 3.2 Test

In [30]:
engine_pop = EnginePopularity(business_df, ye)

In [31]:
filters={}
filters["nearby"] = business_nearby
filters["categories"] = "Hotels"

In [32]:
rec_pop = engine_pop.predict(filters=filters, top_n=50)

INFO:elasticsearch:POST http://47.91.72.40:9200/yelp-review*/_search?_source_excludes=text%2C%40timestamp%2C%40version%2Ccool%2Cuseful%2Cfunny&_source_includes=&scroll=1m&size=2000 [status:200 request:5.004s]


Total reviews retrieved: 18844


INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:5.918s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:4.797s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:4.236s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:1.990s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.537s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.609s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.574s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.220s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.137s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 reques

In [33]:
rec_pop

Unnamed: 0,business_id,sum,count,ratings_avg,pop_score,distance,geo_score,score
38,fY0RCsymg465GQ7tmxLYog,84.0,17,4.941176,0.492647,0.4559,0.456399,0.949046
6,ff-bUaqzSnyAOzW_Qtd65A,65.0,13,5.0,0.5,0.5594,0.446003,0.946003
9,8gYkL76weP9dmd-W8NOeqw,10.0,2,5.0,0.5,0.5915,0.442779,0.942779
27,OIc8vVH1MTmObFeeoc0LFA,10.0,2,5.0,0.5,0.6687,0.435025,0.935025
14,ovFnrmsT5rUpbAQE1r-V0Q,70.0,14,5.0,0.5,0.6738,0.434513,0.934513
20,LklurWnK8agxFp6G-v1CMg,35.0,7,5.0,0.5,0.7311,0.428758,0.928758
19,A_lTCKAfKQKhUiOUyx8-3A,5.0,1,5.0,0.5,0.8015,0.421687,0.921687
13,fAU948kCaESzxAD_nF7rYw,5.0,1,5.0,0.5,0.835,0.418322,0.918322
1,uAX_EAU1r6J3JwPeX7uGrA,15.0,3,5.0,0.5,0.835,0.418322,0.918322
48,cKgkSMcPXwWTzPrJRpa2qw,128.0,27,4.740741,0.467593,0.835,0.418322,0.885915


**Merge business informations**

In [34]:
rec_pop = business_details(business_df, rec_pop)

In [35]:
rec_pop

Unnamed: 0,business_id,name,categories,address,city,state,latitude,longitude,stars,review_count,sum,count,ratings_avg,pop_score,distance,geo_score,score
0,j6D75GAJu6LdvRR1_OsxnA,Bridger Inn,"Hotels, Event Planning & Services, Hotels & Tr...",301 S Main St,Las Vegas,NV,36.169008,-115.147637,3.5,10,28.0,8,3.5,0.3125,0.2102,0.481077,0.793577
1,ovFnrmsT5rUpbAQE1r-V0Q,Segway Las Vegas,"Tours, Motorcycle Rental, Hotels & Travel, Act...",901 S Main St,Las Vegas,NV,36.161839,-115.151881,5.0,38,70.0,14,5.0,0.5,0.6738,0.434513,0.934513
2,ER7lT5gKBN-WpNkSIChQAQ,Enterprise Rent-A-Car,"Hotels & Travel, Car Rental",301 Fremont St,Las Vegas,NV,36.170154,-115.142871,3.0,16,41.0,13,3.153846,0.269231,0.6005,0.441875,0.711106
3,i42M5aLwhsCSRD0eWrp50g,Cowboy Trail Rides,"Hotels & Travel, Tours",1211 S Eastern Ave,Las Vegas,NV,36.156127,-115.118651,5.0,5,15.0,3,5.0,0.5,2.953,0.20559,0.70559
4,MYD64NGYbF0n7sQZ-I4o5g,Las Vegas Club Hotel & Casino,"Casinos, Hotels & Travel, Event Planning & Ser...",18 E Fremont St,Las Vegas,NV,36.172075,-115.145689,2.0,97,7.0,2,3.5,0.3125,0.593,0.442629,0.755129
5,EUWBT5GDxPC95w9itZ1EHw,Garden Court Buffet,"Restaurants, Hotels, Event Planning & Services...",200 N Main St,Las Vegas,NV,36.174097,-115.14511,3.5,556,811.0,235,3.451064,0.306383,0.8199,0.419839,0.726222
6,AZ4JC4-YOIWhOOOSo0AcMw,Taxi Service,"Taxis, Tours, Hotels & Travel, Wine Tours, Tra...",,Las Vegas,NV,36.169941,-115.13983,4.0,12,38.0,10,3.8,0.35,0.835,0.418322,0.768322
7,PWFB8WyJGaFBWfcrwcn9Lw,Premier Club Tours,"Adult Entertainment, Nightlife, Bars, Party Bu...",100 S Las Vegas Blvd,Las Vegas,NV,36.168946,-115.141032,4.5,47,137.0,32,4.28125,0.410156,0.6976,0.432123,0.842279
8,g83WbX_recywc4DEIZ-xug,Neon Museum,"Hotels & Travel, Shopping, Arts & Entertainmen...",770 North Las Vegas Blvd,Las Vegas,NV,36.176986,-115.135346,4.5,998,2549.0,597,4.269682,0.40871,1.6025,0.341235,0.749945
9,h_A7TFaKasHmrc9s_hhb5w,Place on 7th,"Hotels & Travel, Event Planning & Services, Ho...",115 7th St,Las Vegas,NV,36.168895,-115.13757,4.0,10,39.0,10,3.9,0.3625,0.9994,0.40181,0.76431


In [36]:
show_map(loc=engine_geo.get_current_location(), popularity=rec_pop)

<a id='content'></a>
# 4. Content Based Keyword

[Back to menu](#menu)

## 4.1 Methods

In [22]:
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import Word2Vec
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.metrics.pairwise import cosine_similarity

data_path = "/home/hongphuc95/notebookteam/dataset/"
api_path = "/home/hongphuc95/notebookteam/api/"

class ContentExtact:

    def __init__(self, ye, business_df):
        self.ye = ye
        self.business_df = business_df

    ##########################################################
    #                     Loading tools                      #
    ##########################################################
    def load(self):
        self.load_text_to_vec()
        self.load_word2vec()

    ##########################################################
    #                    Text Processing                     #
    ##########################################################

    def clean_text(self, text):
        all_stopwords = stopwords.words('english')
        text_tokens = word_tokenize(text.lower())
        tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]
        return tokens_without_sw

    def text_processing(self):
        logger.info("Cleaning text in progress...")
        start = time.time()
        review_df = pd.read_json(data_path + "cleaned/review_cleaned_2016_2019.json", lines=True)
        review_by_business = review_df.groupby('business_id')['text'].agg(lambda col: ' '.join(col)).reset_index()
        review_by_business["text"] = review_by_business["text"].apply(lambda x: self.clean_text(x))
        logger.info("Done cleaning text in %s seconds." % (time.time() - start))
        return review_by_business

    ##########################################################
    #                        Word2Vec                        #
    ##########################################################
    def train_word2vec(self, review_by_business=None, save=True):
        logger.info("Training Word2Vec Model in progress...")
        start = time.time()
        model = gensim.models.Word2Vec(review_by_business["text"], min_count=5, size=200, workers=4)
        logger.info("Done Word2Vec text in %s seconds." % (time.time() - start))
        self.model = model
        if save:
            self.save_word2vec(self.model)

    def save_word2vec(self, model):
        model.save(api_path + "trained/review_full.model")

    def load_word2vec(self):
        try:
            self.model = Word2Vec.load(api_path + "trained/review_full.model")
            logger.info("Reviews vectorized loaded")
        except (FileNotFoundError, IOError):
            self.model = None
            logger.info("File not found")

    ##########################################################
    #                        TextToVec                       #
    ##########################################################

    def text_to_vec(self, save=True):
        logger.info("Convert texts to vectors in progress...")
        start = time.time()
        self.load_word2vec()
        review_by_business = self.text_processing()

        if not self.model:
            self.train_word2vec(review_by_business=review_by_business, save=True)

        # Clean DF
        review_by_business["text_vec"] = review_by_business["text"].apply(
            lambda x: self.avg_feature_vector(x, model=self.model, n_features=200))

        review_by_business.drop("text", axis=1, inplace=True)
        if save:
            self.save_text_to_vec(review_by_business)
        self.docvecs = review_by_business
        logger.info("Done converting texts to vectors in %s seconds." % (time.time() - start))


    def save_text_to_vec(self, review_by_business):
        with open(api_path + "trained/review_2016_2019_full_vectorized.pickle", "wb") as f:
            pickle.dump(review_by_business, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load_text_to_vec(self):
        try:
            with open(api_path + "trained/review_2016_2019_full_vectorized.pickle", "rb") as f:
                self.docvecs = pickle.load(f)
                logger.info("Reviews vectorized loaded")
        except (FileNotFoundError, IOError):
            self.text_to_vec()

    def avg_feature_vector(self, sentence, model, n_features):
        index2word_set = set(model.wv.index2word)
        feature_vec = np.zeros((n_features,), dtype='float32')
        n_words = 0
        for word in sentence:
            if word in index2word_set:
                n_words += 1
                feature_vec = np.add(feature_vec, model.wv[word])
        if (n_words > 0):
            feature_vec = np.divide(feature_vec, n_words)
        return feature_vec

    ##########################################################
    #                        Recommend                       #
    ##########################################################

    def keyword_recommend(self, input_str, top_n=20, filters={}, content_w=0.8, geo_w=0.2):
        docvecs = self.docvecs
        nearby_df = pd.DataFrame()

        if filters:
            mask = np.array([])
            if "nearby" in filters and not filters["nearby"].empty:
                nearby_df = filters["nearby"]
                nearby_ids = nearby_df["business_id"].values
                if mask.size == 0:
                    mask = nearby_ids
                else:
                    mask = np.intersect1d(mask, nearby_ids)

            if "categories" in filters and filters["categories"]:
                mask_cat = self.business_df[self.business_df["categories"].str.contains(filters["categories"])][
                    "business_id"].values
                if mask.size == 0:
                    mask = mask_cat
                else:
                    mask = np.intersect1d(mask, mask_cat)

            docvecs = docvecs[docvecs["business_id"].isin(mask)]

        from nltk.tokenize import word_tokenize
        input_vec = pd.DataFrame({"text": [input_str]})
        input_vec["text"] = input_vec["text"].apply(lambda x: word_tokenize(x.lower()))
        input_vec["text_vec"] = input_vec["text"].apply(
            lambda x: self.avg_feature_vector(x, model=self.model, n_features=200))
        business_similarity = docvecs[["business_id", "text_vec"]]

        # compute similarity array
        business_similarity["score"] = business_similarity["text_vec"].apply(
            lambda x: cosine_similarity([x], [input_vec["text_vec"].values[0]])[0][0])
        business_similarity = business_similarity.drop(columns=["text_vec"]).sort_values(ascending=False,
                                                                                         by="score")
        print("Shape business similarity before: %s" % (business_similarity.shape))
        if "nearby" in filters and not filters["nearby"].empty:
            business_similarity = business_similarity.rename(columns={"score": "content_score"})
            business_similarity = pd.merge(left=business_similarity, right=nearby_df, how="inner", on="business_id")
            business_similarity["score"] = content_w * business_similarity["content_score"] + geo_w * business_similarity["geo_score"]
            business_similarity = business_similarity.sort_values(by="score", ascending=False)            
            print("Shape business similarity after: %s" % (business_similarity))
            
        business_similarity = business_similarity.head(top_n)
        return business_similarity

    def business_similarity(self, business_ids, top_n, filters={}, content_w=0.8, geo_w=0.2):

        docvecs = self.docvecs
        businesses_similarity = pd.DataFrame()
        nearby_df = pd.DataFrame()

        mask = np.array([])
        if filters:
            if "nearby" in filters and not filters["nearby"].empty:
                nearby_df = filters["nearby"]
                nearby_ids = nearby_df["business_id"].values
                if mask.size == 0:
                    mask = nearby_ids
                else:
                    mask = np.intersect1d(mask, nearby_ids)

            if "categories" in filters and filters["categories"]:
                mask_cat = self.business_df[self.business_df["categories"].str.contains(filters["categories"])][
                    "business_id"].values
                business_ids = np.intersect1d(business_ids, mask_cat)
                print("hey")
                print(business_ids)
                if mask.size == 0:
                    mask = mask_cat
                else:
                    mask = np.intersect1d(mask, mask_cat)

        # Filter businesses in the area
        print("Mask size : %s" % (mask.size))
        if mask.size > 0:
            mask = np.append(mask, business_ids)
            docvecs = docvecs[docvecs["business_id"].isin(mask)]
        elif mask.size == 0:
            mask = business_ids
            docvecs = docvecs[docvecs["business_id"].isin(mask)]
        
        #When user has none of the review of the category that we're looking for
        if business_ids.size > 0:
            business_ids = business_ids[:5]

            for business_id in business_ids:
                business_similarity = pd.DataFrame()
                input_vec = docvecs[docvecs["business_id"] == business_id]
                business_similarity = docvecs[["business_id", "text_vec"]]

                # compute similarity array
                business_similarity["score"] = business_similarity["text_vec"].apply(
                    lambda x: cosine_similarity([x], [input_vec["text_vec"].values[0]])[0][0])
                business_similarity = business_similarity.drop(columns=["text_vec"]).sort_values(ascending=False,
                                                                                                 by="score")
                # Filter business with the same id
                business_similarity = business_similarity[~business_similarity["business_id"].isin(business_ids)]
                business_similarity["input_business_id"] = business_id

                if "nearby" in filters and not filters["nearby"].empty:
                    business_similarity = business_similarity.rename(columns={"score": "content_score"})
                    business_similarity = pd.merge(left=business_similarity, right=nearby_df, how="inner", on="business_id")
                    business_similarity["score"] = content_w * business_similarity["content_score"] + geo_w * \
                                                   business_similarity["geo_score"]
                    business_similarity = business_similarity.sort_values(by="score", ascending=False)
                business_similarity = business_similarity.head(top_n)

                businesses_similarity = businesses_similarity.append(business_similarity)

        return businesses_similarity

    def content_recommend(self, user_id, top_n, filters={}, content_w=0.8, geo_w=0.2):
        docvecs = self.docvecs

        mustArray = [
            self.ye.bodySingleMatch("user_id", user_id),
            self.ye.bodyRange("date", gteValue="2016-01-01", lteValue="2018-12-31"),
            self.ye.bodyRange("stars", gteValue="3")
        ]
        review_fisrt_chunk = self.ye.getComplexeQuery(index="yelp-review*",
                                                 mustArray=mustArray, filterArray=[],
                                                 exclude=["text", "@timestamp", "@version", "cool", "useful",
                                                          "funny"], size=2000)


        print("Total reviews retrieved: %d" % (review_fisrt_chunk["hits"]["total"]["value"]))
        review_df = self.ye.getResultScrolling(review_fisrt_chunk)
        review_df = review_df.sort_values(by="date", ascending=False)
        #review_df = alreadyreviwed(user_id=user_id, review_df=self.review_df)
        #review_df = review_df[review_df["stars"] >= 3].sort_values(by="date", ascending=False)

        # print("Businesses previously reviewed by user: ")
        # already_reviewed = business_details(self.business_df, review_df)
        # print(already_reviewed[["name", "business_id", "categories"]])

        business_reviewed_ids = review_df["business_id"].unique()
        sim_business_df = self.business_similarity(business_reviewed_ids, top_n=top_n, filters=filters, content_w=content_w, geo_w=geo_w)
        return business_reviewed_ids, sim_business_df

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hongphuc95/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
class EngineCB:

    def __init__(self, ye, business_df):
        logger.info("Initilizing Content Based Engine")
        self.content = ContentExtact(ye=ye, business_df=business_df)

    def load(self):
        self.content.load()

    def train_word2vec(self):
        self.train_word2vec()

    def train_texttovec(self):
        self.content.text_to_vec()

    def keyword_recommend(self, input_str, top_n=10, filters={}):
        return self.content.keyword_recommend(input_str=input_str, top_n=top_n, filters=filters)

    def content_recommend(self, user_id, top_n=10, filters={}):
        return self.content.content_recommend(user_id=user_id, top_n=top_n, filters=filters)

## 4.2 Test

In [24]:
#hm = engine_cb.content_recommend(user_id="EC5nxNCWCmjHg1F14WrlxQ", nearby_businesses=business_ids_nearby, top_n=10)

In [25]:
filters={}
filters["nearby"] = business_nearby
filters["categories"] = "Tea"

In [26]:
engine_cb = EngineCB(ye=ye, business_df=business_df)

INFO:__main__:Initilizing Content Based Engine


In [27]:
engine_cb.load()

INFO:__main__:Reviews vectorized loaded
INFO:gensim.utils:loading Word2Vec object from /home/hongphuc95/notebookteam/api/trained/review_full.model
INFO:gensim.utils:loading wv recursively from /home/hongphuc95/notebookteam/api/trained/review_full.model.wv.* with mmap=None
INFO:gensim.utils:loading vectors from /home/hongphuc95/notebookteam/api/trained/review_full.model.wv.vectors.npy with mmap=None
INFO:gensim.utils:setting ignored attribute vectors_norm to None
INFO:gensim.utils:loading vocabulary recursively from /home/hongphuc95/notebookteam/api/trained/review_full.model.vocabulary.* with mmap=None
INFO:gensim.utils:loading trainables recursively from /home/hongphuc95/notebookteam/api/trained/review_full.model.trainables.* with mmap=None
INFO:gensim.utils:loading syn1neg from /home/hongphuc95/notebookteam/api/trained/review_full.model.trainables.syn1neg.npy with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:loaded /home/hongphuc95/notebook

In [28]:
#rec_cb = engine_cb.keyword_recommend(input_str="tacos", top_n=20, filters=filters)

In [29]:
reviewed_df, rec_cb = engine_cb.content_recommend(user_id="EC5nxNCWCmjHg1F14WrlxQ", top_n=20, filters=filters)

INFO:elasticsearch:POST http://47.91.72.40:9200/yelp-review*/_search?_source_excludes=text%2C%40timestamp%2C%40version%2Ccool%2Cuseful%2Cfunny&_source_includes=&scroll=1m&size=2000 [status:200 request:0.121s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.014s]


Total reviews retrieved: 44
hey
['DYAorbxOyubUB_wtQRCdug' 'eS76100l3h7Ollb5s3_M4A'
 'hIUKufhwR6Ifn7bi0-phLA' 'hoskgfXQRZXc9HV48nVFvg'
 's_Qrhr4aJnRX-m44DxYl_Q' 'yooVq4aUUthr7DOQiZ7yEA']
Mask size : 166


In [30]:
rec_cb[rec_cb["input_business_id"] == "DYAorbxOyubUB_wtQRCdug"]

Unnamed: 0,business_id,content_score,input_business_id,distance,geo_score,score
10,5iHctUjkQTGwEvOaBkwMRQ,0.934026,DYAorbxOyubUB_wtQRCdug,0.2408,0.956007,0.938422
2,Xspg78dOvuedvuPEUwZBjw,0.961518,DYAorbxOyubUB_wtQRCdug,0.8279,0.838071,0.936829
16,xnVkYE3iMp_aZniiCIuD0g,0.921127,DYAorbxOyubUB_wtQRCdug,0.4393,0.916133,0.920128
3,CoyeXg8FBsS_d20QzNIy-A,0.954577,DYAorbxOyubUB_wtQRCdug,1.1497,0.773428,0.918347
6,4Nj2ktP2NjCbv2aKeH3ZBg,0.950618,DYAorbxOyubUB_wtQRCdug,1.0789,0.78765,0.918025
9,be_HDTc0Gy6dKWoWq_o7UQ,0.934522,DYAorbxOyubUB_wtQRCdug,0.7659,0.850525,0.917723
12,0WPXDs4ini3D7jpiy1Daqw,0.931912,DYAorbxOyubUB_wtQRCdug,0.8504,0.833551,0.91224
19,L3dAvYkqMsLhyPLoeJgwsQ,0.916128,DYAorbxOyubUB_wtQRCdug,0.58,0.887869,0.910476
5,0EgYXYjt2XJL4hlsKnzrcw,0.950762,DYAorbxOyubUB_wtQRCdug,1.4074,0.721661,0.904942
15,5B_Uo-PCaWu9Jio9BWHKXw,0.921425,DYAorbxOyubUB_wtQRCdug,0.9137,0.820835,0.901307


In [172]:
rec_cb = business_details(business_df, rec_cb)

In [173]:
rec_cb.head(40)

Unnamed: 0,business_id,name,categories,address,city,state,latitude,longitude,stars,review_count,content_score,input_business_id,distance,geo_score,score
0,K4z0ODOZNuN2uRnL4IwApw,Starbucks,"Food, Coffee & Tea","129 East Fremont Street, Golden Nugget Las Veg...",Las Vegas,NV,36.170871,-115.14492,3.0,79,0.8605,DYAorbxOyubUB_wtQRCdug,0.5156,0.900806,0.868561
1,K4z0ODOZNuN2uRnL4IwApw,Starbucks,"Food, Coffee & Tea","129 East Fremont Street, Golden Nugget Las Veg...",Las Vegas,NV,36.170871,-115.14492,3.0,79,0.841054,eS76100l3h7Ollb5s3_M4A,0.5156,0.900806,0.853004
2,K4z0ODOZNuN2uRnL4IwApw,Starbucks,"Food, Coffee & Tea","129 East Fremont Street, Golden Nugget Las Veg...",Las Vegas,NV,36.170871,-115.14492,3.0,79,0.897118,s_Qrhr4aJnRX-m44DxYl_Q,0.5156,0.900806,0.897855
3,fOq7DDqqsVm2XOFSL925XA,Starbucks,"Food, Coffee & Tea",122 East Clark Ave,Las Vegas,NV,36.165913,-115.14752,3.5,33,0.836012,s_Qrhr4aJnRX-m44DxYl_Q,0.1741,0.969406,0.862691
4,IDBG5UhyqWU4YqB1MZZtNg,Sambalatte,"Coffee & Tea, Food, Coffee Roasteries",100 N City Pkwy,Las Vegas,NV,36.173917,-115.148725,4.0,6,0.894455,DYAorbxOyubUB_wtQRCdug,0.7409,0.855547,0.886673
5,IDBG5UhyqWU4YqB1MZZtNg,Sambalatte,"Coffee & Tea, Food, Coffee Roasteries",100 N City Pkwy,Las Vegas,NV,36.173917,-115.148725,4.0,6,0.888969,eS76100l3h7Ollb5s3_M4A,0.7409,0.855547,0.882285
6,IDBG5UhyqWU4YqB1MZZtNg,Sambalatte,"Coffee & Tea, Food, Coffee Roasteries",100 N City Pkwy,Las Vegas,NV,36.173917,-115.148725,4.0,6,0.894253,s_Qrhr4aJnRX-m44DxYl_Q,0.7409,0.855547,0.886512
7,l4E8kiB13FWV1p_rGYJLig,Bao Now,"Food, Coffee & Tea, Restaurants, Dim Sum, Chinese",300 W Sahara Ave,Las Vegas,NV,36.144152,-115.160106,3.5,40,0.885195,hoskgfXQRZXc9HV48nVFvg,2.7717,0.4476,0.797676
8,6UI3dCIY6eM1eBOsnEVBuw,The Perq,"Bakeries, Restaurants, Nightlife, Dance Clubs,...",2535 S Las Vegas Blvd,Las Vegas,NV,36.142369,-115.156034,3.5,104,0.975088,DYAorbxOyubUB_wtQRCdug,2.8484,0.432193,0.866509
9,6UI3dCIY6eM1eBOsnEVBuw,The Perq,"Bakeries, Restaurants, Nightlife, Dance Clubs,...",2535 S Las Vegas Blvd,Las Vegas,NV,36.142369,-115.156034,3.5,104,0.983688,eS76100l3h7Ollb5s3_M4A,2.8484,0.432193,0.873389


In [None]:
show_map(loc=simulation_loc, contentbased=rec_content)

<a id='cf'></a>
# 5. Collaorative Filtering

[Back to menu](#menu)

## 5.1 Model Based SVD
<a id='cf_model'></a>

In [22]:
review_df = pd.read_json(data_path + "cleaned/review_cleaned_2016_2019_Vegas.json", lines=True)

In [23]:
#review_df = pd.read_csv("/home/hongphuc95/notebookteam/fu/review.csv")

In [24]:
from scipy import sparse
from scipy.sparse.linalg import svds
from sklearn.preprocessing import LabelEncoder

class CustomSVD:
    def __init__(self, review_df):
        self.user_active_df = self.get_active_user(review_df)

    def get_active_user(self, review_df):
        n_active = 30
        user_review_df_count = review_df.groupby("user_id").size()
        user_review_active_values = user_review_df_count[user_review_df_count >= n_active].reset_index()[
            "user_id"].values
        user_active_df = review_df[review_df["user_id"].isin(user_review_active_values)]
        user_active_df = user_active_df.groupby(["user_id", "business_id"], as_index=False).mean()
        return user_active_df

    def create_utility_matrix(self, df):
        user_encoder = LabelEncoder()
        business_encoder = LabelEncoder()

        user_ids = pd.DataFrame(columns=["user_id_matrix"])
        business_ids = pd.DataFrame(columns=["business_id_matrix"])
        df["user_id_matrix"] = user_encoder.fit_transform(df['user_id'])
        df["business_id_matrix"] = business_encoder.fit_transform(df['business_id'])

        user_ids["user_id_matrix"] = df["user_id_matrix"].unique()
        business_ids["business_id_matrix"] = df["business_id_matrix"].unique()
        user_ids["user_id"] = user_encoder.inverse_transform(user_ids["user_id_matrix"])
        business_ids["business_id"] = business_encoder.inverse_transform(business_ids["business_id_matrix"])

        return df, user_ids, business_ids

    def create_sparse_matrix(self, user_active_matrix, user_ids, business_ids):
        highest_user_id = len(user_ids['user_id'].unique())
        highest_business_id = len(business_ids['business_id'].unique())
        shape_matrix = (highest_user_id, highest_business_id)
        ratings_mat = sparse.lil_matrix(shape_matrix)
        for i, row in user_active_matrix.iterrows():
            ratings_mat[row["user_id_matrix"], row["business_id_matrix"]] = row["stars"]
        return ratings_mat

    def svd_pred(self, ratings_mat, n_factor=15):
        U, sigma, Vt = svds(ratings_mat, k=n_factor)
        sigma = np.diag(sigma)
        all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)

        # Normalization
        all_user_predicted_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (
                all_user_predicted_ratings.max() - all_user_predicted_ratings.min())
        return all_user_predicted_ratings_norm

    def fit(self, n_factor=15):
        logger.info("Start building matrix")
        start = time.time()
        self.cf_preds_df = None

        user_active_matrix, self.user_ids, self.business_ids = self.create_utility_matrix(self.user_active_df)

        # self.cf_preds_df = None
        rating_mat = self.create_sparse_matrix(user_active_matrix, self.user_ids, self.business_ids)
        self.cf_preds_df = self.svd_pred(ratings_mat=rating_mat, n_factor=n_factor)
        logger.info("Matrix built in %s seconds." % (time.time() - start))

    def transform(self, user_id, filters={}, cf_w=0.8, geo_w=0.2, topn=1000):
        logger.info("Start predicting")
        start = time.time()
        recommendations_df = pd.DataFrame()

        user_id_num = self.user_ids[self.user_ids["user_id"] == user_id].user_id_matrix.values
        if user_id_num.size > 0:
            user_id_num = user_id_num.astype(int)[0]
            pred_user = self.cf_preds_df[user_id_num, :]
            sorted_user_predictions = pd.DataFrame(pred_user, columns=["score"])
            sorted_user_predictions = pd.merge(sorted_user_predictions, self.business_ids, left_index=True,
                                               right_on="business_id_matrix") \
                .drop(columns=["business_id_matrix"]) \
                .sort_values(ascending=False, by="score") \
                .reset_index(drop=True)

            items_to_ignore = self.user_active_df[self.user_active_df["user_id"] == user_id]
            items_to_ignore = items_to_ignore["business_id"].unique()

            # Recommend the highest predicted rating movies that the user hasn't seen yet.
            recommendations_df = sorted_user_predictions[~sorted_user_predictions['business_id'].isin(items_to_ignore)] \
                .sort_values('score', ascending=False)
        
                        
            if "nearby" in filters and not filters["nearby"].empty:
                nearby_df = filters["nearby"]
                recommendations_df = pd.merge(left=recommendations_df, right=nearby_df, how="inner", on="business_id")
                if not recommendations_df.empty:
                    recommendations_df = recommendations_df.rename(columns={"score": "cf_score"})
                    recommendations_df["score"] = cf_w * recommendations_df["cf_score"] + geo_w * recommendations_df["geo_score"]
                    recommendations_df = recommendations_df.sort_values(by="score", ascending=False)            
                
            recommendations_df = recommendations_df.head(topn)

            logger.info("Prediction done in %s seconds." % (time.time() - start))
            return items_to_ignore, recommendations_df
        else:
            logger.info("This user %s is not exist" % (user_id))
            return np.array([]), recommendations_df

    def rmse(self, true, pred):
        # this will be used towards the end
        x = true - pred
        return sum([xi * xi for xi in x]) / len(x)

In [25]:
class EngineCF:

    def __init__(self, business_df, review_df, ye):
        logger.info("Initilizing Collaborative Filtering Engine")
        self.business_df = business_df
        self.ye = ye
        self.model = CustomSVD(review_df=review_df)

    def train(self, n_factor=15):
        #user_active_df = self.init_user_df()
        self.model.fit(n_factor=n_factor)

    def predict(self, user_id, filters={}, topn=1000):
        return self.model.transform(user_id=user_id, filters=filters, topn=topn)

    def save_model(self):
        with open(api_path + "models/customsvd.model", "wb") as f:
            pickle.dump(self.model, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load_model(self):
        try:
            with open(api_path + "models/customsvd.model", "rb") as f:
                self.model = pickle.load(f)
                logger.info("Custom SVD model loaded")
        except (FileNotFoundError, IOError):
            logger.info("File not found")

In [26]:
filters={}
filters["nearby"] = business_nearby

In [27]:
#train, test = train_test_split(user_active_df, test_size = 0.2)

In [28]:
engine_cf = EngineCF(business_df=business_df, review_df=review_df, ye=ye)

INFO:__main__:Initilizing Collaborative Filtering Engine


In [29]:
engine_cf.train()

INFO:__main__:Start building matrix
INFO:__main__:Matrix built in 21.435007572174072 seconds.


In [30]:
# TODO Test --2vR0DIsmQ6WfcSzKWigw

In [31]:
engine_cf.predict(user_id="uU2B1vrqkpWJI30xoooabw", filters=filters, topn=50)

INFO:__main__:Start predicting
INFO:__main__:Prediction done in 0.06001734733581543 seconds.


(array(['-NR4KqS6lHseNvJ-GFzfMA', '0VjHFdczi6Nln_nn8bucJQ',
        '31cOttU_tML5b6eLLXZfsQ', '3fT1kcQ-MVEImGHa3hll5w',
        '6e0Abng1LRexKI4LRJTX_A', '7sb2FYLS2sejZKxRYF9mtg',
        '9AnvV8V-UvA_rFhMCr_Dlw', 'A0X1baHPgw9IiBRivu0G9g',
        'C2tt_-1YE4X2T7azXEF7IQ', 'D6K3VT6S8FwFm4u5uN8T8g',
        'EZ4TljJvGenxrkM4JsqtZg', 'Gml97gSULiA6HFjie01d3Q',
        'IZivKqtHyz4-ts8KsnvMrA', 'KDdu9HDswaJYExaGDuzRjQ',
        'KkgnBZj3jRVjCY2H49JRaw', 'L6RxkP8dneVGelDLUhjCug',
        'LIU7IcJtD9VieoIo__wd9Q', 'LPMZ9N1sAjs2nDx7DmiZ2w',
        'NEZCr0zLIPNYWNF57KsJJQ', 'NLEe-RzDSU-5BN6xp_WWCw',
        'NiQ_3tXqkoe6GaQ5w2sHaQ', 'Os1n1_idfw9vv9kwULGJnQ',
        'Q-dVQgyof9paXKJLwFsn3Q', 'QCe_OTC_9Vf5-oQuflQH7Q',
        'QUxY5s-cmQYgUPd2Dc_sOw', 'Rv1IiQaIIDunqzS8dSvHCw',
        'S_5VQYMMa4aD3NzgYLkvhg', 'SqxIx0KbTmCvUlOfkjamew',
        'TEpHybMZew9kKadvseEbEQ', 'TtqvXhjmXXd0nhibRAbpRg',
        'VD0zZKG_ZEz_VYlKLtu9Zg', 'WrHu03srbhJAw6bnTIixdw',
        'ZXfMiPsPv-cH7GMQ-5-mhA', 'ZrS5r

## 5.2 Memory Based Friendlist
<a id='cf_memory'></a>

In [75]:
class FriendSim:

    def __init__(self, ye):
        self.ye = ye
        
    def find_n_neighbours(self, df, nrows, n):
        if n>nrows:
            n = nrows
        order = np.argsort(df.values, axis=1)[:, :n]
        df = df.apply(lambda x: pd.Series(x.sort_values(ascending=False)
               .iloc[:n].index, 
              index=['top{}'.format(i) for i in range(1, n+1)]), axis=1)
        return df

    def standardize(self, row):
        new_row = (row - row.mean()) / (row.max() - row.min())
        return new_row

    def get_active_user(self, review_df, n_active=6):
        user_review_df_count = review_df.groupby("user_id").size()
        user_review_active_values = user_review_df_count[user_review_df_count >= n_active].reset_index()[
            "user_id"].values
        user_active_df = review_df[review_df["user_id"].isin(user_review_active_values)]
        user_active_df = user_active_df.groupby(["user_id", "business_id"], as_index=False).mean()
        return user_active_df

    def recommend(self, user_id):

        #Get friend of the user in the parameter
        mustArray = [
            self.ye.bodySingleMatch("user_id", user_id)
        ]
        include_list = ["name", "user_id", "friends", "yelping_since", "review_count",
                        "average_stars", "elite"]
        user_fisrt_chunk = self.ye.getComplexeQuery(index="yelp-user*",
                                                 mustArray=mustArray, filterArray=[],
                                                 include = include_list, size=2000)

        print("Total reviews retrieved: %d" % (user_fisrt_chunk["hits"]["total"]["value"]))
        user_df = self.ye.getResultScrolling(user_fisrt_chunk)
        friends_str = user_df["friends"].values[0]
        friends_list = np.unique(friends_str.split(", "))
        print("This user has %d friends" % (len(friends_list)))

        if friends_list.size > 0:
            friends_list = np.append(friends_list, np.array(user_id))
            mustArray = [
                self.ye.bodyMultivalueTerm("user_id.keyword", friends_list.tolist()),
                self.ye.bodyRange("date", gteValue="2016-01-01", lteValue="2018-12-31")

            ]
            review_fisrt_chunk = self.ye.getComplexeQuery(index="yelp-review*",
                                                     mustArray=mustArray, filterArray=[],
                                                     include=["user_id", "business_id", "stars"], size=2000)

            review_df = self.ye.getResultScrolling(review_fisrt_chunk)
            user_active_df = self.get_active_user(review_df=review_df)
            
            #check if user is still in user_active_df (cold start)
            if ( any(user_active_df.user_id== user_id) == False):
                return 1
            
            #retrieve all businesses seen by user to not recommend them later
            businesses_seen_by_user = user_active_df[user_active_df.user_id == user_id].business_id.unique()
            
            
            #************ old method *****************
            Mean = user_active_df.groupby(by="user_id",as_index=False)['stars'].mean()
            Mean.rename(columns={'stars': 'mean'}, inplace=True)
            Rating_avg = pd.merge(user_active_df,Mean,on='user_id')
            Rating_avg['adg_rating']=Rating_avg['stars']-Rating_avg['mean']
            
            #Building matrix by pivot table
            rating_mat = pd.pivot_table(
                data=Rating_avg,
                index="user_id",
                columns="business_id",
                values="stars",
                fill_value=0)
            
            rating_standardized = rating_mat.fillna(rating_mat.mean(axis=0))

#             #********** new method ****************
#             #Building matrix by pivot table
#             rating_mat = pd.pivot_table(
#                 data=user_active_df,
#                 index="user_id",
#                 columns="business_id",
#                 values="stars",
#                 fill_value=0)
#             rating_standardized = rating_mat.apply(self.standardize)
            
            nrows = rating_standardized.shape[0]
            item_similarity = cosine_similarity(rating_standardized)
            np.fill_diagonal(item_similarity, 0)
            item_similarity_df = pd.DataFrame(item_similarity, index=rating_mat.index, columns=rating_mat.index)
            sim_user_30_m = self.find_n_neighbours(item_similarity_df, nrows,30)
            
            # ******* justify user similarities **********
            neighbors_ids = sim_user_30_m[(sim_user_30_m.index.get_level_values('user_id') == user_id)].values.tolist()[0]
            justifications = pd.DataFrame()   
            for i in range (0,len(neighbors_ids)):
                common_restaurants = Rating_avg[Rating_avg.user_id == user_id].merge(Rating_avg[Rating_avg.user_id == neighbors_ids[i]],
                on = "business_id",
                how = "inner" )
                justifications = pd.concat([justifications, common_restaurants])
#             justifications = justifications.rename(columns = {'user_id_x':'user','user_id_y':'neighbor_id',
#                                                              'star_x':'stars_user', 'stars_y':'stars_neighbor'})
            test = justifications.groupby(by=["user_id_x","user_id_y"],as_index=False)['adg_rating_x','adg_rating_y'].mean()
            test.rename(columns={'adg_rating_x': 'user_avg','adg_rating_y': 'neighbor_avg'}, inplace=True)
            test['user_avg'] = test['user_avg'].map(lambda x: round(x,3))
            test['neighbor_avg'] = test['neighbor_avg'].map(lambda x: round(x,3))
            test['diff'] = abs(test['user_avg'] - test['neighbor_avg'])
            test = test.sort_values(by='diff',ascending=True)
            
            
            ###computing scores for each unseen business by user 
            business_user = Rating_avg.groupby(by = 'user_id')['business_id'].apply(lambda x:','.join(x))
            a = sim_user_30_m[sim_user_30_m.index==user_id].values
            b = a.squeeze().tolist()
            d = business_user[business_user.index.isin(b)]
            l = ','.join(d.values)
            businesses_seen_by_similar_users = l.split(',')
            businesses_under_consideration = list(set(businesses_seen_by_similar_users)-set(list(map(str, businesses_seen_by_user))))
            #businesses_under_consideration = list(map(int, businesses_under_consideration))
            score = []
            for item in businesses_under_consideration:
                c = rating_standardized.loc[:,item]
                d = c[c.index.isin(b)]
                f = d[d.notnull()]
                avg_user = Mean.loc[Mean['user_id'] == user_id,'mean'].values[0]
                index = f.index.values.squeeze().tolist()
                corr = item_similarity_df.loc[user_id,index]
                fin = pd.concat([f, corr], axis=1)
                fin.columns = ['adg_score','correlation']
                fin['score']=fin.apply(lambda x:x['adg_score'] * x['correlation'],axis=1)
                nume = fin['score'].sum()
                deno = fin['correlation'].sum()
                final_score = avg_user + (nume/deno)
                score.append(final_score)
            data = pd.DataFrame({'business_id':businesses_under_consideration,'score':score})
            top_recommendation = data.sort_values(by='score',ascending=False)
            ###noralize score between 0 and 1
            top_recommendation = top_recommendation.apply(lambda x: (x-min(x))/(max(x)-min(x)) if x.name == 'score' else x)
            return test

In [76]:
fs = FriendSim(ye=ye)

In [77]:
neighbors = fs.recommend(user_id="AyjqBovADgbskmLrIBOMlQ") #AyjqBovADgbskmLrIBOMlQ")
# pd_pvt = test[(test.index.get_level_values('user_id') == "AyjqBovADgbskmLrIBOMlQ")]
# pd_pvt
neighbors

INFO:elasticsearch:POST http://47.91.72.40:9200/yelp-user*/_search?_source_excludes=&_source_includes=name%2Cuser_id%2Cfriends%2Cyelping_since%2Creview_count%2Caverage_stars%2Celite&scroll=1m&size=2000 [status:200 request:0.016s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.014s]


Total reviews retrieved: 1
This user has 506 friends


INFO:elasticsearch:POST http://47.91.72.40:9200/yelp-review*/_search?_source_excludes=&_source_includes=user_id%2Cbusiness_id%2Cstars&scroll=1m&size=2000 [status:200 request:0.201s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.141s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.144s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.141s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.144s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.140s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.033s]


Unnamed: 0,user_id_x,user_id_y,user_avg,neighbor_avg,diff
1,AyjqBovADgbskmLrIBOMlQ,60skTN6p8SdQ2sGNKL0vRQ,-0.002,0.044,0.046
22,AyjqBovADgbskmLrIBOMlQ,cg2P244yON3-_GXWkgAgsw,0.205,0.152,0.053
3,AyjqBovADgbskmLrIBOMlQ,8XKSiMpMxgdwI3qjuqmXFg,0.071,0.018,0.053
23,AyjqBovADgbskmLrIBOMlQ,fj-cFMD2qheCcv3FNvMcEg,-0.18,-0.12,0.06
27,AyjqBovADgbskmLrIBOMlQ,ve_Hbp67wIvWFNEiwV2vuw,0.161,0.1,0.061
9,AyjqBovADgbskmLrIBOMlQ,JtpZw-NLL5m6BNjd0LPQVQ,-0.128,-0.058,0.07
24,AyjqBovADgbskmLrIBOMlQ,hX0-0jfbXUNUTzyGtg090g,-0.017,0.087,0.104
11,AyjqBovADgbskmLrIBOMlQ,NhgU7RhuYYFmpkb1jlYJ6Q,0.142,0.246,0.104
16,AyjqBovADgbskmLrIBOMlQ,Yv7VHEZWLMmcBDvIlLh69w,0.15,0.039,0.111
28,AyjqBovADgbskmLrIBOMlQ,yyDp7MZ2st7p0fOQuFYpcA,0.17,0.283,0.113


In [16]:
test

Unnamed: 0,business_id,score
1907,L1-1P3acJc4gEFvWwjXcNQ,1.000000
1944,beuVp5CZxCdNvQIIPBS2rw,0.964212
1053,sZIVzaaEBp_HiYutZ2lWag,0.959333
1103,po0p6NIro0cDrmKkcyPy0w,0.941904
1428,YILyHegzhy1vlc_LNVfObw,0.904806
...,...,...
628,MU1PQ5CWuV0OKKOeI7jx6Q,0.000374
179,RX8Q4_nu3VnAwXtHdgAKCg,0.000070
1391,VAD-Faox-xQdE1WdYn8_5Q,0.000000
639,vl2IZrNJEA8npSjqXbdwxw,0.000000


In [8]:
fs.find_n_neighbours(test, 30)

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top21,top22,top23,top24,top25,top26,top27,top28,top29,top30
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-C-l8EHSLXtZZVfUAUhsPA,JtpZw-NLL5m6BNjd0LPQVQ,VxljrF5dBkgMm_C4aLJXlg,RSLTG7N0wYJhRTW5CGEm5Q,q93KSB8PtxchdgcY17bH8Q,P4XUvN_8alI6dpTC_3LMkg,6ImN7CLRvsDnrAx6DF4nIw,yyDp7MZ2st7p0fOQuFYpcA,Pub1A9LA1NSg7jNJjbUPJA,wdJKGY0xI5jpFrEL4Gm2fQ,HtHjamuBBMHsQgnP9s2ydA,...,dOj1z8X0sE0uyCmV1hTz3g,sEAzELf3_F-8j49umuYyZg,ZnznUorow2SkhWTT5saAxA,qWeiidtgba1AbPBnwJfleA,8XKSiMpMxgdwI3qjuqmXFg,WWLArhsfQz7C47CaPnudzQ,fExL6u06xTSg13k945IPAQ,BOQ_p0wtX8Xl9k2R9FBiBg,8drMKNHWavs2g6uf0pLtvg,UmzXKH73i8fixZBuB22oCw
-xDW3gYiYaoeVASXywTPgw,5OllWsrKJsYo3XQK6siRKA,WJhQpdUHoFCtpU6gHFjVlQ,XOCyTfXy26xi4gZeun9v9w,UOTY05n5PmV7aARc7_UHIw,CQZpblGIHeMlD7KqNP7J7Q,b7rchNJEp6Gaw0bHR-4s4g,FQbVI3UyKlL-HfQGcRF8aA,60skTN6p8SdQ2sGNKL0vRQ,sjzv-c1k_HGGT9vZbfimWw,haSh72Q0MsQZUpWPeVgp0Q,...,y4xrOPXEB75lJaLjr7Rtaw,VhkjxxBz-A4M-mMncpn1ZA,AyjqBovADgbskmLrIBOMlQ,RnTfhTi7VVEAMzlBXKvDsw,hltCnxRt4SFkdQwOzgDN9A,U9aadPV7mQXAbXiqXQ1jDg,j2rFqYXfK8dN-CxULh6aUg,8Ae52apMLbD3jhHcaQ_Zzw,S5Eld1fZx6D8d9i5YbFOaA,Y8adrWFzckrjuC11aAs9Og
0o8HUzggoNKay9-ZMj3HqQ,U9aadPV7mQXAbXiqXQ1jDg,HtHjamuBBMHsQgnP9s2ydA,sEAzELf3_F-8j49umuYyZg,qIJ2iA9JwRYZD0akh1nyRA,Te7_l78PIzdfiu9ki3Jrbg,dOj1z8X0sE0uyCmV1hTz3g,w0W_hs1EX6x7XuZsEkHDDQ,qWeiidtgba1AbPBnwJfleA,ZnznUorow2SkhWTT5saAxA,VxljrF5dBkgMm_C4aLJXlg,...,X2Cf71Ab7EM9Yz4qJnAfug,6ImN7CLRvsDnrAx6DF4nIw,hClOQIcmC_Tz-7n1VMRD1Q,KLUvRyRDUftXgFvUx50Kyg,wAJddqlvejbg1HHEPwWkjQ,YTl5q-1rvWGqtPs1bX7-hQ,ve_Hbp67wIvWFNEiwV2vuw,bzU8Cl6B9nqV1i5NReobvw,-C-l8EHSLXtZZVfUAUhsPA,P4XUvN_8alI6dpTC_3LMkg
17f-n5Rouu89FSKuc4c7xw,ZnznUorow2SkhWTT5saAxA,kOT_C9B-dGITaK_vpBeYNg,U9aadPV7mQXAbXiqXQ1jDg,HtHjamuBBMHsQgnP9s2ydA,sEAzELf3_F-8j49umuYyZg,YTl5q-1rvWGqtPs1bX7-hQ,Te7_l78PIzdfiu9ki3Jrbg,w0W_hs1EX6x7XuZsEkHDDQ,dOj1z8X0sE0uyCmV1hTz3g,qWeiidtgba1AbPBnwJfleA,...,S5Eld1fZx6D8d9i5YbFOaA,b7rchNJEp6Gaw0bHR-4s4g,eZrS8SeTgX4c7ir3G1-8VQ,3ajSADFnm71e_Zcip_igWQ,6ImN7CLRvsDnrAx6DF4nIw,qIJ2iA9JwRYZD0akh1nyRA,Z3S7Y6ywAOrWUS-jevfu6Q,ve_Hbp67wIvWFNEiwV2vuw,wAJddqlvejbg1HHEPwWkjQ,Rdt-JiAN7Qlw3dJ7UVdNtw
1LrSW8RZ7gvXgnknqjd8fg,Te7_l78PIzdfiu9ki3Jrbg,Z3S7Y6ywAOrWUS-jevfu6Q,yP4Xs0pRH8Uqn-BaEOmJFA,sjzv-c1k_HGGT9vZbfimWw,P4XUvN_8alI6dpTC_3LMkg,baW-EBVOGmLMuUgUdsiUpw,kOT_C9B-dGITaK_vpBeYNg,HtHjamuBBMHsQgnP9s2ydA,U9aadPV7mQXAbXiqXQ1jDg,ve_Hbp67wIvWFNEiwV2vuw,...,os-sDQoh-hCt2Lg76z6ZUA,qWeiidtgba1AbPBnwJfleA,ZnznUorow2SkhWTT5saAxA,WWLArhsfQz7C47CaPnudzQ,VxljrF5dBkgMm_C4aLJXlg,UmzXKH73i8fixZBuB22oCw,fExL6u06xTSg13k945IPAQ,J8yImoNCuugxr39znUJanA,QjZN6x1hGm2Edj95FfpKUA,KJRQUJEkWVebUImqc4vW_Q
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yyDp7MZ2st7p0fOQuFYpcA,2EuPAGalYnP7eSxPgFCNDg,D43OWyfzIQjL8feJpYh2SQ,CqeLXgQUpMZNbz4GwBz87w,hX0-0jfbXUNUTzyGtg090g,AyjqBovADgbskmLrIBOMlQ,bLbSNkLggFnqwNNzzq-Ijw,8XKSiMpMxgdwI3qjuqmXFg,-C-l8EHSLXtZZVfUAUhsPA,KJRQUJEkWVebUImqc4vW_Q,QcckMs29Z47tuSyWDCMUlA,...,wmyoMUiW6YU9Audm0lACDg,PVB0ggrspa1wWUECI7BN6w,CZnuW6YeZg7ZIiTbmI58BQ,BuKN22G8IY_rIuLLZ1Z9Xw,KLUvRyRDUftXgFvUx50Kyg,NhgU7RhuYYFmpkb1jlYJ6Q,Fv0e9RIV9jw5TX3ctA1WbA,SDgrUDEdWNFBlC4ufVk1PQ,yyDp7MZ2st7p0fOQuFYpcA,rYPjqZO89ABei1fsZBSUbA
z4RytucxI_XfcMFaEI2DRg,J8yImoNCuugxr39znUJanA,iyI93tcSuQ54ckGyjaLP8g,-C-l8EHSLXtZZVfUAUhsPA,CqeLXgQUpMZNbz4GwBz87w,SDgrUDEdWNFBlC4ufVk1PQ,zfDcvo9F7d9fAA_hWcBC5Q,qIJ2iA9JwRYZD0akh1nyRA,8XKSiMpMxgdwI3qjuqmXFg,yXzEJJaSRLgXfCzCWxRWGg,yyDp7MZ2st7p0fOQuFYpcA,...,ZsjuEgOdvx9l5GKZdtwYIw,wAJddqlvejbg1HHEPwWkjQ,CZnuW6YeZg7ZIiTbmI58BQ,w0W_hs1EX6x7XuZsEkHDDQ,HtHjamuBBMHsQgnP9s2ydA,WJhQpdUHoFCtpU6gHFjVlQ,Rdt-JiAN7Qlw3dJ7UVdNtw,VDsnV9Xc-akVbJC0hVtAQw,QSMsF78PxVYiAwwwHeZ7xA,S5Eld1fZx6D8d9i5YbFOaA
z8wN6EQuw_JF-zTjcNgsLw,WWLArhsfQz7C47CaPnudzQ,Yv7VHEZWLMmcBDvIlLh69w,YB2SZbMgQHBOGOIsBK2Tkg,tp77HGAm4JVX3BL50v7Ajg,zfDcvo9F7d9fAA_hWcBC5Q,P4XUvN_8alI6dpTC_3LMkg,ZsjuEgOdvx9l5GKZdtwYIw,uc9ITBuspRFkl-S3Bo90dg,KJRQUJEkWVebUImqc4vW_Q,S5Eld1fZx6D8d9i5YbFOaA,...,w0W_hs1EX6x7XuZsEkHDDQ,jZUpqurVtlrN2QrgLxQzZw,nrUQRMx6K-KJnoC15rMO2g,Te7_l78PIzdfiu9ki3Jrbg,dOj1z8X0sE0uyCmV1hTz3g,ZnznUorow2SkhWTT5saAxA,qWeiidtgba1AbPBnwJfleA,sEAzELf3_F-8j49umuYyZg,QSMsF78PxVYiAwwwHeZ7xA,fExL6u06xTSg13k945IPAQ
zW3PaTqR8fN-mnoYWk6Ymg,Rdt-JiAN7Qlw3dJ7UVdNtw,HtHjamuBBMHsQgnP9s2ydA,U9aadPV7mQXAbXiqXQ1jDg,vvxIVr_OR6sHBzGPG9dRXg,rKdGiLfeE55TClso9GtjsA,iyI93tcSuQ54ckGyjaLP8g,sEAzELf3_F-8j49umuYyZg,w0W_hs1EX6x7XuZsEkHDDQ,Te7_l78PIzdfiu9ki3Jrbg,dOj1z8X0sE0uyCmV1hTz3g,...,zfDcvo9F7d9fAA_hWcBC5Q,j2rFqYXfK8dN-CxULh6aUg,tp77HGAm4JVX3BL50v7Ajg,RArQt74He3Ok8asTRUw2FQ,3ajSADFnm71e_Zcip_igWQ,FcWu03iLuGYGHbFXO556pA,6ImN7CLRvsDnrAx6DF4nIw,YTl5q-1rvWGqtPs1bX7-hQ,YB2SZbMgQHBOGOIsBK2Tkg,qIJ2iA9JwRYZD0akh1nyRA


## 5.3 Test

In [None]:
user_id = "EC5nxNCWCmjHg1F14WrlxQ"

In [None]:
to_ignore = engine_cf.already_review(user_id)

In [None]:
rec_cf = engine_cf.predict(user_id, items_to_ignore=to_ignore)

In [None]:
rec_cf = business_details(business_df, rec_cf)

In [None]:
rec_cf.head(5)

In [None]:
show_map(loc=simulation_loc, cf=rec_cf)

# 6. Hybrid

[Back to menu](#menu)

**Geolocation**

In [None]:
business_df_nearby.head(5)

**Popularity Based**

In [None]:
rec_pop[["business_id", "name", "categories", "score"]].head(5)

**Content Based**

In [None]:
rec_cb[["business_id", "name", "categories", "score"]].head(5)

**Collaborative Filtering**

In [None]:
rec_cf[["business_id", "name", "categories", "rec_score"]].head(5)

In [None]:
show_map(loc=simulation_loc, popularity=rec_pop, contentbased=rec_cb, cf=rec_cf)

## 6.1 Compute weight