<a id='menu'></a>
# Menu

[1. Data Processing](#data_proc)

[2. Geolocalisation](#geo_loc)

[3. Popularity](#pop)

[4. Content Based](#content)

[5. Collaborative Filtering](#cf)

- [5.1 Model Based SVD](#cf_model)
    
- [5.2 Memory Based Memory](#cf_memory)
    

# Libraries

**Machine Learning**

In [1]:
import pandas as pd
import numpy as np
import nltk
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

**Word2Vec**

In [2]:
from nltk.tokenize import word_tokenize
import gensim 
from gensim.models import Word2Vec
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hongphuc95/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Visualization**

In [3]:
import folium
import html

In [4]:
import time

**Path**

In [5]:
data_path = "/home/hongphuc95/notebookteam/dataset/"

In [6]:
api_path = "/home/hongphuc95/notebookteam/api/"

In [7]:
import sys
pathModulesES = '../sauceforyall/'
sys.path.append(pathModulesES)
from yelpquery import YelpQuery
from pandasticsearch import Select
ye = YelpQuery()

In [8]:
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

<a id='data_proc'></a>
# 1. Load data

[Back to menu](#menu)

In [9]:
business_df = pd.read_json(data_path + "business.json", lines=True)

In [10]:
business_df = business_df.dropna(subset=["categories"]).reset_index(drop=True)

In [11]:
#review_df = pd.read_json(data_path + "cleaned/review_cleaned_2016_2019.json", lines=True)

## 1.1 Useful functions

In [12]:
def business_details(business_df, review_df):
    business_df = business_df[["business_id", "name", "categories", "address", "city", "state", "latitude", "longitude", "stars", "review_count"]]
    new_df = pd.merge(business_df, review_df, how="inner", on="business_id")
    return new_df

In [13]:
def show_map(loc, radius=2000, popularity=None, contentbased=None, cf=None):
    lat = loc["latitude"]
    long = loc["longitude"]
    mp = folium.Map(location=[lat, long], zoom_start=12, radius=200000, fill_color='#3186cc', line_color='#3186cc')
    
    if popularity is not None:
        for _, r in popularity.iterrows():
            folium.Marker(
                        location =[r.latitude, r.longitude], 
                        popup = html.escape(
                            r["name"]) + '<br>' + 
                            'Stars: ' + str(r.stars) + '<br>' + 
                            'Reviews: ' + str(r.review_count) + '<br>' + 
                            'Categories: ' + str(r.categories) + '<br>',    
                        icon = folium.Icon(color='blue')).add_to(mp)

    if contentbased is not None:
        for _, r in contentbased.iterrows():
            folium.Marker(
                        location =[r.latitude, r.longitude], 
                        popup = html.escape(
                            r["name"]) + '<br>' + 
                            'Stars: ' + str(r.stars) + '<br>' + 
                            'Reviews: ' + str(r.review_count) + '<br>' + 
                            'Categories: ' + str(r.categories) + '<br>',    
                        icon = folium.Icon(color='orange')).add_to(mp)
            
    if cf is not None:
        for _, r in cf.iterrows():
            folium.Marker(
                        location =[r.latitude, r.longitude], 
                        popup = html.escape(
                            r["name"]) + '<br>' + 
                            'Stars: ' + str(r.stars) + '<br>' + 
                            'Reviews: ' + str(r.review_count) + '<br>' + 
                            'Categories: ' + str(r.categories) + '<br>',    
                        icon = folium.Icon(color='red')).add_to(mp)
            
    return mp

<a id='geo_loc'></a>
# 2. Geolocalisation

[Back to menu](#menu)

In [14]:
from geopy.exc import GeocoderTimedOut, GeocoderServiceError, GeocoderUnavailable
from geopy.geocoders import Nominatim
from math import radians, cos, sin, asin, sqrt, pi
import requests
from sklearn.neighbors import BallTree


class Geolocation:
    token = "5e599797dbecfc222d30063da4b86640"
    send_url = "http://api.ipstack.com/check?access_key=" + token

    def __init__(self, business_df):
        self.business_df = business_df
        self.location = {}
        self.distance_df = pd.DataFrame()
        self.geolocator = Nominatim(user_agent="Data_Dive_Prod")
        
    def deg2rad(self, degree):
        '''
        function to convert degree to radian
        '''
        rad = degree * (pi/180)
        return(rad)
    
    def fit(self):
        coordinates = self.business_df[["latitude", "longitude"]].apply(self.deg2rad)
        self.tree = BallTree(coordinates, metric='haversine')
        
    def normalize(self, df):
        dataNorm = ((df - df.min()) / (df.max() - df.min()))
        return dataNorm

    def reset(self):
        self.location = {}
        self.distance_df = pd.DataFrame()
        

    def haversine_distance(self, lat1, lon1, lat2, lon2):
        """
        Calculate the great circle distance between two points
        on the earth (specified in decimal degrees)
        """
        # convert decimal degrees to radians
        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

        # haversine formule
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        c = 2 * asin(sqrt(a))
        r = 6371  # Radius of earth in kilometers.
        res = c * r
        return np.round(res, 4)

    def show_current_location(self):
        pass
        #logger.info("Current location: (Lat: %s, Long: %s)" % (self.location["latitude"], self.location["longitude"]))

    def get_current_location(self):
        return self.location

    def get_business_nearby(self):
        return self.distance_df

    def get_info_coordinate(self, coordinate):
        gps = str(coordinate["latitude"]) + ',' + str(coordinate["longitude"])
        location = self.geolocator.reverse(gps)
        self.location["city"] = location.raw["address"]["city"]
        self.location["country"] = location.raw["address"]["country"]
        self.show_current_location()

    def get_coordinate_address(self, address):
        location = None
        try:
            location = self.geolocator.geocode(address)
        except (GeocoderTimedOut, GeocoderServiceError, GeocoderUnavailable):
            time.sleep(1)
            try:
                location = self.geolocator.geocode(address)
            except (
                    GeocoderTimedOut, GeocoderServiceError,
                    GeocoderUnavailable):
                #logger.info('GeocoderServiceError occored')
                return None, None

        #logger.info(location)
        if location:
            self.location["latitude"] = location.latitude
            self.location["longitude"] = location.longitude
            coordinate = {"latitude": location.latitude,
                          "longitude": location.longitude}
            self.get_info_coordinate(coordinate=coordinate)

    def get_coordinate_ip_address(self):
        geo_req = requests.get(self.send_url)
        response = geo_req.json()
        self.location["latitude"] = float(response["latitude"])
        self.location["longitude"] = float(response["longitude"])
        self.location["city"] = response["city"]
        self.location["country"] = response["country_name"]
        self.show_current_location()

    def get_neighbors_recommend(self, lookup="", engine=True, rec_range=5):

        if engine:
            if not lookup:
                self.get_coordinate_ip_address()
            else:
                self.get_coordinate_address(lookup)
        else:
            self.location = lookup
            self.get_info_coordinate(coordinate=lookup)

        input_point = [[self.deg2rad(self.location["latitude"]), self.deg2rad(self.location["longitude"])]]
        nearest_point = self.tree.query_radius(input_point, r=rec_range / 6371)[0]

        self.distance_df = pd.DataFrame()
        self.distance_df = self.business_df[["business_id", "longitude", "latitude"]]
        self.distance_df = self.business_df[self.business_df.index.isin(nearest_point)]
        if not self.distance_df.empty:
            self.distance_df["distance"] = self.distance_df.apply(
                lambda x: self.haversine_distance(self.location["latitude"], self.location["longitude"], x["latitude"],
                                                  x["longitude"]), axis=1)

            self.distance_df = self.distance_df.sort_values(ascending=True, by="distance").reset_index(drop=True)
            self.distance_df = self.distance_df[["business_id", "distance"]]

            # Normalization
            self.distance_df["geo_score"] = self.normalize(self.distance_df["distance"])
            self.distance_df["geo_score"] = 1 - self.distance_df["geo_score"]

In [15]:
class EngineGeo:

    def __init__(self, business_df):
        self.geoloc = Geolocation(business_df=business_df)
        self.geoloc.fit()
        
    def reset(self):
        self.geoloc.reset()

    def get_business_nearby(self):
        return self.geoloc.get_business_nearby()

    def get_current_location(self):
        return self.geoloc.get_current_location()

    def recommend(self, lookup="", engine=True, rec_range=5):
        self.geoloc.get_neighbors_recommend(lookup=lookup, engine=engine, rec_range=rec_range)

In [16]:
engine_geo = EngineGeo(business_df)

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


In [17]:
#engine_geo.get_coordinate_address("Denver")

In [18]:
test_coor = {"latitude": 36.1672559, "longitude": -115.1485163}

In [19]:
engine_geo.recommend(lookup="Las Vegas", engine=True, rec_range=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [20]:
engine_geo.get_current_location()

{'latitude': 36.1672559,
 'longitude': -115.1485163,
 'city': 'Las Vegas',
 'country': 'United States of America'}

In [21]:
business_nearby = engine_geo.get_business_nearby()

In [22]:
business_nearby

Unnamed: 0,business_id,distance,geo_score
0,3fdtp-bzoE4ZgTakkcEBzQ,0.0218,1.000000
1,Vyadl8RsxaFaAFjm98lNTQ,0.0218,1.000000
2,VhazKK6zbHrdJaN-bmeHbQ,0.0361,0.997127
3,j5pWQfzFuJdYUXb-vKHgyA,0.1157,0.981137
4,jmketuCDahSV1-47orzdMg,0.1164,0.980997
...,...,...,...
5197,B19KKE75ZxqoR3EDuvA1qw,4.9921,0.001567
5198,cPtybPHvMvZygjLkgvM4GA,4.9930,0.001386
5199,FhqoXrpfw5ji_Qoh4GcJMg,4.9944,0.001105
5200,PZ3qgjUMg8akZaP0CssYNA,4.9997,0.000040


<a id='pop'></a>
# 3. Popularity Based

[Back to menu](#menu)

## 3.1 Methods

In [23]:
class Popularity:

    def __init__(self, business_df, ye, review_df, checkin_df):
        self.business_df = business_df
        self.ye = ye
        self.review_df = review_df
        self.checkin_df = checkin_df

    def get_model_name(self):
        return self.MODEL_NAME
    
    def normalize(self, df):
        dataNorm = ((df - df.min()) / (df.max() - df.min()))
        return dataNorm

    def recommend(self, top_n=50, filters={}, geo_w=0.2, pop_w=0.8, elastic=True):
        mask = np.array([])
        recommendations_df = pd.DataFrame()
        nearby_df = pd.DataFrame()
        if filters:
            if "nearby" in filters and not filters["nearby"].empty:
                nearby_df = filters["nearby"]
                nearby_ids = nearby_df["business_id"].values
                if mask.size == 0:
                    mask = nearby_ids
                else:
                    mask = np.intersect1d(mask, nearby_ids)

            
            if "categories" in filters and filters["categories"]:
                mask_cat = self.business_df[self.business_df["categories"].str.contains(filters["categories"])][
                    "business_id"].values
                mask = np.intersect1d(mask, mask_cat)

            # review_df = self.review_df[self.review_df["business_id"].isin(mask)]
            if mask.size > 0:
                if elastic:
                    mustArray = [
                        self.ye.bodyMultivalueTerm("business_id.keyword", np.unique(mask).tolist()),
                        self.ye.bodyRange("date", gteValue="2016-01-01", lteValue="2018-12-31")
                    ]
                    review_fisrt_chunk = self.ye.getComplexeQuery(index="yelp-review*",
                                                                  mustArray=mustArray, filterArray=[],
                                                                  include=["business_id", "user_id", "stars"], 
                                                                  size=2000)

                    print("Total reviews retrieved: %d" % (review_fisrt_chunk["hits"]["total"]["value"]))
                    review_df = self.ye.getResultScrolling(review_fisrt_chunk)
                else:
                    review_df = self.review_df[self.review_df["business_id"].isin(mask)]

                recommendations_df = review_df.groupby("business_id")["stars"] \
                    .agg(["sum", "count"]) \
                    .reset_index()

                recommendations_df["ratings_avg"] = (recommendations_df["sum"] / recommendations_df["count"])
                recommendations_df = recommendations_df \
                    .sort_values(ascending=False, by="ratings_avg") \
                
                #Normalization of popularity score
                recommendations_df["score_rating"] = self.normalize(recommendations_df["ratings_avg"])
                
                #Join df above and checkins
                recommendations_df = pd.merge(left=recommendations_df, right=self.checkin_df, how="inner",
                                                 on="business_id")

                #Normalization of popularity score
                recommendations_df["score_checkin"] = self.normalize(recommendations_df["all_years"])

                recommendations_df["pop_score"] = (0.5 * recommendations_df["score_rating"]) + (0.5 * recommendations_df["score_checkin"])
                #print()
                
                if not nearby_df.empty:
                    #Join geolocation and rating normalized score
                    recommendations_df = pd.merge(left=recommendations_df, right=nearby_df, how="inner",
                                                  on="business_id")
                    
            
                    #recommendations_df = recommendations_df.rename(
                    #    columns={"score": "pop_score"})

                    #recommendations_df["geo_score"] = recommendations_df["geo_score"] * geo_w
                    #recommendations_df["pop_score"] = recommendations_df["pop_score"] * pop_w
                    recommendations_df["score"] = recommendations_df["geo_score"] * geo_w + recommendations_df["pop_score"] * pop_w

                    recommendations_df = recommendations_df.sort_values(ascending=False, by="score")

                recommendations_df = recommendations_df.head(top_n)

        return recommendations_df

In [24]:
class EnginePopularity:

    def __init__(self, business_df, ye, review_df, checkin_df):
        self.pop = Popularity(business_df=business_df, ye=ye, review_df=review_df, checkin_df=checkin_df)

    def recommend(self, top_n=50, filters={}, elastic=True):
        return self.pop.recommend(top_n=top_n, filters=filters, elastic=elastic)

## 3.2 Test

In [77]:
review_df = pd.read_csv("/home/hongphuc95/notebookteam/fu/review.csv")

In [106]:
engine_pop = EnginePopularity(business_df=business_df, ye=ye, review_df=review_df)

In [113]:
filters={}
filters["nearby"] = business_nearby
filters["categories"] = "tacos"

In [27]:
#nearby_df = filters["nearby"]
#nearby_ids = nearby_df["business_id"].values

In [32]:
body = {
"size" : 0,
"query":{
    "bool": {
      "must": [
        {"range": {
          "date": {
            "gte": "2016-01-01",
            "lte": "2018-12-31"
          }
        }},
        {
          "terms": {
            "business_id.keyword": np.unique(nearby_ids).tolist()
          }   
        }
      ]
    }
  },
  "aggs": {
    "fil_agg": {
      "terms": {
        "field": "business_id.keyword"
      },
      "aggs": {
        "sum_rating": {
          "sum": {
            "field": "stars"
          }
        }
      }
    }
  }
}

In [34]:
result = ye.es.search(
            body=body,
            index="yelp-review",
            size=0,
            _source_excludes=[],
            _source_includes=[])

INFO:elasticsearch:POST http://47.91.72.40:9200/yelp-review/_search?_source_excludes=&_source_includes=&size=0 [status:200 request:0.122s]


In [35]:
result

{'took': 82,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': None,
  'hits': []},
 'aggregations': {'fil_agg': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 74612,
   'buckets': [{'key': 'KskYqH1Bi7Z_61pH6Om8pg',
     'doc_count': 1336,
     'sum_rating': {'value': 5709.0}},
    {'key': '2iTsRqUsPGRH1li1WVRvKQ',
     'doc_count': 1295,
     'sum_rating': {'value': 5809.0}},
    {'key': 'eJKnymd0BywNPrJw1IuXVw',
     'doc_count': 1152,
     'sum_rating': {'value': 4910.0}},
    {'key': '2sx52lDoiEtef7xgPCaoBw',
     'doc_count': 1119,
     'sum_rating': {'value': 4958.0}},
    {'key': '3GEEy7RP6e4bT4LAiWFMFQ',
     'doc_count': 1074,
     'sum_rating': {'value': 4464.0}},
    {'key': 'l_GV0hgEoTUf70uJVT0_hg',
     'doc_count': 1023,
     'sum_rating': {'value': 4276.0}},
    {'key': 'UidEFF1WpnU4duev4fjPlQ',
     'doc_count': 942,
     'sum_rating': {'value': 

In [114]:
start = time.time()
rec_pop = engine_pop.recommend(filters=filters, top_n=50, elastic=False)
print("Done popularity recommendation in %s seconds." % (time.time() - start))

Done popularity recommendation in 0.187774658203125 seconds.


In [117]:
rec_pop

In [115]:
rec_pop.drop(columns=["sum", "count", "ratings_avg"], inplace=True)

KeyError: "['sum' 'count' 'ratings_avg'] not found in axis"

In [116]:
rec_pop

In [111]:
rec_pop = business_details(business_df, rec_pop)

In [112]:
rec_pop

Unnamed: 0,business_id,name,categories,address,city,state,latitude,longitude,stars,review_count,pop_score,distance,geo_score,score
0,e0JaE8S3MdZijvrf_PAYjQ,El Chapo,"Cafes, Themed Cafes, Tacos, Restaurants, Mexican",36 Toronto Street,Toronto,ON,43.650579,-79.376537,3.0,16,0.0,0.6771,0.433395,0.433395
1,vQGZjrDiDBuznOkUevBNzw,Los Vietnamita Taqueria Shop,"Restaurants, Tacos, Asian Fusion, Mexican, Vie...",,Toronto,ON,43.653133,-79.382928,4.5,4,0.5,0.0898,0.49228,0.99228


In [43]:
rec_pop[["name", "categories", "city", "distance", "pop_score", "geo_score", "score"]]

Unnamed: 0,name,categories,city,distance,pop_score,geo_score,score
0,Bridger Inn,"Hotels, Event Planning & Services, Hotels & Tr...",Las Vegas,0.2102,0.3125,0.481077,0.793577
1,Segway Las Vegas,"Tours, Motorcycle Rental, Hotels & Travel, Act...",Las Vegas,0.6738,0.5,0.434513,0.934513
2,Enterprise Rent-A-Car,"Hotels & Travel, Car Rental",Las Vegas,0.6005,0.269231,0.441875,0.711106
3,Cowboy Trail Rides,"Hotels & Travel, Tours",Las Vegas,2.953,0.5,0.20559,0.70559
4,Las Vegas Club Hotel & Casino,"Casinos, Hotels & Travel, Event Planning & Ser...",Las Vegas,0.593,0.3125,0.442629,0.755129
5,Garden Court Buffet,"Restaurants, Hotels, Event Planning & Services...",Las Vegas,0.8199,0.306383,0.419839,0.726222
6,Taxi Service,"Taxis, Tours, Hotels & Travel, Wine Tours, Tra...",Las Vegas,0.835,0.35,0.418322,0.768322
7,Premier Club Tours,"Adult Entertainment, Nightlife, Bars, Party Bu...",Las Vegas,0.6976,0.410156,0.432123,0.842279
8,Neon Museum,"Hotels & Travel, Shopping, Arts & Entertainmen...",Las Vegas,1.6025,0.40871,0.341235,0.749945
9,Place on 7th,"Hotels & Travel, Event Planning & Services, Ho...",Las Vegas,0.9994,0.3625,0.40181,0.76431


In [44]:
show_map(loc=engine_geo.get_current_location(), popularity=rec_pop)

In [28]:
#TODO isopen = 1

**Merge business informations**

In [29]:
rec_pop = business_details(business_df, rec_pop)

In [30]:
rec_pop

Unnamed: 0,business_id,name,categories,address,city,state,latitude,longitude,stars,review_count,sum,count,ratings_avg,pop_score,distance,geo_score,score
0,j6D75GAJu6LdvRR1_OsxnA,Bridger Inn,"Hotels, Event Planning & Services, Hotels & Tr...",301 S Main St,Las Vegas,NV,36.169008,-115.147637,3.5,10,28.0,8,3.5,0.3125,0.2102,0.481077,0.793577
1,ovFnrmsT5rUpbAQE1r-V0Q,Segway Las Vegas,"Tours, Motorcycle Rental, Hotels & Travel, Act...",901 S Main St,Las Vegas,NV,36.161839,-115.151881,5.0,38,70.0,14,5.0,0.5,0.6738,0.434513,0.934513
2,ER7lT5gKBN-WpNkSIChQAQ,Enterprise Rent-A-Car,"Hotels & Travel, Car Rental",301 Fremont St,Las Vegas,NV,36.170154,-115.142871,3.0,16,41.0,13,3.153846,0.269231,0.6005,0.441875,0.711106
3,i42M5aLwhsCSRD0eWrp50g,Cowboy Trail Rides,"Hotels & Travel, Tours",1211 S Eastern Ave,Las Vegas,NV,36.156127,-115.118651,5.0,5,15.0,3,5.0,0.5,2.953,0.20559,0.70559
4,MYD64NGYbF0n7sQZ-I4o5g,Las Vegas Club Hotel & Casino,"Casinos, Hotels & Travel, Event Planning & Ser...",18 E Fremont St,Las Vegas,NV,36.172075,-115.145689,2.0,97,7.0,2,3.5,0.3125,0.593,0.442629,0.755129
5,EUWBT5GDxPC95w9itZ1EHw,Garden Court Buffet,"Restaurants, Hotels, Event Planning & Services...",200 N Main St,Las Vegas,NV,36.174097,-115.14511,3.5,556,811.0,235,3.451064,0.306383,0.8199,0.419839,0.726222
6,AZ4JC4-YOIWhOOOSo0AcMw,Taxi Service,"Taxis, Tours, Hotels & Travel, Wine Tours, Tra...",,Las Vegas,NV,36.169941,-115.13983,4.0,12,38.0,10,3.8,0.35,0.835,0.418322,0.768322
7,PWFB8WyJGaFBWfcrwcn9Lw,Premier Club Tours,"Adult Entertainment, Nightlife, Bars, Party Bu...",100 S Las Vegas Blvd,Las Vegas,NV,36.168946,-115.141032,4.5,47,137.0,32,4.28125,0.410156,0.6976,0.432123,0.842279
8,g83WbX_recywc4DEIZ-xug,Neon Museum,"Hotels & Travel, Shopping, Arts & Entertainmen...",770 North Las Vegas Blvd,Las Vegas,NV,36.176986,-115.135346,4.5,998,2549.0,597,4.269682,0.40871,1.6025,0.341235,0.749945
9,h_A7TFaKasHmrc9s_hhb5w,Place on 7th,"Hotels & Travel, Event Planning & Services, Ho...",115 7th St,Las Vegas,NV,36.168895,-115.13757,4.0,10,39.0,10,3.9,0.3625,0.9994,0.40181,0.76431


In [31]:
show_map(loc=engine_geo.get_current_location(), popularity=rec_pop)

<a id='content'></a>
# 4. Content Based Keyword

[Back to menu](#menu)

## 4.1 Methods

In [489]:
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import Word2Vec
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.metrics.pairwise import cosine_similarity

data_path = "/home/hongphuc95/notebookteam/dataset/"
api_path = "/home/hongphuc95/notebookteam/api/"

class ContentExtact:

    def __init__(self, ye, business_df):
        self.ye = ye
        self.business_df = business_df

    ##########################################################
    #                     Loading tools                      #
    ##########################################################
    def load(self):
        self.load_text_to_vec()
        self.load_word2vec()

    ##########################################################
    #                    Text Processing                     #
    ##########################################################

    def clean_text(self, text):
        all_stopwords = stopwords.words('english')
        text_tokens = word_tokenize(text.lower())
        tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]
        return tokens_without_sw

    def text_processing(self):
        logger.info("Cleaning text in progress...")
        start = time.time()
        review_df = pd.read_json(data_path + "cleaned/review_cleaned_2016_2019.json", lines=True)
        review_by_business = review_df.groupby('business_id')['text'].agg(lambda col: ' '.join(col)).reset_index()
        review_by_business["text"] = review_by_business["text"].apply(lambda x: self.clean_text(x))
        logger.info("Done cleaning text in %s seconds." % (time.time() - start))
        return review_by_business

    ##########################################################
    #                        Word2Vec                        #
    ##########################################################
    def train_word2vec(self, review_by_business=None, save=True):
        logger.info("Training Word2Vec Model in progress...")
        start = time.time()
        model = gensim.models.Word2Vec(review_by_business["text"], min_count=5, size=200, workers=4)
        logger.info("Done Word2Vec text in %s seconds." % (time.time() - start))
        self.model = model
        if save:
            self.save_word2vec(self.model)

    def save_word2vec(self, model):
        model.save(api_path + "trained/review_full.model")

    def load_word2vec(self):
        try:
            self.model = Word2Vec.load(api_path + "trained/review_full.model")
            logger.info("Reviews vectorized loaded")
        except (FileNotFoundError, IOError):
            self.model = None
            logger.info("File not found")

    ##########################################################
    #                        TextToVec                       #
    ##########################################################

    def text_to_vec(self, save=True):
        logger.info("Convert texts to vectors in progress...")
        start = time.time()
        self.load_word2vec()
        review_by_business = self.text_processing()

        if not self.model:
            self.train_word2vec(review_by_business=review_by_business, save=True)

        # Clean DF
        review_by_business["text_vec"] = review_by_business["text"].apply(
            lambda x: self.avg_feature_vector(x, model=self.model, n_features=200))

        review_by_business.drop("text", axis=1, inplace=True)
        if save:
            self.save_text_to_vec(review_by_business)
        self.docvecs = review_by_business
        logger.info("Done converting texts to vectors in %s seconds." % (time.time() - start))


    def save_text_to_vec(self, review_by_business):
        with open(api_path + "trained/review_2016_2019_full_vectorized.pickle", "wb") as f:
            pickle.dump(review_by_business, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load_text_to_vec(self):
        try:
            with open(api_path + "trained/review_2016_2019_full_vectorized.pickle", "rb") as f:
                self.docvecs = pickle.load(f)
                logger.info("Reviews vectorized loaded")
        except (FileNotFoundError, IOError):
            self.text_to_vec()

    def avg_feature_vector(self, sentence, model, n_features):
        index2word_set = set(model.wv.index2word)
        feature_vec = np.zeros((n_features,), dtype='float32')
        n_words = 0
        for word in sentence:
            if word in index2word_set:
                n_words += 1
                feature_vec = np.add(feature_vec, model.wv[word])
        if (n_words > 0):
            feature_vec = np.divide(feature_vec, n_words)
        return feature_vec

    ##########################################################
    #                        Recommend                       #
    ##########################################################

    def keyword_recommend(self, input_str, top_n=20, filters={}, content_w=0.8, geo_w=0.2):
        docvecs = self.docvecs
        nearby_df = pd.DataFrame()

        if filters:
            mask = np.array([])
            if "nearby" in filters and not filters["nearby"].empty:
                nearby_df = filters["nearby"]
                nearby_ids = nearby_df["business_id"].values
                if mask.size == 0:
                    mask = nearby_ids
                else:
                    mask = np.intersect1d(mask, nearby_ids)

            #if "categories" in filters and filters["categories"]:
            #    mask_cat = self.business_df[self.business_df["categories"].str.contains(filters["categories"])][
            #        "business_id"].values
            #    if mask.size == 0:
            #        mask = mask_cat
            #    else:
            #        mask = np.intersect1d(mask, mask_cat)

            docvecs = docvecs[docvecs["business_id"].isin(mask)]

        from nltk.tokenize import word_tokenize
        input_vec = pd.DataFrame({"text": [input_str]})
        input_vec["text"] = input_vec["text"].apply(lambda x: word_tokenize(x.lower()))
        input_vec["text_vec"] = input_vec["text"].apply(
            lambda x: self.avg_feature_vector(x, model=self.model, n_features=200))
        business_similarity = docvecs[["business_id", "text_vec"]]

        # compute similarity array
        business_similarity["score"] = business_similarity["text_vec"].apply(
            lambda x: cosine_similarity([x], [input_vec["text_vec"].values[0]])[0][0])
        business_similarity = business_similarity.drop(columns=["text_vec"]).sort_values(ascending=False,
                                                                                         by="score")
        #print("Shape business similarity before: %s" % (business_similarity.shape))
        if "nearby" in filters and not filters["nearby"].empty:
            business_similarity = business_similarity.rename(columns={"score": "content_score"})
            business_similarity = pd.merge(left=business_similarity, right=nearby_df, how="inner", on="business_id")
            business_similarity["score"] = content_w * business_similarity["content_score"] + geo_w * business_similarity["geo_score"]
            business_similarity = business_similarity.sort_values(by="score", ascending=False)            
            #print("Shape business similarity after: %s" % (business_similarity))
            
        business_similarity = business_similarity.head(top_n)
        return business_similarity

    def business_similarity(self, business_ids, top_n, filters={}, content_w=0.8, geo_w=0.2):

        docvecs = self.docvecs
        businesses_similarity = pd.DataFrame()
        nearby_df = pd.DataFrame()

        if "categories" in filters and filters["categories"]:
            cat_ids = self.business_df[self.business_df["categories"].str.contains(filters["categories"])]["business_id"].values
            if cat_ids.size > 0:
                business_ids = np.intersect1d(business_ids, cat_ids)
                
            print(business_ids)
                
        if business_ids.size > 0:
            if "nearby" in filters and not filters["nearby"].empty:
                nearby_df = filters["nearby"]
                nearby_ids = nearby_df["business_id"].values
                if nearby_ids.size > 0:
                    mask = np.append(nearby_ids, business_ids)
                    docvecs = docvecs[docvecs["business_id"].isin(mask)]
                                      
            business_ids = business_ids[:5]
            for business_id in business_ids:
                business_similarity = pd.DataFrame()
                input_vec = docvecs[docvecs["business_id"] == business_id]
                business_similarity = docvecs[["business_id", "text_vec"]]

                # compute similarity array
                business_similarity["score"] = business_similarity["text_vec"].apply(
                    lambda x: cosine_similarity([x], [input_vec["text_vec"].values[0]])[0][0])
                business_similarity = business_similarity.drop(columns=["text_vec"]).sort_values(ascending=False,
                                                                                             by="score")
                
                
                # Filter business with the same id
                business_similarity = business_similarity[~business_similarity["business_id"].isin(business_ids)]
                business_similarity["input_business_id"] = business_id
                
                if "nearby" in filters and not filters["nearby"].empty:
                    business_similarity = business_similarity.rename(columns={"score": "content_score"})
                    business_similarity = pd.merge(left=business_similarity, right=nearby_df, how="inner", on="business_id")
                    business_similarity["score"] = content_w * business_similarity["content_score"] + geo_w * \
                                                   business_similarity["geo_score"]
                    business_similarity = business_similarity.sort_values(by="score", ascending=False)
                business_similarity = business_similarity.head(top_n)
                businesses_similarity = businesses_similarity.append(business_similarity)
        
        return business_ids, businesses_similarity

    def content_recommend(self, user_id, top_n, filters={}, content_w=0.8, geo_w=0.2):
        logger.debug("Starting Content Based Userid...")
        start = time.time()
        
        docvecs = self.docvecs

        mustArray = [
            self.ye.bodySingleMatch("user_id", user_id),
            self.ye.bodyRange("date", gteValue="2016-01-01", lteValue="2018-12-31"),
            self.ye.bodyRange("stars", gteValue="3")
        ]
        review_fisrt_chunk = self.ye.getComplexeQuery(index="yelp-review",
                                                      mustArray=mustArray, filterArray=[],
                                                      exclude=["text", "@timestamp", "@version", "cool", "useful",
                                                               "funny"], size=2000)

        print("Total reviews retrieved: %d" % (review_fisrt_chunk["hits"]["total"]["value"]))
        review_df = self.ye.getResultScrolling(review_fisrt_chunk)
        review_df = review_df.sort_values(by="date", ascending=False)
        # review_df = alreadyreviwed(user_id=user_id, review_df=self.review_df)
        # review_df = review_df[review_df["stars"] >= 3].sort_values(by="date", ascending=False)

        # print("Businesses previously reviewed by user: ")
        # already_reviewed = business_details(self.business_df, review_df)
        # print(already_reviewed[["name", "business_id", "categories"]])

        business_reviewed_ids = review_df["business_id"].unique()
        business_reviewed_ids, sim_business_df = self.business_similarity(business_reviewed_ids, top_n=top_n, filters=filters,
                                                   content_w=content_w, geo_w=geo_w)

        logger.debug("Done Content Based Userid in %s seconds." % (time.time() - start))
        return business_reviewed_ids, sim_business_df

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hongphuc95/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [490]:
class EngineCB:

    def __init__(self, ye, business_df):
        logger.info("Initilizing Content Based Engine")
        self.content = ContentExtact(ye=ye, business_df=business_df)

    def load(self):
        self.content.load()

    def train_word2vec(self):
        self.train_word2vec()

    def train_texttovec(self):
        self.content.text_to_vec()

    def keyword_recommend(self, input_str, top_n=10, filters={}):
        return self.content.keyword_recommend(input_str=input_str, top_n=top_n, filters=filters)

    def content_recommend(self, user_id, top_n=10, filters={}):
        return self.content.content_recommend(user_id=user_id, top_n=top_n, filters=filters)

## 4.2 Test

In [491]:
#hm = engine_cb.content_recommend(user_id="EC5nxNCWCmjHg1F14WrlxQ", nearby_businesses=business_ids_nearby, top_n=10)

In [492]:
filters={}
filters["nearby"] = business_nearby
filters["categories"] = "Restaurants"

In [493]:
engine_cb = EngineCB(ye=ye, business_df=business_df)

INFO:__main__:Initilizing Content Based Engine


In [494]:
engine_cb.load()

INFO:__main__:Reviews vectorized loaded
INFO:gensim.utils:loading Word2Vec object from /home/hongphuc95/notebookteam/api/trained/review_full.model
INFO:gensim.utils:loading wv recursively from /home/hongphuc95/notebookteam/api/trained/review_full.model.wv.* with mmap=None
INFO:gensim.utils:loading vectors from /home/hongphuc95/notebookteam/api/trained/review_full.model.wv.vectors.npy with mmap=None
INFO:gensim.utils:setting ignored attribute vectors_norm to None
INFO:gensim.utils:loading vocabulary recursively from /home/hongphuc95/notebookteam/api/trained/review_full.model.vocabulary.* with mmap=None
INFO:gensim.utils:loading trainables recursively from /home/hongphuc95/notebookteam/api/trained/review_full.model.trainables.* with mmap=None
INFO:gensim.utils:loading syn1neg from /home/hongphuc95/notebookteam/api/trained/review_full.model.trainables.syn1neg.npy with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:loaded /home/hongphuc95/notebook

In [495]:
#filters={}
#filters["nearby"] = business_nearby

In [496]:
#start = time.time()
#rec_cb = engine_cb.keyword_recommend(input_str="tacos", top_n=20, filters=filters)
#print("Done keyword recommendation in %s seconds." % (time.time() - start))

In [497]:
#rec_cb = business_details(business_df, rec_cb)

In [498]:
#rec_cb[["name", "categories", "city", "distance", "content_score", "geo_score", "score"]]

In [499]:
#show_map(loc=engine_geo.get_current_location(), contentbased=rec_cb)

In [500]:
#"EC5nxNCWCmjHg1F14WrlxQ", AyjqBovADgbskmLrIBOMlQ

In [501]:
start = time.time()
reviewed_df, rec_cb = engine_cb.content_recommend(user_id="S_7OkmN0BicgWEt2oMmzIQ", top_n=20, filters=filters)
print("Done Content Based Userid in %s seconds." % (time.time() - start))

INFO:elasticsearch:POST http://47.91.72.40:9200/yelp-review/_search?_source_excludes=text%2C%40timestamp%2C%40version%2Ccool%2Cuseful%2Cfunny&_source_includes=&scroll=1m&size=2000 [status:200 request:0.018s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.014s]


Total reviews retrieved: 37
['-pG5srxtnRkd1bdIYn78xA' '1ZnVfS-qP19upP_fwOhZsA'
 '2PS9kBbuJcmBhcNp-D62uA' '6H8xfhoZ2IGa3eNiY5FqLA'
 '6nbPXEjmBnDtsgM_YNwGjw' '6tSvz_21BMo3a4GaItwa0g'
 'D6K3VT6S8FwFm4u5uN8T8g' 'DYAorbxOyubUB_wtQRCdug'
 'HaWU8ICNtjIkCN_ZNrx1hw' 'OAbwqq66Sc2JYkO6b6rEDw'
 'PglC8rgguMIlT621p1BLdQ' 'U0UnBjz9DugW2qRZputKBg'
 'V-0qRzBHKixmQgon_fW_AA' 'VD0zZKG_ZEz_VYlKLtu9Zg'
 'WETHRoXB2IbixndSlOktJA' 'ZyOLevFrV7Vxi0OOS3lf-w'
 'bgx6gYdktqEoQwBdo5lRbA' 'c5N-XCk-ntpyYRsYkwH7kA'
 'f4vb5tv60WaBcwzRhKpANA' 'hZDNYOfZnVp10pFsycr8rg'
 'l1GJnB9TJgGgEeI4at1M0A' 'mD8bJbumIGOtlXXuvVNuqw'
 'n_K2xa_nqzUqaUJa0S_5tQ' 'ozMd45nXHTCl3mdOwouwlw'
 'rioQ_p2pILNbJ4Xp5jW6-Q' 'w5CSi-An5meLnxjKSFn0wQ'
 'wX1ORSDKPkA5ftE7F6UONw' 'xd-AnaGxEMKT2zTEheP6_w']
Done Content Based Userid in 3.7520105838775635 seconds.


In [488]:
reviewed_df

array(['-pG5srxtnRkd1bdIYn78xA', '1ZnVfS-qP19upP_fwOhZsA',
       '2PS9kBbuJcmBhcNp-D62uA', '6H8xfhoZ2IGa3eNiY5FqLA',
       '6nbPXEjmBnDtsgM_YNwGjw'], dtype=object)

In [459]:
rec_cb = business_details(business_df, rec_cb)

In [460]:
rec_cb[["input_business_id", "business_id", "name", "categories", "distance", "content_score", "geo_score", "score"]]

Unnamed: 0,input_business_id,business_id,name,categories,distance,content_score,geo_score,score
0,BFibmswPtBNyChp8vBHRsg,h_ypoQ2rmwX8UGSmohsGiQ,Maman,"Event Planning & Services, American (Tradition...",0.5157,0.964839,0.899156,0.951702
1,Ia1JlEo7UUyZtmqRe1K1Pw,h_ypoQ2rmwX8UGSmohsGiQ,Maman,"Event Planning & Services, American (Tradition...",0.5157,0.947793,0.899156,0.938065
2,BFibmswPtBNyChp8vBHRsg,fHHQ9s6wWPkTMyNVq-0SHQ,JJ Bean,"Coffee & Tea, Coffee Roasteries, Food, Bakeries",0.4267,0.941163,0.917003,0.936331
3,BFibmswPtBNyChp8vBHRsg,LpwmR1unntc_8KdVfwNH9g,Bulldog Coffee,"Restaurants, Shopping, Cafes, Public Markets, ...",0.3482,0.952092,0.932744,0.948222
4,n8Zqqhff-2cxzWt_nwhU2Q,F-bdXFkJwwENiNpMTG2ntQ,The 3 Brewers,"Breweries, Bars, Pubs, Food, Restaurants, Gast...",0.3752,0.982566,0.927330,0.971519
...,...,...,...,...,...,...,...,...
95,0_VT3sTwi7gorIlU36ASmg,jF3sWSqa-6TyROZgSUGrIw,Rock 'n' Horse Saloon,"Soul Food, Pizza, Food, Music Venues, Restaura...",0.7181,0.910143,0.858569,0.899828
96,0_VT3sTwi7gorIlU36ASmg,Nso8kLJGYhJSlO1uZFnKvg,Bar 244,"Dance Clubs, Restaurants, Bars, Nightlife, Ame...",0.7052,0.905849,0.861156,0.896910
97,BFibmswPtBNyChp8vBHRsg,i78im_1r1ZQbwqLjVxZxpg,Aroma Espresso Bar,"Breakfast & Brunch, Coffee & Tea, Food, Restau...",0.1992,0.932832,0.962622,0.938790
98,Ia1JlEo7UUyZtmqRe1K1Pw,i78im_1r1ZQbwqLjVxZxpg,Aroma Espresso Bar,"Breakfast & Brunch, Coffee & Tea, Food, Restau...",0.1992,0.966550,0.962622,0.965764


In [None]:
rec_cb = rec_cb[rec_cb["input_business_id"] == "ZRsDVZmMjE8jLqHivluWLA"]

In [None]:
show_map(loc=simulation_loc, contentbased=rec_content)

<a id='cf'></a>
# 5. Collaorative Filtering

[Back to menu](#menu)

## 5.1 Model Based SVD
<a id='cf_model'></a>

In [11]:
#review_df = pd.read_json(data_path + "cleaned/review_cleaned_2016_2019_Vegas.json", lines=True)

In [23]:
review_df = pd.read_csv("/home/hongphuc95/notebookteam/fu/review.csv")

In [24]:
from scipy import sparse
from scipy.sparse.linalg import svds
from sklearn.preprocessing import LabelEncoder

class CustomSVD:
    def __init__(self, review_df):
        self.user_active_df = self.get_active_user(review_df)

    def get_active_user(self, review_df):
        n_active = 40
        user_review_df_count = review_df.groupby("user_id").size()
        user_review_active_values = user_review_df_count[user_review_df_count >= n_active].reset_index()[
            "user_id"].values
        user_active_df = review_df[review_df["user_id"].isin(user_review_active_values)]
        user_active_df = user_active_df.groupby(["user_id", "business_id"], as_index=False).mean()
        return user_active_df

    def create_utility_matrix(self, df):
        user_encoder = LabelEncoder()
        business_encoder = LabelEncoder()

        user_ids = pd.DataFrame(columns=["user_id_matrix"])
        business_ids = pd.DataFrame(columns=["business_id_matrix"])
        df["user_id_matrix"] = user_encoder.fit_transform(df['user_id'])
        df["business_id_matrix"] = business_encoder.fit_transform(df['business_id'])

        user_ids["user_id_matrix"] = df["user_id_matrix"].unique()
        business_ids["business_id_matrix"] = df["business_id_matrix"].unique()
        user_ids["user_id"] = user_encoder.inverse_transform(user_ids["user_id_matrix"])
        business_ids["business_id"] = business_encoder.inverse_transform(business_ids["business_id_matrix"])

        return df, user_ids, business_ids

    def create_sparse_matrix(self, user_active_matrix, user_ids, business_ids):
        highest_user_id = len(user_ids['user_id'].unique())
        highest_business_id = len(business_ids['business_id'].unique())
        shape_matrix = (highest_user_id, highest_business_id)
        ratings_mat = sparse.lil_matrix(shape_matrix)
        for i, row in user_active_matrix.iterrows():
            ratings_mat[row["user_id_matrix"], row["business_id_matrix"]] = row["stars"]

        user_ratings_mean = np.mean(ratings_mat, axis=1)
        ratings_mat = ratings_mat - user_ratings_mean.reshape(-1, 1)
        return user_ratings_mean, ratings_mat

    def svd_pred(self, ratings_mat, user_ratings_mean, n_factor=20):
        U, sigma, Vt = svds(ratings_mat, k=n_factor)
        sigma = np.diag(sigma)
        all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

        #Similar user matrix
        U_df = pd.DataFrame(U)
        user_similarity = cosine_similarity(U_df)
        user_similarity_df = pd.DataFrame(user_similarity, index=U_df.index, columns=U_df.index)
        # Normalization
        # all_user_predicted_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (
        #        all_user_predicted_ratings.max() - all_user_predicted_ratings.min())
        return user_similarity_df, all_user_predicted_ratings

    def fit(self, n_factor=20):
        logger.info("Start building matrix")
        start = time.time()
        self.cf_preds_df = None

        user_active_matrix, self.user_ids, self.business_ids = self.create_utility_matrix(self.user_active_df)
        user_ratings_mean, rating_mat = self.create_sparse_matrix(user_active_matrix, self.user_ids, self.business_ids)
        self.cf_user_similarity_df, self.cf_preds_df = self.svd_pred(ratings_mat=rating_mat, user_ratings_mean=user_ratings_mean, n_factor=n_factor)

        logger.info("Matrix built in %s seconds." % (time.time() - start))

    def transform(self, user_id, filters={}, cf_w=0.8, geo_w=0.2, topn=1000):
        logger.info("Start predicting")
        start = time.time()
        recommendations_df = pd.DataFrame()

        user_id_num = self.user_ids[self.user_ids["user_id"] == user_id].user_id_matrix.values
        if user_id_num.size > 0:
            user_id_num = user_id_num.astype(int)[0]
            #Prediction user matrix
            pred_user = self.cf_preds_df[user_id_num, :]
            #Similar to user matrix
            similar_users = self.cf_user_similarity_df[user_id_num].sort_values(ascending=False)[:20]
            similar_users_df = pd.DataFrame({"score":similar_users}).reset_index()
            user_id_index = pd.merge(left=similar_users_df, right=self.user_ids, how="inner", left_on="index", right_on="user_id_matrix")
            user_id_index.drop(columns=["index", "user_id_matrix"], inplace=True)
            
            sorted_user_predictions = pd.DataFrame(pred_user.T, columns=["score"])
            sorted_user_predictions = pd.merge(sorted_user_predictions, self.business_ids, left_index=True,
                                               right_on="business_id_matrix") \
                .drop(columns=["business_id_matrix"]) \
                .sort_values(ascending=False, by="score") \
                .reset_index(drop=True)

            items_to_ignore = self.user_active_df[self.user_active_df["user_id"] == user_id]
            items_to_ignore = items_to_ignore["business_id"].unique()

            # Recommend the highest predicted rating movies that the user hasn't seen yet.
            recommendations_df = sorted_user_predictions[~sorted_user_predictions['business_id'].isin(items_to_ignore)] \
                .sort_values('score', ascending=False)
            
            if "nearby" in filters and not filters["nearby"].empty:
                nearby_df = filters["nearby"]
                recommendations_df = pd.merge(left=recommendations_df, right=nearby_df, how="inner", on="business_id")
                if not recommendations_df.empty:
                    recommendations_df = recommendations_df.rename(columns={"score": "cf_score"})
                    recommendations_df["score"] = cf_w * recommendations_df["cf_score"] + geo_w * recommendations_df[
                        "geo_score"]
                    recommendations_df = recommendations_df.sort_values(by="score", ascending=False)

            recommendations_df = recommendations_df.head(topn)
            logger.info("Prediction done in %s seconds." % (time.time() - start))
            return user_id_index, recommendations_df
        else:
            logger.info("This user %s is not exist" % (user_id))
            return pd.DataFrame(), recommendations_df

    def rmse(self, true, pred):
        # this will be used towards the end
        x = true - pred
        return sum([xi * xi for xi in x]) / len(x)

In [104]:
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import NMF
from scipy import sparse

class CustomNMF:
    def __init__(self, review_df):
        self.user_active_df = self.get_active_user(review_df)

    def get_active_user(self, review_df):
        n_active = 40
        user_review_df_count = review_df.groupby("user_id").size()
        user_review_active_values = user_review_df_count[user_review_df_count >= n_active].reset_index()[
            "user_id"].values
        user_active_df = review_df[review_df["user_id"].isin(user_review_active_values)]
        user_active_df = user_active_df.groupby(["user_id", "business_id"], as_index=False).mean()
        return user_active_df

    def create_utility_matrix(self, df):
        user_encoder = LabelEncoder()
        business_encoder = LabelEncoder()

        user_ids = pd.DataFrame(columns=["user_id_matrix"])
        business_ids = pd.DataFrame(columns=["business_id_matrix"])
        df["user_id_matrix"] = user_encoder.fit_transform(df['user_id'])
        df["business_id_matrix"] = business_encoder.fit_transform(df['business_id'])

        user_ids["user_id_matrix"] = df["user_id_matrix"].unique()
        business_ids["business_id_matrix"] = df["business_id_matrix"].unique()
        user_ids["user_id"] = user_encoder.inverse_transform(user_ids["user_id_matrix"])
        business_ids["business_id"] = business_encoder.inverse_transform(business_ids["business_id_matrix"])

        return df, user_ids, business_ids

    def create_sparse_matrix(self, user_active_matrix, user_ids, business_ids):
        highest_user_id = len(user_ids['user_id'].unique())
        highest_business_id = len(business_ids['business_id'].unique())
        shape_matrix = (highest_user_id, highest_business_id)
        ratings_mat = sparse.lil_matrix(shape_matrix)
        for i, row in user_active_matrix.iterrows():
            ratings_mat[row["user_id_matrix"], row["business_id_matrix"]] = row["stars"]
        return ratings_mat

    def nmf_pred(self, ratings_mat, n_factor=20): 
        nmf = NMF(n_components=n_factor)
        nmf.fit(ratings_mat)
        #Features
        W = nmf.transform(ratings_mat)
        #Features Weight
        H = nmf.components_
        #Reconstructed matrix
        ratings_mat_fitted = W.dot(H)
        #Error
        self.error = nmf.reconstruction_err_

        return ratings_mat_fitted

    def fit(self, n_factor=50):
        logger.info("Start building matrix")
        start = time.time()
        self.cf_preds_df = None

        user_active_matrix, self.user_ids, self.business_ids = self.create_utility_matrix(self.user_active_df)
        rating_mat = self.create_sparse_matrix(user_active_matrix, self.user_ids, self.business_ids)
        self.cf_preds_df = self.nmf_pred(ratings_mat=rating_mat, n_factor=n_factor)

        logger.info("Matrix built in %s seconds." % (time.time() - start))

    def transform(self, user_id, filters={}, cf_w=0.8, geo_w=0.2, topn=1000):
        logger.info("Start predicting")
        start = time.time()
        recommendations_df = pd.DataFrame()

        user_id_num = self.user_ids[self.user_ids["user_id"] == user_id].user_id_matrix.values
        if user_id_num.size > 0:
            user_id_num = user_id_num.astype(int)[0]
            pred_user = self.cf_preds_df[user_id_num, :]
            sorted_user_predictions = pd.DataFrame(pred_user.T, columns=["score"])
            sorted_user_predictions = pd.merge(sorted_user_predictions, self.business_ids, left_index=True,
                                               right_on="business_id_matrix") \
                .drop(columns=["business_id_matrix"]) \
                .sort_values(ascending=False, by="score") \
                .reset_index(drop=True)

            items_to_ignore = self.user_active_df[self.user_active_df["user_id"] == user_id]
            items_to_ignore = items_to_ignore["business_id"].unique()

            # Recommend the highest predicted rating movies that the user hasn't seen yet.
            recommendations_df = sorted_user_predictions[~sorted_user_predictions['business_id'].isin(items_to_ignore)] \
                .sort_values('score', ascending=False)

            if "nearby" in filters and not filters["nearby"].empty:
                nearby_df = filters["nearby"]
                recommendations_df = pd.merge(left=recommendations_df, right=nearby_df, how="inner", on="business_id")
                if not recommendations_df.empty:
                    recommendations_df = recommendations_df.rename(columns={"score": "cf_score"})
                    recommendations_df["score"] = cf_w * recommendations_df["cf_score"] + geo_w * recommendations_df[
                        "geo_score"]
                    recommendations_df = recommendations_df.sort_values(by="score", ascending=False)

            recommendations_df = recommendations_df.head(topn)

            logger.info("Prediction done in %s seconds." % (time.time() - start))
            return items_to_ignore, recommendations_df
        else:
            logger.info("This user %s is not exist" % (user_id))
            return np.array([]), recommendations_df

    def get_error(self):
        return self.error

## 5.2 Memory Based Friendlist
<a id='cf_memory'></a>

In [105]:
class FriendSim:

    def __init__(self, ye):
        self.ye = ye
        
    def find_n_neighbours(self, df, nrows, n):
        if n>nrows:
            n = nrows
        order = np.argsort(df.values, axis=1)[:, :n]
        df = df.apply(lambda x: pd.Series(x.sort_values(ascending=False)
               .iloc[:n].index, 
              index=['top{}'.format(i) for i in range(1, n+1)]), axis=1)
        return df

    def standardize(self, row):
        new_row = (row - row.mean()) / (row.max() - row.min())
        return new_row

    def get_active_user(self, review_df, n_active=10):
        user_review_df_count = review_df.groupby("user_id").size()
        user_review_active_values = user_review_df_count[user_review_df_count >= n_active].reset_index()[
            "user_id"].values
        user_active_df = review_df[review_df["user_id"].isin(user_review_active_values)]
        user_active_df = user_active_df.groupby(["user_id", "business_id"], as_index=False).mean()
        return user_active_df

    def recommend(self, user_id,topn=50):
        
        top_recommendation = pd.DataFrame()
        
        #Get friend of the user in the parameter
        mustArray = [
            self.ye.bodySingleMatch("user_id", user_id)
        ]
        include_list = ["name", "user_id", "friends", "yelping_since", "review_count",
                        "average_stars", "elite"]
        user_fisrt_chunk = self.ye.getComplexeQuery(index="yelp-user*",
                                                 mustArray=mustArray, filterArray=[],
                                                 include = include_list, size=2000)

        print("Total reviews retrieved: %d" % (user_fisrt_chunk["hits"]["total"]["value"]))
        user_df = self.ye.getResultScrolling(user_fisrt_chunk)
        friends_str = user_df["friends"].values[0]

        if friends_str:
            friends_list = np.unique(friends_str.split(", "))
            print("This user has %d friends" % (len(friends_list)))
            friends_list = np.append(friends_list, np.array(user_id))
            mustArray = [
                self.ye.bodyMultivalueTerm("user_id.keyword", friends_list.tolist()),
                self.ye.bodyRange("date", gteValue="2016-01-01", lteValue="2018-12-31")

            ]
            review_fisrt_chunk = self.ye.getComplexeQuery(index="yelp-review*",
                                                     mustArray=mustArray, filterArray=[],
                                                     include=["user_id", "business_id", "stars"], size=2000)

            review_df = self.ye.getResultScrolling(review_fisrt_chunk)
            user_active_df = self.get_active_user(review_df=review_df)
            
            #check if user is still in user_active_df (cold start)
            if ( any(user_active_df.user_id== user_id) == False):
                return pd.DataFrame()
            
            #retrieve all businesses seen by user to not recommend them later
            businesses_seen_by_user = user_active_df[user_active_df.user_id == user_id].business_id.unique()
            
            
            #************ old method *****************
            Mean = user_active_df.groupby(by="user_id",as_index=False)['stars'].mean()
            Mean.rename(columns={'stars': 'mean'}, inplace=True)
            Rating_avg = pd.merge(user_active_df,Mean,on='user_id')
            Rating_avg['adg_rating']=Rating_avg['stars']-Rating_avg['mean']
            
            #Building matrix by pivot table
            rating_mat = pd.pivot_table(
                data=Rating_avg,
                index="user_id",
                columns="business_id",
                values="stars",
                fill_value=0)
            
            rating_standardized = rating_mat.fillna(rating_mat.mean(axis=0))

#             #********** new method ****************
#             #Building matrix by pivot table
#             rating_mat = pd.pivot_table(
#                 data=user_active_df,
#                 index="user_id",
#                 columns="business_id",
#                 values="stars",
#                 fill_value=0)
#             rating_standardized = rating_mat.apply(self.standardize)
            
            nrows = rating_standardized.shape[0]
            item_similarity = cosine_similarity(rating_standardized)
            np.fill_diagonal(item_similarity, 0)
            item_similarity_df = pd.DataFrame(item_similarity, index=rating_mat.index, columns=rating_mat.index)
            sim_user_30_m = self.find_n_neighbours(item_similarity_df, nrows,30)
            
            
            ###computing scores for each unseen business by user 
            business_user = Rating_avg.groupby(by = 'user_id')['business_id'].apply(lambda x:','.join(x))
            a = sim_user_30_m[sim_user_30_m.index==user_id].values
            b = a.squeeze().tolist()
            d = business_user[business_user.index.isin(b)]
            l = ','.join(d.values)
            businesses_seen_by_similar_users = l.split(',')
            businesses_under_consideration = list(set(businesses_seen_by_similar_users)-set(list(map(str, businesses_seen_by_user))))
            #businesses_under_consideration = list(map(int, businesses_under_consideration))
            score = []
            for item in businesses_under_consideration:
                c = rating_standardized.loc[:,item]
                d = c[c.index.isin(b)]
                f = d[d.notnull()]
                avg_user = Mean.loc[Mean['user_id'] == user_id,'mean'].values[0]
                index = f.index.values.squeeze().tolist()
                corr = item_similarity_df.loc[user_id,index]
                fin = pd.concat([f, corr], axis=1)
                fin.columns = ['adg_score','correlation']
                fin['score']=fin.apply(lambda x:x['adg_score'] * x['correlation'],axis=1)
                nume = fin['score'].sum()
                deno = fin['correlation'].sum()
                final_score = avg_user + (nume/deno)
                score.append(final_score)
            data = pd.DataFrame({'business_id':businesses_under_consideration,'score':score})
            top_recommendation = data.sort_values(by='score',ascending=False)
            ###noralize score between 0 and 1
            top_recommendation = top_recommendation.apply(lambda x: (x-min(x))/(max(x)-min(x)) if x.name == 'score' else x)
        return top_recommendation.head(topn)

In [106]:
#fs = FriendSim(ye=ye)

In [107]:
#test = fs.recommend(user_id="Op-cYCn71IJiIfx1IVjAwA",topn=20)

## 5.3 Class Engine CF

In [25]:
class EngineCF:

    def __init__(self, business_df, review_df, ye):
        logger.info("Initilizing Collaborative Filtering Engine")
        self.business_df = business_df
        self.ye = ye
        self.model = CustomSVD(review_df=review_df)
        #self.memory = FriendSim(ye=ye)

    def train(self, n_factor=20):
        self.model.fit(n_factor=n_factor)

    def predict(self, user_id, filters={}, model_w=0.4, memory_w=0.6, topn=20):
        already_reviewed, pred_model = self.model.transform(user_id=user_id, filters=filters, topn=1000)
        #pred_memory = self.memory.recommend(user_id=user_id, topn=1000)

        if not pred_model.empty:
            return already_reviewed, pred_model.head(topn)
        else:
            return pd.DataFrame(), pd.DataFrame()

    def save_model(self):
        with open(api_path + "models/customsvd.model", "wb") as f:
            pickle.dump(self.model, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load_model(self):
        try:
            with open(api_path + "models/customsvd.model", "rb") as f:
                self.model = pickle.load(f)
                logger.info("Custom SVD model loaded")
        except (FileNotFoundError, IOError):
            logger.info("File not found")

## 5.4 Test

### 5.4.1 Test SVD

In [26]:
filters={}
filters["nearby"] = business_nearby

In [27]:
#train, test = train_test_split(user_active_df, test_size = 0.2)

In [28]:
engine_cf = EngineCF(business_df=business_df, review_df=review_df, ye=ye)

INFO:__main__:Initilizing Collaborative Filtering Engine


In [29]:
engine_cf.train()

INFO:__main__:Start building matrix
INFO:__main__:Matrix built in 155.28265261650085 seconds.


**Tested user**

-oD6KmwtQ-1CBCf4CoXCyQ

AyjqBovADgbskmLrIBOMlQ

In [30]:
friends, pred = engine_cf.predict(user_id="AyjqBovADgbskmLrIBOMlQ", filters=filters)

INFO:__main__:Start predicting
INFO:__main__:Prediction done in 0.30486178398132324 seconds.


In [31]:
friends

Unnamed: 0,score,user_id
0,1.0,AyjqBovADgbskmLrIBOMlQ
1,0.966345,Ka2VMEwLf_uZuRAxGSRZlg
2,0.948492,uAJNBvcJA2kIsGx_sSofEw
3,0.92594,byV70bKT87SHgN0uRtow6w
4,0.924949,UOTY05n5PmV7aARc7_UHIw
5,0.922442,22guc0RON4M9xJa0u0wdCQ
6,0.919292,rYPjqZO89ABei1fsZBSUbA
7,0.917314,Qa67LO6dGw1mKTy7R3YgwQ
8,0.915702,-VNv58eLhbQpz787rcD8VA
9,0.912978,-BevLRnJhh9JeIT2g5I3mQ


In [32]:
pred

Unnamed: 0,cf_score,business_id,distance,geo_score,score
0,2.079236,2iTsRqUsPGRH1li1WVRvKQ,0.7104,0.861674,1.835724
1,1.421459,0EgYXYjt2XJL4hlsKnzrcw,1.4074,0.721661,1.281499
2,1.373027,CoyeXg8FBsS_d20QzNIy-A,1.1497,0.773428,1.253107
3,1.352958,cyzOijFJ82cae6qH6L8vuw,2.8283,0.436231,1.169613
5,1.186255,v_yCY6NCwxCPIsNZR80UfQ,0.727,0.85834,1.120672
8,1.081263,zpoZ6WyQUYff18-z4ZU1mA,0.8193,0.839798,1.03297
4,1.240775,SAIrNOB4PtDA4gziNCucwg,4.667,0.066873,1.005994
7,1.120873,ahSFUPojs9X3-1jP-QPb-w,2.7635,0.449248,0.986548
6,1.133084,awI4hHMfa7H0Xf0-ChU5hg,3.5669,0.287861,0.964039
9,1.055605,KskYqH1Bi7Z_61pH6Om8pg,2.6899,0.464032,0.937291


In [93]:
already_reviewed, pred_model = engine_cf.model.transform(user_id="--2vR0DIsmQ6WfcSzKWigw")

INFO:__main__:Start predicting
INFO:__main__:This user --2vR0DIsmQ6WfcSzKWigw is not exist


In [83]:
pred_memory =  engine_cf.memory.recommend(user_id="--2vR0DIsmQ6WfcSzKWigw", topn=100)

INFO:elasticsearch:POST http://47.91.72.40:9200/yelp-user*/_search?_source_excludes=&_source_includes=name%2Cuser_id%2Cfriends%2Cyelping_since%2Creview_count%2Caverage_stars%2Celite&scroll=1m&size=2000 [status:200 request:0.015s]
INFO:elasticsearch:POST http://47.91.72.40:9200/_search/scroll?scroll=1m [status:200 request:0.013s]


Total reviews retrieved: 1


In [87]:
pred_memory = pred_memory.rename(columns={'score': 'score_memory'})

In [85]:
#engine_cf.predict(user_id="--2vR0DIsmQ6WfcSzKWigw")

In [None]:
# TODO Test --2vR0DIsmQ6WfcSzKWigw

In [27]:
engine_cf.model.cf_preds_df.max()

12.303789897181307

In [30]:
engine_cf.predict(user_id="AyjqBovADgbskmLrIBOMlQ", topn=50)

INFO:__main__:Start predicting
INFO:__main__:Prediction done in 0.15869402885437012 seconds.


(array(['-1m9o3vGRA8IBPNvNqKLmA', '-9dmhyBvepc08KPEHlEM0w',
        '-OMfcRwzGjiD1lpMIknQ1Q', '-VYYKJBVTX1BtaJU4eFJ8A',
        '-hu5mnjIjnTWyCQIGDL_Iw', '-qYuPncpK8elgD-6Qx3v7A',
        '-wCtRhzWJ40Z4F8mmg7kWg', '051OPr8FuLiRa6msetlVoQ',
        '065c76tt1dXBNmoGBymUgQ', '0CNebFRZI-YGffclhN0dXw',
        '0GP25wmLzoHxf_Et-0tcHw', '0_VT3sTwi7gorIlU36ASmg',
        '1APtA8NKOIVb4ECCg4tkpg', '1DHPkjDwHBguDZo_HR_ymw',
        '1LSdIQQVvzzyDyiYPxztkg', '1ZJz840L496LAmQdeDv0WA',
        '1bNuhgErDlA0qgOI6gWdvw', '1gU1ZNbcvk0puuUQjbl23w',
        '2g_auUWw7c3c2CUTgVqI8Q', '3GEEy7RP6e4bT4LAiWFMFQ',
        '3Jq5LfJ5fmJ5KmuA6VHmzw', '3Mx4renubPRnjHUw1n2UkA',
        '3Xm8JsJLiFdkLCp5PhX5ig', '3c6U5CdxsN_6Gpc6dqTylg',
        '3fPIRTnBhd81MixgS9YoMQ', '4k3RlMAMd46DZ_JyZU0lMg',
        '4v4-h0rAl5XZY1NCpHU30g', '5U2tTspAnSBBtaO-UDWcKg',
        '5qG4UHurI1yEozwn25WAFw', '6xY0TV43dtnGYfgwSNM-Tg',
        '75RP4HSsSJOe_e7e2e3jQQ', '7SJoMX5ti-VoEsEW6mJLRA',
        '7W-7QBBU67ax2nERXrDM1Q', '7hWNn

In [None]:
user_id = "EC5nxNCWCmjHg1F14WrlxQ"

In [None]:
to_ignore = engine_cf.already_review(user_id)

In [None]:
rec_cf = engine_cf.predict(user_id, items_to_ignore=to_ignore)

In [None]:
rec_cf = business_details(business_df, rec_cf)

In [None]:
rec_cf.head(5)

In [None]:
show_map(loc=simulation_loc, cf=rec_cf)

# 6. Hybrid

[Back to menu](#menu)

**Geolocation**

In [None]:
business_df_nearby.head(5)

**Popularity Based**

In [None]:
rec_pop[["business_id", "name", "categories", "score"]].head(5)

**Content Based**

In [None]:
rec_cb[["business_id", "name", "categories", "score"]].head(5)

**Collaborative Filtering**

In [None]:
rec_cf[["business_id", "name", "categories", "rec_score"]].head(5)

In [None]:
show_map(loc=simulation_loc, popularity=rec_pop, contentbased=rec_cb, cf=rec_cf)