# Libraries

**Machine Learning**

In [1]:
import pandas as pd
import numpy as np
import nltk
import pickle

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

**Others**

In [3]:
import re
import string

**Path**

In [4]:
data_path = "/home/hongphuc95/notebookteam/dataset/"

# 1. Data PreProcessing

## 1.1 Load and clean data

In [5]:
df = pd.read_json(data_path + "cleaned/review_stars_Las Vegas.json", lines=True)

**Missing data**

In [6]:
df.isna().any()

business_id    False
name           False
state          False
city           False
categories      True
is_open        False
user_id        False
stars          False
text           False
date           False
dtype: bool

In [7]:
df = df.dropna(subset=["categories"])

In [8]:
df.shape

(285538, 10)

**Get relevant data for the recommendation**

In [9]:
relevant_feature = ["business_id", "user_id", "stars"]
recommender_df = df[relevant_feature]

In [10]:
review_count_df = recommender_df.groupby("user_id")["stars"].count()

**Filter active user whose has more than 10 reviews**

In [11]:
filter_active_user = []
for user in review_count_df[review_count_df >= 10].index:
    filter_active_user.append(user)

In [12]:
active_user_df = recommender_df[recommender_df["user_id"].isin(filter_active_user)]

**Take mean of multiple visits of one place for an user**

In [13]:
recommender_df = recommender_df.groupby(["user_id", "business_id"], as_index=False).mean()

**Create id of sparse matrix for business_id and user_id**

In [14]:
user_id = list(set(active_user_df['user_id']))
business_id = list(set(active_user_df['business_id']))

In [15]:
def create_user_id_matrix(row, user_id):
    return user_id.index(row["user_id"])

def create_business_id_matrix(row, business_id):
    return business_id.index(row["business_id"])

In [16]:
active_user_df["user_id_matrix"] = active_user_df.apply(lambda x: create_user_id_matrix(x, user_id), axis = 1)
active_user_df["business_id_matrix"] = active_user_df.apply(lambda x: create_business_id_matrix(x, business_id), axis = 1)

In [17]:
active_user_df

Unnamed: 0,business_id,user_id,stars,user_id_matrix,business_id_matrix
42,tstimHoMcYbkSC4eBA1wEg,HVaF7fObemxXN9vaC-XKYw,4,2243,80
48,tstimHoMcYbkSC4eBA1wEg,YIMeEHUYm69m-Mqf-NjPHw,5,2920,80
50,tstimHoMcYbkSC4eBA1wEg,xBMH3N0Fbua5pDtG1Y1mYQ,4,171,80
51,tstimHoMcYbkSC4eBA1wEg,W5iGpA5vgaWRyt4nY4cYBQ,3,1276,80
55,tstimHoMcYbkSC4eBA1wEg,djb61X-vkg5PF16qM_wI5Q,3,1175,80
...,...,...,...,...,...
285574,vIAEWbTJc657yN8I4z7whQ,EA-UgJmd-hQ8RBn1odmDJg,1,261,4176
285575,vIAEWbTJc657yN8I4z7whQ,93N6wLupUiu4k0bMjNBHqA,5,2815,4176
285576,vIAEWbTJc657yN8I4z7whQ,ZvneWq5RacQdzAdci1LkHQ,1,145,4176
285579,vIAEWbTJc657yN8I4z7whQ,EA-UgJmd-hQ8RBn1odmDJg,1,261,4176


## 1.2 Create sparse matrix from records

In [18]:
highest_user_id = len(active_user_df['user_id'].unique())
highest_business_id = len(active_user_df['business_id'].unique())
shape_matrix = (highest_user_id, highest_business_id)
ratings_mat = sparse.lil_matrix(shape_matrix)

In [19]:
for i, row in active_user_df.iterrows():
    ratings_mat[row["user_id_matrix"], row["business_id_matrix"]] = row["stars"]

In [20]:
ratings_mat

<3369x10395 sparse matrix of type '<class 'numpy.float64'>'
	with 63975 stored elements in List of Lists format>

# 2. Matrix Factorization

## 2.1 NMF

In [21]:
from sklearn.decomposition import NMF
class NMF_Recommender(object):

    def __init__(self, n_components):
        self.n_components = n_components

    def fit(self, ratings_mat):
        self.ratings_mat = ratings_mat
        self.n_users = ratings_mat.shape[0]
        self.n_items = ratings_mat.shape[1]
        
        nmf = NMF(n_components = 200)
        nmf.fit(ratings_mat)
        
        #Features
        self.W = nmf.transform(ratings_mat)
        
        #Features Weight
        self.H = nmf.components_
        
        #Reconstructed matrix
        self.ratings_mat_fitted = self.W.dot(self.H)
        #self.error = nmf.reconstruction_err_
        
    def get_matrix_pred(self):
        return self.ratings_mat_fitted
    
    def get_matrix_rated(self):
        return self.rating_mat

In [22]:
nmf_rec = NMF_Recommender(n_components = 200)

In [23]:
nmf_rec.fit(ratings_mat)

In [24]:
pred_mat = nmf_rec.get_matrix_pred()

In [25]:
pred_mat.shape

(3369, 10395)

**Dump matrix into a file**

In [77]:
with open('./rec_vegas_matrix','wb') as f: pickle.dump(pred_mat, f)
with open('ori_vegas_matrix')

## 2.2 SVD Truncated

In [None]:
from sklearn.decomposition import TruncatedSVD
class SVD_Recommender(object):

    def __init__(self, n_components):
        self.n_components = n_components

    def fit(self, ratings_mat):
        self.ratings_mat = ratings_mat
        self.n_users = ratings_mat.shape[0]
        self.n_items = ratings_mat.shape[1]
        
        #the size of the single value is n_components
        svd = TruncatedSVD(n_components=self.n_components, n_iter=7, random_state=1)
        svd.fit(ratings_mat)
        self.V = svd.components_
        self.U = svd.transform(ratings_mat)
        self.ratings_mat_fitted = self.U.dot(self.V)
        
    def get_matrix_pred(self):
        return self.ratings_mat_fitted
    
    def get_matrix_rated(self):
        return self.rating_mat