### Importing the required libraries

In [1]:
import pandas as pd
import re
import string
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
import pickle

### Loading the chunk of dataset we had loaded in EDA notebook

In [2]:
data = pd.read_csv("datasets/dataset.csv", encoding = "ISO-8859–1")
data.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,-MhfebM0QIsKt87iDN-FNw,0,2015-04-15 05:21:16,0,xQY8N_XvtGbearJ5X4QryQ,2,"As someone who has worked with many museums, I...",5,OwjRMXRC0KyPrIlcjaXeFQ
1,lbrU8StCq3yDfr-QMnGrmQ,0,2013-12-07 03:16:52,1,UmFMZ8PyXZTY2QcwzsfQYA,1,I am actually horrified this place is still in...,1,nIJD_7ZXHq-FX8byPMOkMQ
2,HQl28KMwrEKHqhFrrDqVNQ,0,2015-12-05 03:18:11,0,LG2ZaYiOgpr2DK_90pYjNw,5,I love Deagan's. I do. I really do. The atmosp...,1,V34qejxNsCbcgD8C0HVk-Q
3,5JxlZaqCnk1MnbgRirs40Q,0,2011-05-27 05:30:52,0,i6g_oA9Yf9Y31qt0wibXpw,1,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",0,ofKDkJKXSKZXu5xJNGiiBQ
4,IS4cv902ykd8wj1TR0N3-A,0,2017-01-14 21:56:57,0,6TdNDKywdbjoTkizeMce8A,4,"Oh happy day, finally have a Canes near my cas...",0,UgMW8bLE0QMJDCkQ1Ax5Mg


### Data Cleaning and Imputations

### In this section we are doing following operations to clean the data and convert to a proper format
#### 1. We are converting all the characters into lower case
#### 2. We are removing the stop words beacuse stop words will not capture any contextual information from the corpus
#### 3. Replacing some short-forms to thier proper representation.
#### 4. below is the set of stop words from english language

In [3]:
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /home/ajay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def clean_reviews(text):
    text = text.translate(string.punctuation)
    text = text.lower().split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)    
    return text

### Only taking the set of attributes we are interested in.
### Latar on I will have to use sparse matrix size of 10K * 10K as per input. This will give memory error in the local machine so I am taking a set of 1K for traning and 500 for testing. 

In [3]:
test_set = data[1000:1500]
data = data[['business_id', 'user_id', 'stars', 'text']][:1000]

In [6]:
list(test_set['user_id'].unique())[:100]

['-koQoVsEGgN03TAgQd_DWA',
 'bHXujstlLp-QuNr72Meprw',
 'VQrHL8gzDlFz0jGbrXq3yw',
 'jMgQQwSLjyqSvckG_DvZIw',
 '82ksBgi3GcayHuPE87zD-w',
 '_5hoVyqm1ghfXVymvZjzKQ',
 'xUhKi3p2BRTEbTHtJz-Hhg',
 'M6-A6F0B3kM5i94Kr0XHcw',
 'YLyFQ9VVBoqqPIiUeDvVZw',
 '2NkVhK4-yLprEBeYqqn3xw',
 '4r2M6cFPugL7cJmc_6VYGA',
 'BMm9-bo1Xar-Vm8VMnQ88g',
 'jmyunODJvYT7n7LCgotAyQ',
 'SbsUVsP2gkQhJf4L2q4kjg',
 'H6j_KNWrrcRYYndQgAUizw',
 'Ibl1msXxcsHrzBp8oxTr0g',
 'Q4-IBfwUPk3uitiaho_dqw',
 'Fds6mttIFKPsSmY8xfC_XA',
 '9-SlQK2lwcXVzk3tJU4x7g',
 'M3ncFIlEfaSdSpoiMINwBA',
 'ZBlSML8YMmfcxNo-zdH1_A',
 'uBSMteq_cq9iT1MMGmXtvQ',
 'zlQy2mGCbiYzhcbgJ60rig',
 'X4cwQL_JZZnAUyCbOwz3pw',
 '9Jpq9Rtg6xx3nisN-FgGFg',
 '2xfTH1pK3gPtpEh4Gz_Y3w',
 'SehV0pAEJNVJvd5_5rtTyg',
 'EBwGWhbi49i2DBF61hVybg',
 'yOM891kAdwni5ecnGsQcWw',
 'dEDId4Rp2JpblIfYIoH00w',
 'o8gCgAuBdy7OdmU7jxjW1g',
 'DzQqwNMb1v5fZtW6urXd1w',
 'g-_TsZNEJ_hls717NuGekw',
 'M1mSvucWF3V1h0L98okb9g',
 'Dln9H9qc9EbKaC-c0FCFFA',
 'xftnxppMN_gUt5oknMu4dA',
 'xSiaSnRfXBVZstO9LBNRaA',
 

In [6]:
data['text'] = data['text'].apply(clean_reviews)

### Since we are going to implement Collaborative Filtering method, we need to split attributes into user and restaurent dataframes

In [7]:
user_df = data[['user_id','text']]
rest_df = data[['business_id', 'text']]

### From EDA notebook we came to know that there is one to many mapping with users and reviews therefore we will aggregate reviews for each users and same for restaurents 

In [8]:
user_df = user_df.groupby('user_id').agg({'text': ' '.join})
rest_df = rest_df.groupby('business_id').agg({'text': ' '.join})

In [9]:
user_df.head()

Unnamed: 0_level_0,text
user_id,Unnamed: 1_level_1
-1eORDLXTqztOsdmrEJ_Pw,great service ! ! friendly staff ! vegan selec...
-2gOxVWcnBr5DclrrsWXCA,excellent service usually peracriptions hand w...
-9vc7n5Qrc0Wp7_NGufj3w,fan ! several times never suprise rooms top no...
-Co-ReNx_lXT1xL_Rr0B2g,previous person posted really said restaurant ...
-Cwg2o01k7InVnljQmncQA,beware sushi not rice extremely dry something ...


### We need to encode the reviews into feature format so that we can go ahead with modelling. For this task we are using TF-IDF vectorizer to extract the features from the text

In [10]:
user_feature_object = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=500)
user_feature = user_feature_object.fit_transform(user_df['text'])
user_feature.shape

(994, 500)

In [11]:
rest_feature_object = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=500)
rest_feature = rest_feature_object.fit_transform(rest_df['text'])
rest_feature.shape

(810, 500)

### Since we are going to use Matrix factorization method for CF, we need to transform into matrix fromat 

In [12]:
user_vect = user_feature.toarray()
P = pd.DataFrame(user_vect, index=user_df.index, columns=user_feature_object.get_feature_names())
rest_vect = rest_feature.toarray()
Q = pd.DataFrame(rest_vect, index=rest_df.index, columns=rest_feature_object.get_feature_names())

In [13]:
P.head()

Unnamed: 0_level_0,!,+,-,1,10,2,3,30,5,:,...,working,worth,would,wrong,year,years,yelp,yes,you,yummy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1eORDLXTqztOsdmrEJ_Pw,0.554708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.094624,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-2gOxVWcnBr5DclrrsWXCA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9vc7n5Qrc0Wp7_NGufj3w,0.540844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-Co-ReNx_lXT1xL_Rr0B2g,0.0,0.196141,0.043433,0.0,0.0,0.0,0.0,0.0,0.0,0.109029,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066148,0.080636
-Cwg2o01k7InVnljQmncQA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Now moving on to the target matrix in which each cell will identify the rating for user and corresponding restaurent

In [14]:
target_matrix = pd.pivot_table(data, values='stars', index=['user_id'], columns=['business_id'])
target_matrix.shape

(994, 810)

### We will have to decompose each cell value into product of user and restaurent feature vector. So we will be dealing with cells having some value (ignoring NaNs)

In [15]:
target_matrix = target_matrix.fillna(0.0)

### Now our problem is a kind of optimization problem so we will use gradient descent to solve this convergence of optimation problem.
### As time and space permits we can increase the step size to 100+ and error rate <0.001 for better optimization. Here I am using less step size and high error rate just to check the performance.

In [19]:
def matrix_factorization_method(R, P , Q, steps, gamma, lamda):
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        error=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    error= error + pow(R.loc[i,j]-np.dot(P.loc[i],Q.loc[j]),2)+lamda*(pow(np.linalg.norm(P.loc[i]),2)+pow(np.linalg.norm(Q.loc[j]),2))
        if error<0.01:
            break
        
    return P,Q

In [20]:
%%time
P, Q = matrix_factorization_method(target_matrix, P, Q, 20, 0.01, 0.1)

CPU times: user 4min 35s, sys: 163 ms, total: 4min 35s
Wall time: 4min 35s


### Now it is time to make prediction given a user id 

In [41]:
input_user_id = 'bHXujstlLp-QuNr72Meprw'

### Cleaning the test input data to get the expected format as an input 

In [42]:
inp_text = pd.DataFrame([test_set[test_set['user_id']==input_user_id]['text'].values[0]], columns = ['text'])
inp_text['text'] = inp_text['text'].apply(clean_reviews)
test_feature = user_feature_object.transform(inp_text['text'])
test_P = pd.DataFrame(test_feature.toarray(), index=inp_text.index, 
                         columns=user_feature_object.get_feature_names())

In [43]:
test_P

Unnamed: 0,!,+,-,1,10,2,3,30,5,:,...,working,worth,would,wrong,year,years,yelp,yes,you,yummy
0,0.144335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Making the prediction based on updated Q matrix

In [44]:
predict=pd.DataFrame(np.dot(test_P.loc[0],Q.T),index=Q.index,columns=['Ratings'])
recomd =pd.DataFrame.sort_values(predict,['Ratings'],ascending=[0])[:5]
recomd

Unnamed: 0_level_0,Ratings
business_id,Unnamed: 1_level_1
MjOk1rCc0puNfBYWdm2Ocw,0.80928
GX9W1U-wsZPqWTgs1_-wRA,0.57296
wAUUgvSJqKdx6x7Lzy79Og,0.56848
y0pTeRLBftD__abekOFj6g,0.557631
VNpQlfOaX4_vEaYcPC1fJg,0.542337


### Saving the updated Q matrix for future predictions on test ids

In [34]:
Q.to_pickle("models/Q_matrix.pkl")

In [46]:
import pickle
filehandler = open("models/tf-idf.obj","wb")
pickle.dump(user_feature_object,filehandler)

In [47]:
filehandler = open("models/tf-idf.obj","r")
user_ft = pickle.load(filehandler)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte