In [1]:
import pickle
import pandas as pd
import numpy as np

  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  import pandas._libs.tslibs.offsets as liboffsets
  from pandas._libs import algos as libalgos, ops as libops
  from pandas._libs.interval import (
  from pandas._libs import internals as libinternals
  import pandas._libs.sparse as splib
  import pandas._libs.window as _window
  from pandas._libs import (lib, reduction,
  from pandas._libs import algos as _algos, reshape as _reshape
  import pandas._libs.parsers as parsers
  from pandas._libs import algos, lib, writers as libwriters


In [2]:
class MF():

    # Initializing the user-movie rating matrix, no. of latent features, alpha and beta.
    def __init__(self, R, K, alpha, beta, iterations):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    # Initializing user-feature and movie-feature matrix 
    def train(self):
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initializing the bias terms
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # List of training samples
        self.samples = [ (i, j, self.R[i, j]) for i in range(self.num_users) for j in range(self.num_items) if self.R[i, j] > 0]

        # Stochastic gradient descent for given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mse = self.mse()
            training_process.append((i, mse))
            if (i+1) % 20 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process

    # Computing total mean squared error
    def mse(self):
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    # Stochastic gradient descent to get optimized P and Q matrix
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])

    # Ratings for user i and moive j
    def get_rating(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    # Full user-movie rating matrix
    def full_matrix(self):
        return mf.b + mf.b_u[:,np.newaxis] + mf.b_i[np.newaxis:,] + mf.P.dot(mf.Q.T)

In [3]:
df=pd.DataFrame()
df2=pd.DataFrame()
df3=pd.DataFrame()
count=0
for x in pd.read_csv('review.csv',chunksize=2000, low_memory=False):
    x.columns=['funny','user_id','review_id','text','business_id','stars','date','useful','cool']
    print(5)
    x=x.drop(['funny','useful','cool','review_id','text','date'], axis=1)
    print(9)
    if count < 8:
        print(0)
        df = pd.concat([df,x])
        print(1)
        break
    elif 8 < count < 16 :
        print(1.2)
        df2 = pd.concat([df2,x])
        print(2)
    else:
        print(3)
        df3 = pd.concat([df3,x])
        print(4)
    count+=1
print(5)
df = pd.concat([df,df2,df3])

5
9
0
1
5


In [4]:
df.head()

Unnamed: 0,user_id,business_id,stars
0,bv2nCi5Qv5vroFiqKGopiw,0W4lkclzZThpx3V65bVgig,5
1,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5
2,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5
3,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5
4,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4


In [5]:
df=df.pivot(index = 'user_id', columns ='business_id', values = 'stars')

In [6]:
R=np.array(df.fillna(0))

In [7]:
mf = MF(R, K=50, alpha=0.001, beta=0.01, iterations=100)
training_process = mf.train()
print()
print("P x Q:")
print(mf.full_matrix())
print()

Iteration: 20 ; error = 48.8265
Iteration: 40 ; error = 47.4237
Iteration: 60 ; error = 46.1455
Iteration: 80 ; error = 44.9283
Iteration: 100 ; error = 43.7438
()
P x Q:
[[3.09462631 3.32423192 3.23792985 ... 3.33165006 3.20578262 3.08249823]
 [3.31957218 3.54927212 3.45839381 ... 3.55032356 3.42648539 3.29896622]
 [3.67487406 3.90658759 3.81417358 ... 3.91392793 3.79012578 3.66543344]
 ...
 [3.76049391 3.9873428  3.90101712 ... 3.99513977 3.86699642 3.74669743]
 [3.27243911 3.50122006 3.41621352 ... 3.50750127 3.38036902 3.25727502]
 [3.67964379 3.90698558 3.82180225 ... 3.91165156 3.78645289 3.66056673]]
()


In [8]:
data_fr=pd.DataFrame(mf.full_matrix(), index=df.index, columns=df.columns)
fra=data_fr.T
fra.to_pickle("recommend_pickle.pkl")

In [9]:
a=fra.columns[400:410]
print(a)

Index([], dtype='object', name=u'user_id')


In [10]:
x=fra['NwxF1Ftc-4P5Pl_SSXThQA']
y= x.nlargest(10).index
print(y)
    

KeyError: 'NwxF1Ftc-4P5Pl_SSXThQA'