In [3]:
import findspark
findspark.init()

In [4]:
from __future__ import print_function

import sys

import numpy as np
from numpy.random import rand
from numpy import matrix
from pyspark.sql import SparkSession

In [5]:
import pandas as pd

In [None]:
def loadDatasets():
    """Read the movie and rating CSV files from ``../data``.

    Prints a progress message before and after reading.

    Returns:
        tuple: ``(movies, ratings)`` as two pandas DataFrames, read from
        ``movie.csv`` and ``rating.csv`` respectively.
    """
    print('Loading datasets')
    movies_df, ratings_df = (
        pd.read_csv(csv_path)
        for csv_path in ('../data/movie.csv', '../data/rating.csv')
    )
    print('Datasets loaded successfully')
    return movies_df, ratings_df

In [None]:
def preprocessData(user_movie_rating, factor=0.25):
    """Keep the leading fraction of the rating rows and clean them.

    Args:
        user_movie_rating: DataFrame of ratings that has a ``timestamp`` column.
        factor: fraction of rows (taken from the top) to keep.

    Returns:
        DataFrame: the first ``int(len * factor)`` rows, with rows containing
        missing values removed and the ``timestamp`` column dropped.
    """
    row_limit = int(user_movie_rating.shape[0] * factor)
    leading_rows = user_movie_rating.iloc[:row_limit, :]
    return leading_rows.dropna().drop('timestamp', axis=1)

In [None]:
def getCharateristicMatrix(user_movie_rating):
    """Pivot long-format ratings into a dense user x movie matrix.

    Args:
        user_movie_rating: DataFrame with ``userId``, ``movieId`` and
            ``rating`` columns, one row per (user, movie) pair.

    Returns:
        tuple: ``(characteristic_matrix, mappings)`` where
        ``characteristic_matrix`` is a 2-D array with one row per user and one
        column per movie (missing ratings filled with 0), and ``mappings`` is a
        dict with four lookup tables:

        * ``user_mapping``    -- matrix row index    -> user id
        * ``movie_mapping``   -- matrix column index -> movie id
        * ``r_user_mapping``  -- user id  -> matrix row index
        * ``r_movie_mapping`` -- movie id -> matrix column index
    """
    # Keyword arguments work on every pandas version; the positional
    # index/columns form was removed in pandas 2.0.
    characteristic_df = user_movie_rating.pivot(
        index='userId', columns='movieId', values='rating'
    ).fillna(0)

    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    characteristic_matrix = characteristic_df.values

    # tolist() yields plain Python scalars, in matrix row/column order.
    user_ids = characteristic_df.index.tolist()
    movie_ids = characteristic_df.columns.tolist()

    user_mapping = dict(enumerate(user_ids))
    movie_mapping = dict(enumerate(movie_ids))
    mappings = {
        'user_mapping': user_mapping,
        'movie_mapping': movie_mapping,
        'r_user_mapping': {user_id: row for row, user_id in user_mapping.items()},
        'r_movie_mapping': {movie_id: col for col, movie_id in movie_mapping.items()},
    }
    return characteristic_matrix, mappings

In [None]:
movies, ratings = loadDatasets()

In [None]:
ratings_subset = ratings.sample(frac=0.0005)

In [None]:
ratings_subset = preprocessData(ratings_subset, factor=1)

In [None]:
n_characteristic_matrix, mappings = getCharateristicMatrix(ratings_subset)
# train_matrix, test_matrix = self.train_test_split(characteristic_matrix)

In [None]:
spark = SparkSession\
        .builder\
        .appName("MovieRecommender")\
        .getOrCreate()


sc = spark.sparkContext

In [None]:
sc._conf.getAll()

In [None]:
# Transpose to movies x users and wrap as np.matrix so `*` means matrix product.
# np.asmatrix is the same function as np.mat, which was removed in NumPy 2.0.
n_characteristic_matrix = np.asmatrix(n_characteristic_matrix.T)

In [None]:
n_characteristic_matrix.shape

In [None]:
# After the single transpose above, rows of the rating matrix are movies and
# columns are users.
M = n_characteristic_matrix.shape[0]
U = n_characteristic_matrix.shape[1]
# Width of the factor matrices `ms` (M x F) and `us` (U x F) built below.
F = 20

# Number of passes of the training loop, and the partition count passed to
# sc.parallelize in that loop.
ITERATIONS = 5
partitions = 2

# Chunk sizes used when R is sliced into broadcast pieces: rows per movie
# chunk and columns per user chunk.
m_offset = 1500
u_offset = 10000

R = n_characteristic_matrix

# Random starting factors, wrapped as np.matrix.
ms = matrix(rand(M, F))
us = matrix(rand(U, F))


In [None]:
# Mean of all ratings in the subset; not referenced again in the visible notebook.
global_bias = ratings_subset['rating'].mean()
# Biases start as 1-D zero vectors of shape (U,) and (M,), not column vectors.
user_bias = np.zeros(U)
movie_bias = np.zeros(M)

In [8]:
# NOTE(review): scratch cell (the execution counts around here are out of order).
# On a top-to-bottom run it rebinds `movies` (the DataFrame returned by
# loadDatasets) to an int and changes `F`; the scratch cells that follow also
# rebind `R`, `ms`, `us` and `user_bias`. Presumably unintended -- confirm, and
# consider toy-specific names or deleting these cells.
movies = 10
users = 30
F = 3

In [7]:
R = matrix(rand(movies, users))

In [10]:
ms = matrix(rand(movies, F))

In [11]:
us = matrix(rand(users, F))

In [14]:
user_bias = np.zeros(users)

In [15]:
print(user_bias.shape)
user_bias

(30,)


array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.])

In [12]:
Xty = us.T * R[1, :].T

In [18]:
R[1, :].T - user_bias.reshape((user_bias.shape[0], 1)).T

matrix([[ 0.84360101,  0.84360101,  0.84360101,  0.84360101,  0.84360101,
          0.84360101,  0.84360101,  0.84360101,  0.84360101,  0.84360101,
          0.84360101,  0.84360101,  0.84360101,  0.84360101,  0.84360101,
          0.84360101,  0.84360101,  0.84360101,  0.84360101,  0.84360101,
          0.84360101,  0.84360101,  0.84360101,  0.84360101,  0.84360101,
          0.84360101,  0.84360101,  0.84360101,  0.84360101,  0.84360101],
        [ 0.564675  ,  0.564675  ,  0.564675  ,  0.564675  ,  0.564675  ,
          0.564675  ,  0.564675  ,  0.564675  ,  0.564675  ,  0.564675  ,
          0.564675  ,  0.564675  ,  0.564675  ,  0.564675  ,  0.564675  ,
          0.564675  ,  0.564675  ,  0.564675  ,  0.564675  ,  0.564675  ,
          0.564675  ,  0.564675  ,  0.564675  ,  0.564675  ,  0.564675  ,
          0.564675  ,  0.564675  ,  0.564675  ,  0.564675  ,  0.564675  ],
        [ 0.29085432,  0.29085432,  0.29085432,  0.29085432,  0.29085432,
          0.29085432,  0.29085432,  

In [None]:
# NOTE(review): scratch line -- `user_mat`, `movie_rows` and `i` are not defined
# at notebook level in the visible source, so this cell presumably fails when run.
Xty = user_mat.T * (movie_rows[i%m_offset, :].T - user_bias)

In [None]:
# Slice R into fixed-size pieces and broadcast each piece separately:
# b_movies[k] holds rows    [k * m_offset, (k + 1) * m_offset) of R,
# b_users[k]  holds columns [k * u_offset, (k + 1) * u_offset) of R.
# range(0, n, step) yields exactly the chunk starts that are < n, replacing the
# hand-maintained `index` counter.
b_movies = [
    sc.broadcast(R[row_start:row_start + m_offset])
    for row_start in range(0, M, m_offset)
]

b_users = [
    sc.broadcast(R[:, col_start:col_start + u_offset])
    for col_start in range(0, U, u_offset)
]

In [None]:
msb = sc.broadcast(ms)
usb = sc.broadcast(us)
b_user_bias = sc.broadcast(user_bias)
b_movie_bias = sc.broadcast(movie_bias)

In [None]:
# Regularisation strength added to the diagonal of the normal-equation matrices
# in the update* functions below.
LAMBDA = 0.01   # regularization
# NOTE(review): on a top-to-bottom run this seed is set after `ms`/`us` were
# drawn with rand() in an earlier cell, so it does not make that initialisation
# reproducible -- confirm whether it should move before those calls.
np.random.seed(42)


def rmse(R, ms, us):
    """Root-mean-square error between ``R`` and its reconstruction ``ms * us.T``.

    The error is averaged over every cell of ``R`` (including cells that hold
    the fill value 0), not only over observed ratings.

    Args:
        R: rating matrix (np.matrix), movies x users.
        ms: movie factor matrix (np.matrix), movies x factors.
        us: user factor matrix (np.matrix), users x factors.

    Returns:
        float: the RMSE.
    """
    # The original stored this as `diff_count` but then read `diff`, which
    # raised NameError on every call.
    diff = R - ms * us.T
    # diff.size == rows * cols, so the function no longer relies on globals M and U.
    return np.sqrt(np.sum(np.power(diff, 2)) / diff.size)

def updateMovie(i, mat, movie_rows, m_offset):
    """Solve the regularised least-squares system for one row of ``movie_rows``.

    Args:
        i: global row index; the row used is ``i % m_offset`` within ``movie_rows``.
        mat: factor matrix (np.matrix) whose row count matches the length of
            the selected rating row.
        movie_rows: chunk of the rating matrix (np.matrix).
        m_offset: number of rows per chunk.

    Returns:
        np.matrix: column vector solving
        ``(mat.T * mat + LAMBDA * n_rows * I) w = mat.T * row.T``.
    """
    n_rows, n_factors = mat.shape
    target = movie_rows[i % m_offset, :].T

    gram = mat.T * mat
    rhs = mat.T * target

    # Ridge term scaled by the number of rows of `mat`.
    ridge = LAMBDA * n_rows
    for d in range(n_factors):
        gram[d, d] += ridge

    return np.linalg.solve(gram, rhs)



def updateUser(i, mat, user_cols, u_offset):
    """Solve the regularised least-squares system for one row of ``user_cols``.

    Args:
        i: global index; the row used is ``i % u_offset`` within ``user_cols``.
        mat: factor matrix (np.matrix) whose row count matches the length of
            the selected rating row.
        user_cols: chunk of ratings (np.matrix), indexed by row.
        u_offset: number of entries per chunk.

    Returns:
        np.matrix: column vector solving
        ``(mat.T * mat + LAMBDA * n_rows * I) w = mat.T * row.T``.
    """
    n_rows, n_factors = mat.shape
    target = user_cols[i % u_offset, :].T

    gram = mat.T * mat
    rhs = mat.T * target

    # Ridge term scaled by the number of rows of `mat`.
    ridge = LAMBDA * n_rows
    for d in range(n_factors):
        gram[d, d] += ridge

    return np.linalg.solve(gram, rhs)


def updateMovie2(i, user_mat, movie_rows, m_offset, user_bias):
    """Solve for one movie's bias and latent factors with user biases removed.

    A column of ones is prepended to ``user_mat`` so the first solved
    coefficient acts as the movie's bias and the rest are its latent factors.

    Args:
        i: global movie index; the row used is ``i % m_offset`` within ``movie_rows``.
        user_mat: user factor matrix (np.matrix), users x factors.
        movie_rows: chunk of the rating matrix (np.matrix), one row per movie.
        m_offset: number of movie rows per chunk.
        user_bias: one bias per user, as a 1-D array or a column vector.

    Returns:
        np.matrix: column vector of shape ``(factors + 1, 1)``; entry 0 is the
        bias term, the remaining entries are the latent factors.
    """
    user_count = user_mat.shape[0]

    # Design matrix [1 | user factors].
    design = np.asmatrix(np.hstack((np.ones((user_count, 1)), user_mat)))
    latent_count = design.shape[1]

    # Force the bias into a (user_count, 1) column. A 1-D bias (as created with
    # np.zeros(U)) would otherwise broadcast the (U, 1) rating column into a
    # (U, U) matrix and the solve would return U columns instead of one.
    bias_column = np.asarray(user_bias).reshape(user_count, 1)
    residual = movie_rows[i % m_offset, :].T - bias_column

    XtX = design.T * design
    Xty = design.T * residual

    # Ridge regularisation on every coefficient, including the bias term.
    XtX += LAMBDA * np.eye(latent_count)

    return np.linalg.solve(XtX, Xty)



def updateUser2(i, movie_mat, user_cols, u_offset, movie_bias):
    """Solve for one user's bias and latent factors with movie biases removed.

    A column of ones is prepended to ``movie_mat`` so the first solved
    coefficient acts as the user's bias and the rest are its latent factors.

    Args:
        i: global user index; the row used is ``i % u_offset`` within ``user_cols``.
        movie_mat: movie factor matrix (np.matrix), movies x factors.
        user_cols: chunk of ratings (np.matrix) with one row per user.
        u_offset: number of users per chunk.
        movie_bias: one bias per movie, as a 1-D array or a column vector.

    Returns:
        np.matrix: column vector of shape ``(factors + 1, 1)``; entry 0 is the
        bias term, the remaining entries are the latent factors.
    """
    movie_count = movie_mat.shape[0]

    # The original sized this column with `user_count`, a name that is not
    # defined in this function, so every call raised NameError.
    design = np.asmatrix(np.hstack((np.ones((movie_count, 1)), movie_mat)))
    latent_count = design.shape[1]

    # Force the bias into a (movie_count, 1) column so a 1-D bias cannot
    # broadcast the rating column into a square matrix.
    bias_column = np.asarray(movie_bias).reshape(movie_count, 1)
    residual = user_cols[i % u_offset, :].T - bias_column

    XtX = design.T * design
    Xty = design.T * residual

    # Ridge regularisation on every coefficient, including the bias term.
    XtX += LAMBDA * np.eye(latent_count)

    return np.linalg.solve(XtX, Xty)


In [None]:
# Alternating least squares: each pass re-solves every movie against the current
# user factors, then every user against the new movie factors, re-broadcasting
# the factors and biases after each half-step.
for i in range(ITERATIONS):
    # `x // m_offset` selects the broadcast chunk holding movie x. Floor division
    # keeps the list index an int on Python 3 (plain `/` yields a float there and
    # list indexing raises TypeError); on Python 2 the result is unchanged.
    ms = sc.parallelize(range(M), partitions) \
           .map(lambda x: updateMovie2(x, usb.value, b_movies[x // m_offset].value, m_offset, b_user_bias.value)) \
           .collect()
    # Each collected result is a column vector; stack them into (M, F + 1).
    ms = matrix(np.array(ms)[:, :, 0])

    # Column 0 is the bias term, the remaining columns are the latent factors.
    movie_bias = ms[:, 0]
    ms = ms[:, 1:]

    msb = sc.broadcast(ms)
    b_movie_bias = sc.broadcast(movie_bias)

    print('Done with movies')

    # User chunks are column slices of R, so transpose to get one row per user.
    us = sc.parallelize(range(U), partitions) \
           .map(lambda x: updateUser2(x, msb.value, b_users[x // u_offset].value.T, u_offset, b_movie_bias.value)) \
           .collect()
    us = matrix(np.array(us)[:, :, 0])

    user_bias = us[:, 0]
    us = us[:, 1:]

    usb = sc.broadcast(us)
    b_user_bias = sc.broadcast(user_bias)

    error = rmse(R, ms, us)
    print("Iteration %d:" % i)
    print("\nRMSE: %5.4f\n" % error)

In [None]:
import pickle

In [None]:
with open('ms.bin', mode='wb') as model_binary:
    pickle.dump(ms, model_binary)

In [None]:
with open('us.bin', mode='wb') as model_binary:
    pickle.dump(us, model_binary)

In [None]:
with open('mappings.bin', mode='wb') as model_binary:
    pickle.dump(mappings, model_binary)

In [None]:
class BestModel(object):
    """Plain container bundling a model with the data it was built from.

    Attributes set by the constructor: ``movies``, ``model``, ``ratings`` and
    ``mappings``, each stored exactly as passed in.
    """

    def __init__(self, movies, model, ratings, mappings):
        self.movies, self.model = movies, model
        self.ratings, self.mappings = ratings, mappings

In [None]:
from os import path
import os

In [None]:
path.dirname( path.dirname( path.abspath('/Users/amit/WorkPro/warzone/movie-recommender/model') ))

In [None]:
import sys
sys.path.append('/Users/amit/WorkPro/warzone/movie-recommender/')

In [None]:
from model import BestModel

In [None]:
class AlternatingLeastSquare(object):
    """Holds user and movie factor matrices and scores pairs by dot product."""

    USERS_MATRIX = 'users_matrix'
    MOVIES_MATRIX = 'movies_matrix'

    def __init__(self, user_matrix, movies_matrix):
        """Store both factor matrices; also reseeds NumPy's global RNG with 0."""
        np.random.seed(0)
        self.users_matrix, self.movies_matrix = user_matrix, movies_matrix

    def predict(self, user, movie):
        """Return the dot product of row ``user`` and row ``movie`` of the factors."""
        user_factors = self.users_matrix[user, :]
        movie_factors = self.movies_matrix[movie, :]
        return user_factors.dot(movie_factors.T)

    def predict_all(self):
        """Return the full users x movies score matrix in a single product."""
        return self.users_matrix.dot(self.movies_matrix.T)

In [None]:
from model import AlternatingLeastSquare

In [None]:
als = AlternatingLeastSquare(np.array([[]]))

In [None]:
als.users_matrix = np.array(us)
als.movies_matrix = np.array(ms)

In [None]:
als = AlternatingLeastSquare(np.array(us), np.array(ms))

In [None]:
best_model = BestModel(movies, als, ratings_subset, mappings)

In [None]:
with open('../data/best_model.bin', mode='wb') as model_binary:
    pickle.dump(best_model, model_binary)

In [None]:
movies.head()

In [None]:
cat = 'Animation'

In [None]:
movies[movies['genres'].str.contains(cat)][::-1][:20]

In [None]:
ids = movies[movies['genres'].str.contains(cat)][::-1]['movieId']

In [None]:
ids.values

In [None]:
recom_ids = [593, 296, 527, 1196, 2959, 50, 1198, 480, 2762, 3578, 2858, 47, 858, 457, 1270, 1, 150, 4306, 4226, 6539, 1240, 1291, 59315, 1704, 32, 60069, 1580, 6377, 590, 44191]


In [None]:
# ratings[ratings['movieId'].isin(ids.values)].groupby('userId')
ratings_subset[ratings_subset['movieId'].isin(recom_ids)].groupby(by='movieId')['rating'].agg(['sum','count']).reset_index().sort_values('sum', ascending=False)


In [None]:
watched_ids = [73881, 6776, 76093, 79132, 36086, 33794, 27518, 91241, 91844, 71688, 94969, 73051, 97938, 98198, 98956, 99636, 105343, 106561, 82097, 107707, 109093, 79769, 109096, 117506, 74787, 73900, 73513, 94661, 110, 69136]


In [None]:
ratings_subset[ratings_subset['movieId'].isin(watched_ids)].groupby(by='movieId')['rating'].agg(['sum','count']).reset_index().sort_values('sum', ascending=False)


In [None]:
movies[movies['title'].str.contains('Raajneeti')]

In [None]:
hindi_users = ratings_subset[ratings_subset['movieId'] == 79769].userId.values

In [None]:
hindi_users

In [None]:
ratings_subset[ratings_subset['userId'].isin(hindi_users)].groupby(by='movieId')

In [None]:
hindi_users = ratings_subset[ratings_subset['movieId'].isin(watched_ids)].groupby(by='userId')['rating'].agg(['sum','count']).reset_index().sort_values('userId')


In [None]:
hindi_users[hindi_users['userId'] < 50000]

In [None]:
movies[movies['title'].str.contains('Deewaar')]

In [None]:
ratings[ratings['userId'] == 79570].shape

In [None]:
ans = set()
for x in movies['genres'].unique():
    x = x.split('|')
    for y in x:
        ans.add(y)

In [None]:
ans = list(ans)

In [None]:
movies.columns

In [None]:
movies[movies['movieId'] == 69069]

In [None]:
def fu(row, genre):
    """Return 1 if ``genre`` occurs in ``row['genres']``, otherwise 0."""
    return 1 if genre in row['genres'] else 0

In [None]:
movies['Comedy'] = movies.apply(lambda x: fu(x, 'Comedy'), axis=1)

In [None]:
int(True)

In [None]:
movies.head()

In [None]:
print(ans)

In [None]:
# Add one column per genre name collected in `ans`, initialised to 0.
# NOTE(review): if 'Comedy' is among those names, this resets the `Comedy`
# column computed in an earlier cell back to 0 -- confirm that is intended.
for gen in ans:
    movies[gen] = 0

In [None]:
ratings_subset[ratings_subset['userId'] == 353].sort_values('rating', ascending=False)[:30]['movieId'].values

In [None]:
# Build a real list: on Python 3 `map` returns a one-shot iterator, so it would
# display as a map object and be empty after the first '|'.join(years).
years = [str(year) for year in range(2015, 2020)]

In [None]:
years

In [None]:
movies[movies['title'].str.contains('|'.join(years)) & ~movies['title'].str.startswith('2')][:20]

In [None]:
groups = ratings_subset.groupby(by='movieId')['rating'].agg(['sum','count'])

In [None]:
groups = groups.reset_index()

In [None]:
groups['averageRating'] = groups['sum']/(1.0*groups['count'])

In [None]:
groups.head()

In [None]:
C = groups['averageRating'].mean()

In [None]:
m = groups['count'].quantile(0.90)

In [None]:
q_movies = groups.copy().loc[groups['count'] >= m]

In [None]:
q_movies.shape

In [None]:
def bayesian_average(x, m=m, C=C):
    """Blend a row's own average rating with the prior mean ``C``.

    Args:
        x: row-like object with ``count`` and ``averageRating`` entries.
        m: weight given to the prior; defaults to the notebook-level ``m`` as
            it was when this function was defined.
        C: prior mean; defaults to the notebook-level ``C`` as it was when
            this function was defined.

    Returns:
        ``count / (count + m) * averageRating + m / (m + count) * C``.
    """
    votes = x['count']
    mean_rating = x['averageRating']
    own_weight = votes / (votes + m)
    prior_weight = m / (m + votes)
    return (own_weight * mean_rating) + (prior_weight * C)

In [None]:
q_movies['score'] = q_movies.apply(bayesian_average, axis=1)

In [None]:
q_movies = q_movies.sort_values('score', ascending=False)

In [None]:
q_movies[:10]['movieId'].values

In [None]:
movies[movies['movieId'].isin(q_movies['movieId'].values)].head(10)

In [None]:
pd.merge(movies, q_movies, how='inner',left_on="movieId" , right_on='movieId').sort_values('score', ascending=False)

In [None]:
import numpy as np

class Recommender(object):
    """Ranks movies for a user from a bundled factor model.

    ``model_obj`` must expose four attributes: ``model`` (with ``users_matrix``
    and ``movies_matrix``), ``mappings`` (dict with at least ``movie_mapping``
    and ``r_user_mapping``), ``ratings`` (DataFrame with ``userId`` and
    ``movieId`` columns) and ``movies`` (DataFrame with a ``movieId`` column).
    """

    def __init__(self, model_obj):
        """Unpack the bundle; also reseeds NumPy's global RNG with 0."""
        np.random.seed(0)
        self.model_obj = model_obj
        self.model = self.model_obj.model
        self.mappings = self.model_obj.mappings
        self.ratings = self.model_obj.ratings
        self.movies = self.model_obj.movies

    def getUserIdsFromMatrixIndexes(self, matrix_indexes, user_mapping, preserve_order=True):
        """Translate matrix row indexes to user ids, keeping input order.

        ``preserve_order`` is accepted for compatibility and is not used.
        Raises KeyError for an index missing from ``user_mapping``.
        """
        return [user_mapping[index] for index in matrix_indexes]

    def getMovieIdsFromMatrixIndexes(self, matrix_indexes, preserve_order=True):
        """Translate matrix column indexes to movie ids, keeping input order.

        ``preserve_order`` is accepted for compatibility and is not used.
        Raises KeyError for an index missing from the movie mapping.
        """
        movie_mapping = self.mappings['movie_mapping']
        return [movie_mapping[index] for index in matrix_indexes]

    def getWatchedMovies(self, user_id):
        """Return the list of movie ids that ``user_id`` has rated."""
        rated_by_user = self.ratings['userId'] == user_id
        return self.ratings.loc[rated_by_user, 'movieId'].values.tolist()

    def recommendMoviesTo(self, user_id, limit=50):
        """Return up to ``limit`` movie ids, highest predicted score first.

        Raises KeyError when ``user_id`` is not in ``r_user_mapping``.
        """
        user_index = self.mappings['r_user_mapping'][user_id]
        user_vector = self.model.users_matrix[user_index, :]
        # Flatten to 1-D: with np.matrix factors the product is a 1 x N matrix,
        # where argsort()[::-1] reverses the single row instead of the ranking.
        user_ratings = np.asarray(user_vector.dot(self.model.movies_matrix.T)).ravel()
        highest_rated_movie_indexes = user_ratings.argsort()[::-1][:limit].tolist()
        return self.getMovieIdsFromMatrixIndexes(highest_rated_movie_indexes)

    def displayMovies(self, movie_ids):
        """Return the first matching ``movies`` row (as a list) for each id.

        Raises IndexError when an id has no row in ``movies``.
        """
        return [
            self.movies[self.movies['movieId'] == movie_id].values.tolist()[0]
            for movie_id in movie_ids
        ]

    def filterWatchedMovies(self, user_id, ordered_movie_ids):
        """Drop movies the user has already rated, keeping the given order."""
        rated_movies = set(self.getWatchedMovies(user_id))
        return [movie_id for movie_id in ordered_movie_ids if movie_id not in rated_movies]

# if __name__ == '__main__':
#     recommender = Recommender()
#     movie_ids = recommender.recommendMoviesTo(1, limit=300)
#     print movie_ids
#     print recommender.displayMovies(movie_ids)




In [None]:
recom = Recommender(best_model)

In [None]:
ids = recom.recommendMoviesTo(1, limit=300)

In [None]:
recom.displayMovies(ids)

In [None]:

# offset = 11652

# m_offset = 10000

# Rb1 = sc.broadcast(R[0:offset])
# Rb2 = sc.broadcast(R[offset:2*offset])

# Rbm1 = sc.broadcast(R[:, 0:m_offset])
# Rbm2 = sc.broadcast(R[:, m_offset:2*m_offset])

In [None]:
type(ms)

In [None]:
arr = np.array(ms)

In [None]:
arr.shape

In [None]:
spark = SparkSession\
        .builder\
        .appName("MovieRecommender")\
        .getOrCreate()


sc = spark.sparkContext

In [None]:
# rating.csv starts with a header line (the pandas load of the same file reads
# its columns by name), so drop it before casting fields; otherwise
# int('userId') raises ValueError as soon as an action runs.
ratings_lines = sc.textFile('../data/rating.csv')
ratings_header = ratings_lines.first()
ratings = ratings_lines.filter(lambda line: line != ratings_header) \
    .map(lambda line: line.split(',')) \
    .map(lambda x: (int(x[0]), int(x[1]), float(x[2])))
ratings_df = ratings.toDF(['userId', 'movieId', 'rating'])

In [None]:
ratings_df = ratings_df.drop('timestamp')

In [None]:
ratings_df.show()

In [None]:
d = ratings_df.groupBy("userId").pivot("movieId").avg("rating")

In [None]:
spark.conf.set('spark.sql.pivotMaxValues', u'50000')

In [None]:
d = d.fillna(0)

In [None]:
data_array =  np.array(d.collect())

In [None]:
data_array.shape

In [None]:
d.columns

In [None]:
data_array[0:3, 1:]

In [None]:
d.show(n=1)

In [None]:
type(d)

In [None]:
dir(d)

In [None]:
df = pd.read_csv('../data/subset.csv', header = None, names = ['userId', 'movieId', 'rating', 'timestamp'])

In [None]:
small_ratings_raw_data = sc.textFile('../data/ratings.csv')
small_ratings_raw_data_header = small_ratings_raw_data.take(1)[0]


In [None]:
small_ratings_data = small_ratings_raw_data.filter(lambda line: line!=small_ratings_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0],tokens[1],tokens[2])).cache()

In [None]:
small_ratings_data.take(3)

In [None]:
training_RDD, validation_RDD, test_RDD = small_ratings_data.randomSplit([6, 2, 2], seed=0)
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

In [None]:
# Train a single MLlib ALS model of rank 12 on the training split and report
# its RMSE on the validation split.
from pyspark.mllib.recommendation import ALS
import math

seed = 5
iterations = 10
regularization_parameter = 0.1
# NOTE(review): `ranks` and `tolerance` are assigned but never read in this cell,
# and only errors[0] is filled -- presumably left over from a loop over ranks.
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.02

model = ALS.train(training_RDD, 12, seed=seed, iterations=iterations,
                  lambda_=regularization_parameter)

# Key both predictions and true ratings by (user, product) so they can be joined.
predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
# After the join each value is (true_rating, predicted_rating).
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
errors[err] = error
err += 1
print ('For rank %s the RMSE is %s' % (12, error))