In [24]:
import pandas as pd
import numpy as np
import pickle

In [38]:
ratings = pd.read_csv("ratings12.csv", encoding='"ISO-8859-1"')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,3,1,4.0,944919407
1,6,1,5.0,858275452
2,8,1,4.0,833981871
3,10,1,4.0,943497887
4,11,1,4.5,1230858821


In [10]:
# split the ratings into training and test
ratings_training = ratings.sample(frac=0.7)
ratings_test = ratings.drop(ratings_training.index)

In [11]:
# calculate adjusted ratings based on training data
rating_mean= ratings_training.groupby(['movieId'], as_index = False, sort = False).mean().rename(columns = {'rating': 'rating_mean'})[['movieId','rating_mean']]
adjusted_ratings = pd.merge(ratings_training,rating_mean,on = 'movieId', how = 'left', sort = False)
adjusted_ratings['rating_adjusted']=adjusted_ratings['rating']-adjusted_ratings['rating_mean']
# replace 0 adjusted rating values to 1*e-8 in order to avoid 0 denominator
adjusted_ratings.loc[adjusted_ratings['rating_adjusted'] == 0, 'rating_adjusted'] = 1e-8


In [21]:
# function of building the item-to-item weight matrix
def build_w_matrix(adjusted_ratings, load_existing_w_matrix):
   # define weight matrix
   w_matrix_columns = ['movie_1', 'movie_2', 'weight']
   w_matrix=pd.DataFrame(columns=w_matrix_columns)

   # load weight matrix from pickle file
   if load_existing_w_matrix:
       with open(DEFAULT_PARTICLE_PATH, 'rb') as input:
           w_matrix = pickle.load(input)
       input.close()

   # calculate the similarity values
   else:
       distinct_movies = np.unique(adjusted_ratings['movieId'])

       i = 0
       # for each movie_1 in all movies
       for movie_1 in distinct_movies:

           if i%10==0:
               print(i , "out of ", len(distinct_movies))

           # extract all users who rated movie_1
           user_data = adjusted_ratings[adjusted_ratings['movieId'] == movie_1]
           distinct_users = np.unique(user_data['userId'])

           # record the ratings for users who rated both movie_1 and movie_2
           record_row_columns = ['userId', 'movie_1', 'movie_2', 'rating_adjusted_1', 'rating_adjusted_2']
           record_movie_1_2 = pd.DataFrame(columns=record_row_columns)
           # for each customer C who rated movie_1
           for c_userid in distinct_users:
               print('build weight matrix for customer %d, movie_1 %d' % (c_userid, movie_1))
               # the customer's rating for movie_1
               c_movie_1_rating = user_data[user_data['userId'] == c_userid]['rating_adjusted'].iloc[0]
               # extract movies rated by the customer excluding movie_1
               c_user_data = adjusted_ratings[(adjusted_ratings['userId'] == c_userid) & (adjusted_ratings['movieId'] != movie_1)]
               c_distinct_movies = np.unique(c_user_data['movieId'])

               # for each movie rated by customer C as movie=2
               for movie_2 in c_distinct_movies:
                   # the customer's rating for movie_2
                   c_movie_2_rating = c_user_data[c_user_data['movieId'] == movie_2]['rating_adjusted'].iloc[0]
                   record_row = pd.Series([c_userid, movie_1, movie_2, c_movie_1_rating, c_movie_2_rating], index=record_row_columns)
                   record_movie_1_2 = record_movie_1_2.append(record_row, ignore_index=True)

           # calculate the similarity values between movie_1 and the above recorded movies
           distinct_movie_2 = np.unique(record_movie_1_2['movie_2'])
           # for each movie 2
           for movie_2 in distinct_movie_2:
               print('calculate weight movie_1 %d, movie_2 %d' % (movie_1, movie_2))
               paired_movie_1_2 = record_movie_1_2[record_movie_1_2['movie_2'] == movie_2]
               sim_value_numerator = (paired_movie_1_2['rating_adjusted_1'] * paired_movie_1_2['rating_adjusted_2']).sum()
               sim_value_denominator = np.sqrt(np.square(paired_movie_1_2['rating_adjusted_1']).sum()) * np.sqrt(np.square(paired_movie_1_2['rating_adjusted_2']).sum())
               sim_value_denominator = sim_value_denominator if sim_value_denominator != 0 else 1e-8
               sim_value = sim_value_numerator / sim_value_denominator
               w_matrix = w_matrix.append(pd.Series([movie_1, movie_2, sim_value], index=w_matrix_columns), ignore_index=True)

           i = i + 1

       # output weight matrix to pickle file
       with open(DEFAULT_PARTICLE_PATH, 'wb') as output:
           pickle.dump(w_matrix, output, pickle.HIGHEST_PROTOCOL)
       output.close()

   return w_matrix

In [22]:
# calculate the predicted ratings
def predict(userId, movieId, w_matrix, adjusted_ratings, rating_mean):
   # fix missing mean rating which was caused by no ratings for the given movie
   
    # mean_rating exists for movieId
   if rating_mean[rating_mean['movieId'] == movieId].shape[0] > 0:
       mean_rating = rating_mean[rating_mean['movieId'] == movieId]['rating_mean'].iloc[0]
   # mean_rating does not exist for movieId(which may be caused by no ratings for the movie)
   else:
       mean_rating = 2.5

   # calculate the rating of the given movie by the given user
   user_other_ratings = adjusted_ratings[adjusted_ratings['userId'] == userId]
   user_distinct_movies = np.unique(user_other_ratings['movieId'])
   sum_weighted_other_ratings = 0
   sum_weghts = 0
   for movie_j in user_distinct_movies:
       if rating_mean[rating_mean['movieId'] == movie_j].shape[0] > 0:
           rating_mean_j = rating_mean[rating_mean['movieId'] == movie_j]['rating_mean'].iloc[0]
       else:
           rating_mean_j = 2.5
       # only calculate the weighted values when the weight between movie_1 and movie_2 exists in weight matrix
       w_movie_1_2 = w_matrix[(w_matrix['movie_1'] == movieId) & (w_matrix['movie_2'] == movie_j)]
       if w_movie_1_2.shape[0] > 0:
           user_rating_j = user_other_ratings[user_other_ratings['movieId']==movie_j]
           sum_weighted_other_ratings += (user_rating_j['rating'].iloc[0] - rating_mean_j) * w_movie_1_2['weight'].iloc[0]
           sum_weghts += np.abs(w_movie_1_2['weight'].iloc[0])

   # if sum_weights is 0 (which may be because of no ratings from new users), use the mean ratings
   if sum_weghts == 0:
       predicted_rating = mean_rating
   # sum_weights is bigger than 0
   else:
       predicted_rating = mean_rating + sum_weighted_other_ratings/sum_weghts

   return predicted_rating

In [23]:
weight_matrix=build_w_matrix(adjusted_ratings, 0)

0 out of  50
build weight matrix for customer 3, movie_1 1
build weight matrix for customer 6, movie_1 1
build weight matrix for customer 10, movie_1 1
build weight matrix for customer 11, movie_1 1
build weight matrix for customer 12, movie_1 1
build weight matrix for customer 13, movie_1 1
build weight matrix for customer 14, movie_1 1
build weight matrix for customer 16, movie_1 1
build weight matrix for customer 19, movie_1 1
build weight matrix for customer 22, movie_1 1
build weight matrix for customer 23, movie_1 1
build weight matrix for customer 24, movie_1 1
build weight matrix for customer 34, movie_1 1
build weight matrix for customer 39, movie_1 1
build weight matrix for customer 47, movie_1 1
build weight matrix for customer 53, movie_1 1
build weight matrix for customer 54, movie_1 1
build weight matrix for customer 58, movie_1 1
build weight matrix for customer 59, movie_1 1
build weight matrix for customer 66, movie_1 1
build weight matrix for customer 69, movie_1 1
bu

In [25]:
weight_matrix.head()


Unnamed: 0,movie_1,movie_2,weight
0,1.0,2.0,0.363705
1,1.0,3.0,0.171153
2,1.0,4.0,0.168094
3,1.0,5.0,0.356217
4,1.0,6.0,0.113959


In [27]:
weight_matrix.shape

(2378, 3)

In [36]:
weight_matrix.head()

Unnamed: 0,movie_1,movie_2,weight
0,1.0,2.0,0.363705
1,1.0,3.0,0.171153
2,1.0,4.0,0.168094
3,1.0,5.0,0.356217
4,1.0,6.0,0.113959


In [35]:
predict(1, 3, weight_matrix, adjusted_ratings, rating_mean)

3.1344952110013615

In [39]:
distinct_movies = np.unique(ratings['movieId'])

In [41]:
distinct_movies=pd.DataFrame(distinct_movies)


In [59]:
distinct_movies.columns=["MovieId"]

In [68]:

table2=distinct_movies.copy()
table2= pd.get_dummies(table2, drop_first=True)
table2.head()

Unnamed: 0,MovieId
0,1
1,2
2,3
3,4
4,5


In [82]:
def predict(userId, w_matrix, adjusted_ratings, rating_mean):
   # fix missing mean rating which was caused by no ratings for the given movie
   # mean_rating exists for movieId
   Prediction_matrix_columns = ['movieID', 'Prediction']
   Prediction_matrix=pd.DataFrame(columns= Prediction_matrix_columns)

    for movieId in list(distinct_movies['MovieId']):
         if rating_mean[rating_mean['movieId'] == movieId].shape[0] > 0:
             mean_rating = rating_mean[rating_mean['movieId'] == movieId]['rating_mean'].iloc[0]
   # mean_rating does not exist for movieId(which may be caused by no ratings for the movie)
         else:
             mean_rating = 2.5

   # calculate the rating of the given movie by the given user
         user_other_ratings = adjusted_ratings[adjusted_ratings['userId'] == userId]
         user_distinct_movies = np.unique(user_other_ratings['movieId'])
         sum_weighted_other_ratings = 0
         sum_weghts = 0
         for movie_j in user_distinct_movies:
           if rating_mean[rating_mean['movieId'] == movie_j].shape[0] > 0:
               rating_mean_j = rating_mean[rating_mean['movieId'] == movie_j]['rating_mean'].iloc[0]
           else:
               rating_mean_j = 2.5
       # only calculate the weighted values when the weight between movie_1 and movie_2 exists in weight matrix
           w_movie_1_2 = w_matrix[(w_matrix['movie_1'] == movieId) & (w_matrix['movie_2'] == movie_j)]
           if w_movie_1_2.shape[0] > 0:
               user_rating_j = user_other_ratings[user_other_ratings['movieId']==movie_j]
               sum_weighted_other_ratings += (user_rating_j['rating'].iloc[0] - rating_mean_j) * w_movie_1_2['weight'].iloc[0]
               sum_weghts += np.abs(w_movie_1_2['weight'].iloc[0])

   # if sum_weights is 0 (which may be because of no ratings from new users), use the mean ratings
        if sum_weghts == 0:
                predicted_rating = mean_rating
   # sum_weights is bigger than 0
        else:
                predicted_rating = mean_rating + sum_weighted_other_ratings/sum_weghts
      
            Prediction_matrix = Prediction_matrix.append(pd.Series([ movieId, predicted_rating], index=Prediction_matrix_columns), ignore_index=True)

    return Prediction_matrix
    
    

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 32)

In [97]:
def predict(userId, w_matrix, adjusted_ratings, rating_mean):
   Prediction_matrix_columns = ['movieID', 'Prediction']
   Prediction_matrix=pd.DataFrame(columns= Prediction_matrix_columns)

   for movieId in list(distinct_movies['MovieId']):
         if rating_mean[rating_mean['movieId'] == movieId].shape[0] > 0:
             mean_rating = rating_mean[rating_mean['movieId'] == movieId]['rating_mean'].iloc[0]
   # mean_rating does not exist for movieId(which may be caused by no ratings for the movie)
         else:
             mean_rating = 2.5

   # calculate the rating of the given movie by the given user
         user_other_ratings = adjusted_ratings[adjusted_ratings['userId'] == userId]
         user_distinct_movies = np.unique(user_other_ratings['movieId'])
         sum_weighted_other_ratings = 0
         sum_weghts = 0
         for movie_j in user_distinct_movies:
           if rating_mean[rating_mean['movieId'] == movie_j].shape[0] > 0:
               rating_mean_j = rating_mean[rating_mean['movieId'] == movie_j]['rating_mean'].iloc[0]
           else:
               rating_mean_j = 2.5
       # only calculate the weighted values when the weight between movie_1 and movie_2 exists in weight matrix
           w_movie_1_2 = w_matrix[(w_matrix['movie_1'] == movieId) & (w_matrix['movie_2'] == movie_j)]
           if w_movie_1_2.shape[0] > 0:
               user_rating_j = user_other_ratings[user_other_ratings['movieId']==movie_j]
               sum_weighted_other_ratings += (user_rating_j['rating'].iloc[0] - rating_mean_j) * w_movie_1_2['weight'].iloc[0]
               sum_weghts += np.abs(w_movie_1_2['weight'].iloc[0])
   # if sum_weights is 0 (which may be because of no ratings from new users), use the mean ratings
        if sum_weghts == 0:
             predicted_rating = mean_rating
   # sum_weights is bigger than 0
        else:
                predicted_rating = mean_rating + sum_weighted_other_ratings/sum_weghts
        Prediction_matrix = Prediction_matrix.append(pd.Series([ movieId, predicted_rating], index=Prediction_matrix_columns), ignore_index=True)

    return Prediction_matrix

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 29)

In [None]:
def predict(userId, w_matrix, adjusted_ratings, rating_mean):
   # fix missing mean rating which was caused by no ratings for the given movie
   # mean_rating exists for movieId
   Prediction_matrix_columns = ['movieID', 'Prediction']
   Prediction_matrix=pd.DataFrame(columns= Prediction_matrix_columns)

    for movieId in list(distinct_movies['MovieId']):
         if rating_mean[rating_mean['movieId'] == movieId].shape[0] > 0:
             mean_rating = rating_mean[rating_mean['movieId'] == movieId]['rating_mean'].iloc[0]
   # mean_rating does not exist for movieId(which may be caused by no ratings for the movie)
         else:
             mean_rating = 2.5

   # calculate the rating of the given movie by the given user
         user_other_ratings = adjusted_ratings[adjusted_ratings['userId'] == userId]
         user_distinct_movies = np.unique(user_other_ratings['movieId'])
         sum_weighted_other_ratings = 0
         sum_weghts = 0
         for movie_j in user_distinct_movies:
           if rating_mean[rating_mean['movieId'] == movie_j].shape[0] > 0:
               rating_mean_j = rating_mean[rating_mean['movieId'] == movie_j]['rating_mean'].iloc[0]
           else:
               rating_mean_j = 2.5
       # only calculate the weighted values when the weight between movie_1 and movie_2 exists in weight matrix
           w_movie_1_2 = w_matrix[(w_matrix['movie_1'] == movieId) & (w_matrix['movie_2'] == movie_j)]
           if w_movie_1_2.shape[0] > 0:
               user_rating_j = user_other_ratings[user_other_ratings['movieId']==movie_j]
               sum_weighted_other_ratings += (user_rating_j['rating'].iloc[0] - rating_mean_j) * w_movie_1_2['weight'].iloc[0]
               sum_weghts += np.abs(w_movie_1_2['weight'].iloc[0])

   # if sum_weights is 0 (which may be because of no ratings from new users), use the mean ratings
        if sum_weghts == 0:
                predicted_rating = mean_rating
   # sum_weights is bigger than 0
        else:
                predicted_rating = mean_rating + sum_weighted_other_ratings/sum_weghts
      
            Prediction_matrix = Prediction_matrix.append(pd.Series([ movieId, predicted_rating], index=Prediction_matrix_columns), ignore_index=True)

    return Prediction_matrix
    
    

In [98]:
# calculate the predicted ratings
def predict(userId, movieId, w_matrix, adjusted_ratings, rating_mean):
   # fix missing mean rating which was caused by no ratings for the given movie
 Prediction_matrix_columns = ['movieID', 'Prediction']
 Prediction_matrix=pd.DataFrame(columns= Prediction_matrix_columns)   
    # mean_rating exists for movieId
   if rating_mean[rating_mean['movieId'] == movieId].shape[0] > 0:
       mean_rating = rating_mean[rating_mean['movieId'] == movieId]['rating_mean'].iloc[0]
   # mean_rating does not exist for movieId(which may be caused by no ratings for the movie)
   else:
       mean_rating = 2.5

   # calculate the rating of the given movie by the given user
   user_other_ratings = adjusted_ratings[adjusted_ratings['userId'] == userId]
   user_distinct_movies = np.unique(user_other_ratings['movieId'])
   sum_weighted_other_ratings = 0
   sum_weghts = 0
   for movie_j in user_distinct_movies:
       if rating_mean[rating_mean['movieId'] == movie_j].shape[0] > 0:
           rating_mean_j = rating_mean[rating_mean['movieId'] == movie_j]['rating_mean'].iloc[0]
       else:
           rating_mean_j = 2.5
       # only calculate the weighted values when the weight between movie_1 and movie_2 exists in weight matrix
       w_movie_1_2 = w_matrix[(w_matrix['movie_1'] == movieId) & (w_matrix['movie_2'] == movie_j)]
       if w_movie_1_2.shape[0] > 0:
           user_rating_j = user_other_ratings[user_other_ratings['movieId']==movie_j]
           sum_weighted_other_ratings += (user_rating_j['rating'].iloc[0] - rating_mean_j) * w_movie_1_2['weight'].iloc[0]
           sum_weghts += np.abs(w_movie_1_2['weight'].iloc[0])

   # if sum_weights is 0 (which may be because of no ratings from new users), use the mean ratings
   if sum_weghts == 0:
       predicted_rating = mean_rating
   # sum_weights is bigger than 0
   else:
       predicted_rating = mean_rating + sum_weighted_other_ratings/sum_weghts

   return predicted_rating

IndentationError: unexpected indent (<ipython-input-98-819c7c0bdb8e>, line 7)

In [130]:
def predict(userId, movieId, w_matrix, adjusted_ratings, rating_mean):
   # fix missing mean rating which was caused by no ratings for the given movie
   Prediction_matrix_columns = ['movieID', 'Prediction']
    Prediction_matrix=pd.DataFrame(columns= Prediction_matrix_columns)
    # mean_rating exists for movieId
   for movieId in list(distinct_movies['MovieId']):
    if rating_mean[rating_mean['movieId'] == movieId].shape[0] > 0:
        mean_rating = rating_mean[rating_mean['movieId'] == movieId]['rating_mean'].iloc[0]
   # mean_rating does not exist for movieId(which may be caused by no ratings for the movie)
    else:
        mean_rating = 2.5

   # calculate the rating of the given movie by the given user
    user_other_ratings = adjusted_ratings[adjusted_ratings['userId'] == userId]
    user_distinct_movies = np.unique(user_other_ratings['movieId'])
    sum_weighted_other_ratings = 0
    sum_weghts = 0
    for movie_j in user_distinct_movies:
        if rating_mean[rating_mean['movieId'] == movie_j].shape[0] > 0:
            rating_mean_j = rating_mean[rating_mean['movieId'] == movie_j]['rating_mean'].iloc[0]
        else:
            rating_mean_j = 2.5
       # only calculate the weighted values when the weight between movie_1 and movie_2 exists in weight matrix
        w_movie_1_2 = w_matrix[(w_matrix['movie_1'] == movieId) & (w_matrix['movie_2'] == movie_j)]
        if w_movie_1_2.shape[0] > 0:
            user_rating_j = user_other_ratings[user_other_ratings['movieId']==movie_j]
            sum_weighted_other_ratings += (user_rating_j['rating'].iloc[0] - rating_mean_j) * w_movie_1_2['weight'].iloc[0]
            sum_weghts += np.abs(w_movie_1_2['weight'].iloc[0])

   # if sum_weights is 0 (which may be because of no ratings from new users), use the mean ratings
    if sum_weghts == 0:
        predicted_rating = mean_rating
    # sum_weights is bigger than 0
    else:
        predicted_rating = mean_rating + sum_weighted_other_ratings/sum_weghts
    Prediction_matrix = Prediction_matrix.append(pd.Series([ movieId, predicted_rating], index=Prediction_matrix_columns), ignore_index=True)
 


IndentationError: unexpected indent (<ipython-input-130-fe6fcb3327f4>, line 4)

In [128]:
if rating_mean[rating_mean['movieId'] == movieId].shape[0] > 0:
       mean_rating = rating_mean[rating_mean['movieId'] == movieId]['rating_mean'].iloc[0]
   # mean_rating does not exist for movieId(which may be caused by no ratings for the movie)
   else:
       mean_rating = 2.5

   # calculate the rating of the given movie by the given user
   user_other_ratings = adjusted_ratings[adjusted_ratings['userId'] == userId]
   user_distinct_movies = np.unique(user_other_ratings['movieId'])
   sum_weighted_other_ratings = 0
   sum_weghts = 0
   for movie_j in user_distinct_movies:
       if rating_mean[rating_mean['movieId'] == movie_j].shape[0] > 0:
           rating_mean_j = rating_mean[rating_mean['movieId'] == movie_j]['rating_mean'].iloc[0]
       else:
           rating_mean_j = 2.5
       # only calculate the weighted values when the weight between movie_1 and movie_2 exists in weight matrix
       w_movie_1_2 = w_matrix[(w_matrix['movie_1'] == movieId) & (w_matrix['movie_2'] == movie_j)]
       if w_movie_1_2.shape[0] > 0:
           user_rating_j = user_other_ratings[user_other_ratings['movieId']==movie_j]
           sum_weighted_other_ratings += (user_rating_j['rating'].iloc[0] - rating_mean_j) * w_movie_1_2['weight'].iloc[0]
           sum_weghts += np.abs(w_movie_1_2['weight'].iloc[0])

   # if sum_weights is 0 (which may be because of no ratings from new users), use the mean ratings
   if sum_weghts == 0:
       predicted_rating = mean_rating
   # sum_weights is bigger than 0
   else:
       predicted_rating = mean_rating + sum_weighted_other_ratings/sum_weghts

    return predicted_rating

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 4)

In [None]:
def predict(userId, movieId, w_matrix, adjusted_ratings, rating_mean):
   # fix missing mean rating which was caused by no ratings for the given movie
   
    # mean_rating exists for movieId
   if rating_mean[rating_mean['movieId'] == movieId].shape[0] > 0:
       mean_rating = rating_mean[rating_mean['movieId'] == movieId]['rating_mean'].iloc[0]
   # mean_rating does not exist for movieId(which may be caused by no ratings for the movie)
   else:
       mean_rating = 2.5

   # calculate the rating of the given movie by the given user
   user_other_ratings = adjusted_ratings[adjusted_ratings['userId'] == userId]
   user_distinct_movies = np.unique(user_other_ratings['movieId'])
   sum_weighted_other_ratings = 0
   sum_weghts = 0
   for movie_j in user_distinct_movies:
       if rating_mean[rating_mean['movieId'] == movie_j].shape[0] > 0:
           rating_mean_j = rating_mean[rating_mean['movieId'] == movie_j]['rating_mean'].iloc[0]
       else:
           rating_mean_j = 2.5
       # only calculate the weighted values when the weight between movie_1 and movie_2 exists in weight matrix
       w_movie_1_2 = w_matrix[(w_matrix['movie_1'] == movieId) & (w_matrix['movie_2'] == movie_j)]
       if w_movie_1_2.shape[0] > 0:
           user_rating_j = user_other_ratings[user_other_ratings['movieId']==movie_j]
           sum_weighted_other_ratings += (user_rating_j['rating'].iloc[0] - rating_mean_j) * w_movie_1_2['weight'].iloc[0]
           sum_weghts += np.abs(w_movie_1_2['weight'].iloc[0])

   # if sum_weights is 0 (which may be because of no ratings from new users), use the mean ratings
   if sum_weghts == 0:
       predicted_rating = mean_rating
   # sum_weights is bigger than 0
   else:
       predicted_rating = mean_rating + sum_weighted_other_ratings/sum_weghts

   return predicted_rating

In [152]:
def predict(userId, w_matrix, adjusted_ratings, rating_mean):
   # fix missing mean rating which was caused by no ratings for the given movie
  Prediction_matrix_columns = ['movieID', 'Prediction']
  Prediction_matrix=pd.DataFrame(columns= Prediction_matrix_columns)
    # mean_rating exists for movieId
  for movieId in list(distinct_movies['MovieId']):
    # mean_rating exists for movieId
      if rating_mean[rating_mean['movieId'] == movieId].shape[0] > 0:
          mean_rating = rating_mean[rating_mean['movieId'] == movieId]['rating_mean'].iloc[0]
   # mean_rating does not exist for movieId(which may be caused by no ratings for the movie)
      else:
          mean_rating = 2.5

   # calculate the rating of the given movie by the given user
      user_other_ratings = adjusted_ratings[adjusted_ratings['userId'] == userId]
      user_distinct_movies = np.unique(user_other_ratings['movieId'])
      sum_weighted_other_ratings = 0
      sum_weghts = 0
      for movie_j in user_distinct_movies:
          if rating_mean[rating_mean['movieId'] == movie_j].shape[0] > 0:
             rating_mean_j = rating_mean[rating_mean['movieId'] == movie_j]['rating_mean'].iloc[0]
          else:
             rating_mean_j = 2.5
       # only calculate the weighted values when the weight between movie_1 and movie_2 exists in weight matrix
          w_movie_1_2 = w_matrix[(w_matrix['movie_1'] == movieId) & (w_matrix['movie_2'] == movie_j)]
          if w_movie_1_2.shape[0] > 0:
             user_rating_j = user_other_ratings[user_other_ratings['movieId']==movie_j]
             sum_weighted_other_ratings += (user_rating_j['rating'].iloc[0] - rating_mean_j) * w_movie_1_2['weight'].iloc[0]
             sum_weghts += np.abs(w_movie_1_2['weight'].iloc[0])

   # if sum_weights is 0 (which may be because of no ratings from new users), use the mean ratings
      if sum_weghts == 0:
         predicted_rating = mean_rating
   # sum_weights is bigger than 0
      else:
         predicted_rating = mean_rating + sum_weighted_other_ratings/sum_weghts
      
      Prediction_matrix = Prediction_matrix.append(pd.Series([ movieId, predicted_rating], index=Prediction_matrix_columns), ignore_index=True)      
      
  return Prediction_matrix

In [153]:
prediction_matrix=predict(6, weight_matrix, adjusted_ratings, rating_mean)

In [154]:
prediction_matrix.head()

Unnamed: 0,movieID,Prediction
0,1.0,4.768579
1,2.0,4.120322
2,3.0,4.249836
3,4.0,3.79117
4,5.0,3.950777


In [160]:
prediction_matrix=prediction_matrix.sort_values("Prediction",ascending=False)

In [161]:
prediction_matrix.head()

Unnamed: 0,movieID,Prediction
49,50.0,5.15296
16,17.0,4.983673
27,28.0,4.950932
40,41.0,4.949163
5,6.0,4.91167
