### Description 

- Recommender Algorithm used - __User Based Top N Recommender System__
- Similarity Measurement Method - __Cosine Similarity__
- Dataset - __MovieLens__ dataset ( soure : Grouplens, grouplens.org )
- Recommended List extracted from - __Single User__
- __Algorithm highlights__ - 
    - first remove few ratings for one user.
    - calculate similarity measurement for all users with similarity measurement method as mentioned above.
    - find list of most similar users.
    - make a list of movies watched by similar user.(it would be the final recommendeded list) 
    - compare the recommended list with removed one, if it is same then its a 'Hit'.
    - each 'Hit' would be counted and printed in the end for each user seperately.

In [40]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# importing csv files
df_movies = pd.read_csv("movies.csv")
df_rating = pd.read_csv("ratings.csv")

def HitRate(removedMovieList,Final_Rec_movies1):
    HitCounter = 0
    Counter = 0
    for i in Final_Rec_movies1['movieId']:
        if i in removedMovieList:
            HitCounter = HitCounter+1
            Counter = Counter+1
        else:
            Counter = Counter+1
    if Counter == 0: 
        return 0
    else:
        return HitCounter/Counter

    
def UserId_wise_HitRate(arr_remove_movieId,selected_user_A,similar_user_Corr_B):
    movies_liked_B = df_ratings_with_MovieName[df_ratings_with_MovieName['userId']==similar_user_Corr_B][['rating','movieId']]
    movies_liked_A = df_ratings_with_MovieName[df_ratings_with_MovieName['userId']==selected_user_A][['movieId']]

    Final_Rec_movies = pd.merge(movies_liked_B,movies_liked_A,how='left',on='movieId')
    Final_Rec_movies.rename(columns={'rating':'Rating_by_B'}, inplace = True)

    df_movie_meanRating = df_ratings_with_MovieName.groupby('movieId')['rating'].agg(['mean','count'])
    df_movie_meanRating.reset_index(inplace=True)
    df_movie_meanRating.rename(columns={'mean':'Average rating','count':'NoOfTimesWatched'}, inplace = True)

    Final_Rec_movies1 = Final_Rec_movies.merge(df_movie_meanRating,how ='left',on='movieId')
    # Final_Rec_movies1
    return HitRate(arr_remove_movieId,Final_Rec_movies1)

# merging both files
df_ratings_with_MovieName = df_rating.merge(df_movies, on='movieId',how = 'left')

# list of active users who have atleast rated more than 50 movies and less than 250( It could be modified )
df_active_user_list = df_ratings_with_MovieName.groupby('userId')['rating'].agg(['mean','count'])
df_active_user_50_250 = df_active_user_list[(df_active_user_list['count']>50) & (df_active_user_list['count']<250)]

df_active_user_50_250.sort_values(by = 'count',ascending = False).head(15)




Unnamed: 0_level_0,mean,count
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
500,2.981928,249
295,3.944915,236
149,3.47619,231
240,3.932609,230
270,3.724891,229
442,4.224444,225
236,4.015625,224
177,3.96875,224
303,3.639013,223
408,3.644144,222


In [41]:
# This section will remove few movie ratings, from selected user, e.g. user 123 has watched movie "star wars" and rated it 4. 
# --now below algorithm will remove rating of 'star wars' given by user 123. 
# And one more important point to consider is that , rating which are removed should be pretty famous 
# --and watched by significant users.

# selected_user_A = 295

def Complete_Execution_Hitrate(selected_user_A,df_ratings_with_MovieName):

    movies_liked_by_A = df_ratings_with_MovieName[df_ratings_with_MovieName['userId']==selected_user_A]

    df_groupby_movieId_count = df_ratings_with_MovieName.groupby('movieId')['rating'].count()
    df_groupby_movieId_count = pd.DataFrame(df_groupby_movieId_count)
    df_groupby_movieId_count.rename(columns={'rating':'rating count'},inplace=True)
    df_groupby_movieId_count.reset_index(inplace=True)

    df_top_movies_liked_by_A = movies_liked_by_A.merge(df_groupby_movieId_count,on='movieId',how='left')
    df_top_movies_liked_by_A  = df_top_movies_liked_by_A[df_top_movies_liked_by_A['rating']>4.0]

    arr_remove_movieId = np.array(df_top_movies_liked_by_A.sort_values(by='rating count', ascending= False).iloc[:10,1])

    df_Index_remove_from_User_A = df_ratings_with_MovieName[(df_ratings_with_MovieName['userId']==selected_user_A) & (df_ratings_with_MovieName['movieId'].isin(arr_remove_movieId))]
    df_ratings_with_MovieName.drop(index=df_Index_remove_from_User_A.index,inplace=True)

    # Correlation calculation
    # First pivot table

    df_pt_rating_title_userid =df_ratings_with_MovieName.pivot_table(index='title',columns='userId',values='rating')
    
    #     df_correlation_matrix = df_pt_rating_title_userid.corr()
    df_pt_rating_title_userid.fillna(0,inplace=True)
    
    df_correlation_matrix = cosine_similarity(df_pt_rating_title_userid.T,df_pt_rating_title_userid.T)
    df_correlation_matrix = pd.DataFrame(df_correlation_matrix, columns = df_pt_rating_title_userid.columns,index = df_pt_rating_title_userid.columns)
   
    df_correlation_matrix.columns = df_correlation_matrix.columns.astype(str)
    df_correlation_matrix_A = df_correlation_matrix[str(selected_user_A)]

    df_correlation_matrix_A = pd.DataFrame(df_correlation_matrix_A)
    df_correlation_matrix_A.rename(columns={str(selected_user_A):"CorrelationScore"}, inplace =True)


    df_similarUsers_like_A_active = df_correlation_matrix_A.join(df_active_user_list,how='left')
    df_similarUsers_like_A_active.reset_index(inplace= True)

    # removing user A, from recommendation list.
    df_similarUsers_like_A_active.drop(df_similarUsers_like_A_active[df_similarUsers_like_A_active['userId']==selected_user_A].index,inplace=True)
    list_of_similar_user = np.array(df_similarUsers_like_A_active.sort_values(by ='CorrelationScore', ascending = False ).iloc[:10,0])
    hitrate_result = [UserId_wise_HitRate(arr_remove_movieId,selected_user_A,x) for x in list_of_similar_user]
    df_hitrate_result = pd.DataFrame({'UserId':list_of_similar_user,
                                      'HitRate':hitrate_result})

    df_hitrate_result.loc[len(df_hitrate_result)] = ['Total',df_hitrate_result['HitRate'].sum()]
    return df_hitrate_result[df_hitrate_result['UserId']=='Total']['HitRate'].iloc[0]

In [43]:
%%time
df_active_user_50_250.reset_index(inplace=True)
list_userId_to_be_recommended = np.array(df_active_user_50_250.sort_values(by='count',ascending = False)['userId'][:10])
Final_HitRate = [Complete_Execution_Hitrate(x,df_ratings_with_MovieName) for x in list_userId_to_be_recommended]
df_final_result_with_HitRate = pd.DataFrame({'UserId':list_userId_to_be_recommended,'HitRate':Final_HitRate})
df_final_result_with_HitRate


Wall time: 5.44 s


Unnamed: 0,UserId,HitRate
0,500,0.014816
1,295,0.260875
2,149,0.029747
3,240,0.200712
4,270,0.218431
5,442,0.27211
6,236,0.151572
7,177,0.132681
8,303,0.181101
9,408,0.103574
