### Description 

- Recommender Algorithm used - __User Based Top N Recommender System__
- Similarity Measurement Method - __Pearson Correlation__
- Dataset - __MovieLens__ dataset ( soure : Grouplens, grouplens.org )
- Recommended List extracted from - __Multi User__
- __Algorithm highlights__ - 
    - first remove few ratings for one user.
    - calculate similarity measurement for all users with similarity measurement method as mentioned above.
    - find list of most similar users.
    - make a list of movies watched by similar users(it would be the final recommendeded list). 
    - compare the recommended list with removed one, if it is same then its a 'Hit'.
    - each 'Hit' would be counted and printed in the end for each user seperately.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# importing csv files
df_movies = pd.read_csv("movies.csv")
df_rating = pd.read_csv("ratings.csv")

# merging both files
df_Master_Dataframe = df_rating.merge(df_movies, on='movieId',how = 'left')

# list of active users who have atleast rated more than 50 movies and less than 250
# This window is selected because to give a leverage for recommendations, 
# there is no point selecting a user has already seen most of the movies.
df_active_user_list = df_Master_Dataframe.groupby('userId')['rating'].agg(['mean','count'])
df_active_user_50_250 = df_active_user_list[(df_active_user_list['count']>600) & (df_active_user_list['count']<700)]

df_active_user_50_250.sort_values(by = 'count',ascending = False).head(15)
df_active_user_50_250.reset_index(inplace=True)

list_userId_to_be_Evaluated = np.array(df_active_user_50_250.sort_values(by='count',ascending = False)['userId'][:10])

def Complete_Execution_Hitrate(selected_user_A,df_Master_Dataframe):

    #Filtering out the dataframe for specific userid
    movies_liked_by_A = df_Master_Dataframe[df_Master_Dataframe['userId']==selected_user_A]

    #  Popular movies list   
    df_popular_movies = df_Master_Dataframe.groupby('movieId')['rating'].count()
    df_popular_movies = pd.DataFrame(df_popular_movies)
    df_popular_movies.rename(columns={'rating':'rating count'},inplace=True)
    df_popular_movies.reset_index(inplace=True)
    df_popular_movies.sort_values(by= 'rating count',ascending = False).head()
    
    # Popular movies rated by the user A ("selected_user_A") and rated more than 4
    df_popular_movies_A = movies_liked_by_A.merge(df_popular_movies,on='movieId',how='left')
    df_popular_movies_A = df_popular_movies_A[df_popular_movies_A['rating']>4.0]
    
    # Now, selecting few ratings which would be removed from Rating matrix for HIt rate calculation 
    # ** picking movieid which are rated number of times, and their corresponding ratings would be removed.
    arr_remove_movieId = np.array(df_popular_movies_A.sort_values(by='rating count', ascending= False).iloc[:20,1])
    df_Index_remove_from_User_A = df_Master_Dataframe[(df_Master_Dataframe['userId']==selected_user_A) 
                                                      & (df_Master_Dataframe['movieId'].isin(arr_remove_movieId))]
    df_Master_Dataframe_temp = df_Master_Dataframe.drop(index=df_Index_remove_from_User_A.index)
    
    # Creating pivot table for correlation calculation
    df_pivot_table =df_Master_Dataframe_temp.pivot_table(index='title',columns='userId',values='rating')
    
    df_correlation_matrix = df_pivot_table.corr(method='pearson')

    
    # Picking specific user A data from Correlation Matrix     
    df_correlation_matrix.columns = df_correlation_matrix.columns.astype(str)
    df_correlation_matrix_A = df_correlation_matrix[str(selected_user_A)]
    df_correlation_matrix_A = pd.DataFrame(df_correlation_matrix_A)
    df_correlation_matrix_A.rename(columns={str(selected_user_A):"CorrelationScore"}, inplace =True)
    
    ## Merging Movie Count data in Correlation coefficient matrix for user A
    df_similarUsers_like_A_active = df_correlation_matrix_A.join(df_active_user_list,how='left')
    df_similarUsers_like_A_active.reset_index(inplace= True)
    df_similarUsers_like_A_active.head()
    
    # Remove user A itself from the similar users list
    df_similarUsers_like_A_active.drop(df_similarUsers_like_A_active[df_similarUsers_like_A_active['userId']
                                                                     ==selected_user_A].index,inplace=True)
    
    # List of users which are similar of user A - only pick top 10 users which have best correlation score
    list_of_similar_user = np.array(df_similarUsers_like_A_active.sort_values
                                    (by ='CorrelationScore', ascending = False ).iloc[:10,0])
    
    # List Comprehension applied for each 10 of similar user      
#     hitrate_result = [ UserId_wise_HitRate(arr_remove_movieId,selected_user_A,x,df_Master_Dataframe_temp,df_popular_movies)
#                       for x in list_of_similar_user]
    
#     new line of code added for multiple user # 28th april 2020
    hitrate_result = UserId_wise_HitRate(arr_remove_movieId,selected_user_A,list_of_similar_user,df_Master_Dataframe_temp,df_popular_movies)
    
 
    return hitrate_result
    
def UserId_wise_HitRate(arr_remove_movieId,selected_user_A,list_of_similar_user,df_Master_Dataframe_temp,df_popular_movies):
    
    # List of movies liked by user A anb similar users
    df_movies_liked_by_similarUser = pickBestRatedMovies(list_of_similar_user,df_Master_Dataframe_temp)
    
#     movies_liked_B = df_Master_Dataframe_temp[df_Master_Dataframe_temp['userId']==similar_user_of_A_B][['rating','movieId']]
    movies_liked_A = df_Master_Dataframe_temp[df_Master_Dataframe_temp['userId']==selected_user_A][['movieId']]
    
    # To get the list of Movies which are seen B but not by A
    Final_Rec_movies_to_A = pd.merge(df_movies_liked_by_similarUser,movies_liked_A,how='left',on='movieId')
    Final_Rec_movies_to_A.rename(columns={'rating':'Rating_by_B'}, inplace = True)
    
    # For more fine result add one more column 'no of times movie watched' corresponding to movieid     
    Final_Recommendation = Final_Rec_movies_to_A.merge(df_popular_movies,how ='left',on='movieId')
    Final_Recommendation = Final_Recommendation[Final_Recommendation['Rating_by_B']>=3.5].sort_values(by='Rating_by_B', ascending = False).head(20)

    return HitRate(arr_remove_movieId,Final_Recommendation)

def pickBestRatedMovies(list_user,df_Master_Dataframe_temp):
    
    temp_df = pd.DataFrame(columns=['rating','movieId'])

    for i in list_user:
        movies_liked = df_Master_Dataframe_temp[df_Master_Dataframe_temp['userId']==i][['rating','movieId']].sort_values(by='rating',ascending=False).iloc[:5]
        temp_df=pd.concat([temp_df,movies_liked],axis=0)
    
    return temp_df

def HitRate(removedMovieList,Final_Recommendation):
    HitCounter = 0
    Counter = 0
    matched_movies=[]
    for i in Final_Recommendation['movieId']:
        if i in removedMovieList:
            HitCounter = HitCounter+1
            Counter = Counter+1
            matched_movies.append(i)
        else:
            Counter = Counter+1
    if Counter == 0: 
        return 0
    else:
        return HitCounter/len(removedMovieList)
    

In [2]:
%%time
Final_HitRate = [Complete_Execution_Hitrate(x,df_Master_Dataframe) for x in list_userId_to_be_Evaluated]
df_final_result_with_HitRate = pd.DataFrame({'UserId':list_userId_to_be_Evaluated,'HitRate':Final_HitRate})
df_final_result_with_HitRate

Wall time: 4min 39s


Unnamed: 0,UserId,HitRate
0,461,0.65
1,232,0.05
2,102,0.05
3,262,0.0
4,475,0.1
5,306,0.1
6,119,0.1
7,654,0.2
8,358,0.2
9,529,0.25
