Assignment 4

Student : Balasubramanian A.R
    
Batch : M.Sc Computer science, second year

## Recommendation system

In [1]:
import pandas as pd
import numpy as np

#Read the two input tables (comma is read_csv's default separator)
df = pd.read_csv("ratings.csv") #One row per rating given by a user to a movie
moviedf = pd.read_csv("movies.csv") #Lookup table for the movieIds

#Distinct ids occurring in the ratings table
users = set(df['userId'])
movies = set(df['movieId'])

In [2]:
#Similarity score based on the euclidean distance
def euclidean_dist(i,j):
    """Return 1/(1+d) for users i and j, where d is the sum of squared
    rating differences over the movies both of them rated.

    Ratings are read from the global frame `df` (columns userId, movieId,
    rating). Returns 0 when the two users share no movie; identical ratings
    on every shared movie give 1.
    """
    ratings_i = df[df['userId'] == i] #Rows belonging to user i
    ratings_j = df[df['userId'] == j] #Rows belonging to user j

    #Inner join keeps only the movies rated by both (rating_x is i, rating_y is j)
    shared = ratings_i.merge(ratings_j,on='movieId')
    if(len(shared) == 0):
        return 0

    #Sum of squared differences, then turned into a similarity
    sq_dist = sum((shared['rating_y'] - shared['rating_x']) ** 2)
    return 1/(1+sq_dist)

In [3]:
#Computing Pearson score
def pearson_dist(i,j):
    """Pearson correlation between the ratings of users i and j.

    Only the movies rated by both users are used. Ratings are read from the
    global frame `df` (columns userId, movieId, rating).

    Parameters
    ----------
    i, j : userId values to compare.

    Returns
    -------
    A score in [-1, 1]. 0 is returned when the users have no movie in common
    or when the common ratings of either user have no variance (which
    includes the case of a single common movie).
    """

    Xi = df.loc[df['userId'] == i] #All rows having userId i
    Xj = df.loc[df['userId'] == j] #All rows having userId j
    comm = pd.merge(Xi,Xj,on=['movieId']) #rating_x belongs to i, rating_y to j

    n = len(comm.index)
    if(n == 0):
        return 0

    ri = comm['rating_x']
    rj = comm['rating_y']

    sumi = ri.sum()
    sumj = rj.sum()

    #n*covariance and n*variance written with the usual sum formulas
    num = (ri * rj).sum() - (sumi*sumj/n)
    vari = (ri * ri).sum() - pow(sumi,2)/n
    varj = (rj * rj).sum() - pow(sumj,2)/n

    #A variance that is mathematically zero can come out slightly negative
    #after rounding; taking the square root would then give NaN, which would
    #silently pass a "== 0" test. Check every term on its own instead.
    if(not (vari > 0 and varj > 0)):
        return 0

    #Rounding can also push the ratio marginally outside [-1, 1]
    return np.clip(num/np.sqrt(vari * varj), -1.0, 1.0)


In [4]:
#Getting the top n Matches for a person
def topMatches(person,n):
    """Return the n users most similar to `person`.

    Every other user in the global set `users` is scored against `person`
    with pearson_dist; `person` is never compared with themselves.

    Returns
    -------
    List of (score, userId) tuples, best score first, at most n entries.
    """
    #The filter has to test the loop variable `user`; comparing the whole
    #`users` set with person is always True and lets person match themselves
    scores = [(pearson_dist(person,user),user)
                 for user in users if user != person]

    scores.sort(reverse=True)
    return scores[:n]

In [5]:
#Recommendations of unseen movies for the person along with the expected rating that the person would give 
#upon watching the movie

def recos(person,n=10):
    """Recommend movies that `person` has not rated yet.

    The expected rating of a movie is the similarity-weighted average of the
    ratings given by the other users, where the weight is pearson_dist and
    only users with a strictly positive similarity contribute.

    Parameters
    ----------
    person : userId to recommend for.
    n : maximum number of recommendations to return (default 10).

    Returns
    -------
    List of (expected rating, movieId) tuples, highest expected rating first.
    """

    #Movies the person has already rated; does not depend on the other user,
    #so it is computed once instead of once per loop iteration
    seen = set(df.loc[df['userId'] == person,'movieId'])

    #Per movie: sum of rating*similarity and sum of similarity.
    #Plain dicts are used because recent pandas versions refuse a set as
    #`columns`, which the one-row accumulator frames would have needed.
    totals = {}
    sims = {}

    for user in users: #Going over all users
        if(user == person):
            continue

        dist = pearson_dist(person,user) #Using Pearson measure to compute distance
        if(not dist > 0): #Skip zero, negative and NaN similarities
            continue

        Xj = df.loc[df['userId'] == user] #Ratings given by user
        diff = Xj[~Xj['movieId'].isin(seen)] #Movies which person has not seen

        for movie,rating in zip(diff['movieId'],diff['rating']):
            totals[movie] = totals.get(movie,0) + rating * dist
            sims[movie] = sims.get(movie,0) + dist

    #Every movie in sims received at least one strictly positive weight
    ans = [(totals[movie]/sims[movie],movie) for movie in sims]
    ans.sort(reverse=True)
    return ans[:n] #Returning top n movies

In [6]:
x = np.random.choice(list(users)) #Picking a random person
print("User id",x)

y = recos(x) #Suggestions as (expected rating, movieId) pairs
for (rating,movieId) in y:
    row = moviedf.loc[moviedf['movieId'] == movieId] #Getting the row corresponding to movieId
    #get_values() no longer exists since pandas 1.0; positional access
    #reads the same cell (first matching row, second column = movie name)
    print(rating,row.iloc[0,1])

User id 203
5.000000000000001 Belle époque (1992)
5.000000000000001 Mephisto (1981)
5.000000000000001 Bent (1997)
5.000000000000001 Assignment, The (1997)
5.000000000000001 Lesson Faust (1994)
5.0 Won't You Be My Neighbor? (2018)
5.0 De platte jungle (1978)
5.0 Blue Planet II (2017)
5.0 Loving Vincent (2017)
5.0 Black Mirror
