In [1]:
# Importing the required libraries.
import pandas as pd
from sklearn.model_selection import train_test_split
from math import pow, sqrt

In [2]:
# Reading users dataset into a pandas dataframe object.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('data/users.dat', sep='::', names=u_cols,
 encoding='latin-1')

  after removing the cwd from sys.path.


In [3]:
# users.head()

In [4]:
# Reading ratings dataset into a pandas dataframe object.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('data/ratings.dat', sep='::', names=r_cols,
 encoding='latin-1')

  after removing the cwd from sys.path.


In [5]:
# ratings.head()

In [6]:
# Reading movies dataset into a pandas dataframe object.
m_cols = ['movie_id', 'movie_title', 'genre']
movies = pd.read_csv('data/movies.dat', sep='::', names=m_cols, encoding='latin-1')

  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# movies.head()

In [8]:
movies.genre = movies.genre.str.split('|')

genre_columns = list(set([j for i in movies['genre'].tolist() for j in i]))

# Iterating over every list to create and fill values into columns.
for j in genre_columns:
    movies[j] = 0
for i in range(movies.shape[0]):
    for j in genre_columns:
        if(j in movies['genre'].iloc[i]):
            movies.loc[i,j] = 1

In [9]:
# movies.head()

In [10]:
split_values = movies['movie_title'].str.split("(", n = 1, expand = True) 

movies.movie_title = split_values[0]
movies['release_year'] = split_values[1]

# Cleaning the release_year series and dropping 'genre' columns as it has already been one hot encoded.
movies['release_year'] = movies.release_year.str.replace(')','')
movies.drop('genre',axis=1,inplace=True)

Let's visualize all the dataframes after all the preprocessing we did.

In [11]:
movies.head()

Unnamed: 0,movie_id,movie_title,Action,Drama,Mystery,Children's,Documentary,Fantasy,Musical,Western,...,Adventure,Film-Noir,War,Crime,Romance,Horror,Sci-Fi,Comedy,Thriller,release_year
0,1,Toy Story,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1995
1,2,Jumanji,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1995
2,3,Grumpier Old Men,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1995
3,4,Waiting to Exhale,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1995
4,5,Father of the Bride Part II,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1995


In [12]:
# ratings.head()

In [13]:
# users.head()

In [14]:
# ratings.shape

In [15]:
#Function to get the rating given by a user to a movie.
def get_rating_(userid,movieid):
    return (ratings.loc[(ratings.user_id==userid) & (ratings.movie_id == movieid),'rating'].iloc[0])

# Function to get the list of all movie ids the specified user has rated.
def get_movieids_(userid):
    return (ratings.loc[(ratings.user_id==userid),'movie_id'].tolist())

# Function to get the movie titles against the movie id.
def get_movie_title_(movieid):
    return (movies.loc[(movies.movie_id == movieid),'movie_title'].iloc[0])

# Similarity Scores

In [16]:
def distance_similarity_score(user1,user2):
    '''
    user1 & user2 : user ids of two users between which similarity score is to be calculated.
    '''
    both_watch_count = 0
    for element in ratings.loc[ratings.user_id==user1,'movie_id'].tolist():
        if element in ratings.loc[ratings.user_id==user2,'movie_id'].tolist():
            both_watch_count += 1
    if both_watch_count == 0 :
        return 0
    distance = []
    for element in ratings.loc[ratings.user_id==user1,'movie_id'].tolist():
        if element in ratings.loc[ratings.user_id==user2,'movie_id'].tolist():
            rating1 = get_rating_(user1,element)
            rating2 = get_rating_(user2,element)
            distance.append(pow(rating1 - rating2, 2))
    total_distance = sum(distance)
    return 1/(1+sqrt(total_distance))

In [17]:
distance_similarity_score(1,310)

0.14459058185587106

Calculating Similarity Scores based on the distances have an inherent problem. We do not have a threshold to decide how much more distance between two users is to be considered for calculating whether the users are close enough or far enough. On the other side, this problem is resolved by pearson correlation method as it always returns a value between -1 & 1 which clearly provides us with the boundaries for closeness as we prefer.

In [18]:
def pearson_correlation_score(user1,user2):
    '''
    user1 & user2 : user ids of two users between which similarity score is to be calculated.
    '''
    both_watch_count = []
    for element in ratings.loc[ratings.user_id==user1,'movie_id'].tolist():
        if element in ratings.loc[ratings.user_id==user2,'movie_id'].tolist():
            both_watch_count.append(element)
    if len(both_watch_count) == 0 :
        return 0
    rating_sum_1 = sum([get_rating_(user1,element) for element in both_watch_count])
    rating_sum_2 = sum([get_rating_(user2,element) for element in both_watch_count])
    rating_squared_sum_1 = sum([pow(get_rating_(user1,element),2) for element in both_watch_count])
    rating_squared_sum_2 = sum([pow(get_rating_(user2,element),2) for element in both_watch_count])
    product_sum_rating = sum([get_rating_(user1,element) * get_rating_(user2,element) for element in both_watch_count])
    
    numerator = product_sum_rating - ((rating_sum_1 * rating_sum_2) / len(both_watch_count))
    denominator = sqrt((rating_squared_sum_1 - pow(rating_sum_1,2) / len(both_watch_count)) * (rating_squared_sum_2 - pow(rating_sum_2,2) / len(both_watch_count)))
    if denominator == 0:
        return 0
    return numerator/denominator

In [19]:
pearson_correlation_score(1,310)

0.1453526052506179

## Most Similar Users


In [20]:
def most_similar_users_(user1,number_of_users,metric='pearson'):
    '''
    user1 : Targeted User
    number_of_users : number of most similar users you want to user1.
    metric : metric to be used to calculate inter-user similarity score. ('pearson' or else)
    '''
    # Getting distinct user ids.
    user_ids = ratings.user_id.unique().tolist()
    
    # Getting similarity score between targeted and every other suer in the list(or subset of the list).
    if(metric == 'pearson'):
        similarity_score = [(pearson_correlation_score(user1,nth_user),nth_user) for nth_user in user_ids[:100] if nth_user != user1]
    else:
        similarity_score = [(distance_similarity_score(user1,nth_user),nth_user) for nth_user in user_ids[:100] if nth_user != user1]
    
    # Sorting in descending order.
    similarity_score.sort()
    similarity_score.reverse()
    
    # Returning the top most 'number_of_users' similar users. 
    return similarity_score[:number_of_users]

# Getting Movie Recommendations for Targeted User

In [21]:
def get_recommendation_(userid):
    user_ids = ratings.user_id.unique().tolist()
    total = {}
    similariy_sum = {}
    
    # Iterating over subset of user ids.
    for user in user_ids[:100]:
        
        # not comparing the user to itself (obviously!)
        if user == userid:
            continue
        
        # Getting similarity score between the users.
        score = pearson_correlation_score(userid,user)
        
        # not considering users having zero or less similarity score.
        if score <= 0:
            continue
        
        # Getting weighted similarity score and sum of similarities between both the users.
        for movieid in get_movieids_(user):
            # Only considering not watched/rated movies
            if movieid not in get_movieids_(userid) or get_rating_(userid,movieid) == 0:
                total[movieid] = 0
                total[movieid] += get_rating_(user,movieid) * score
                similariy_sum[movieid] = 0
                similariy_sum[movieid] += score
    
    # Normalizing ratings
    ranking = [(tot/similariy_sum[movieid],movieid) for movieid,tot in total.items()]
    ranking.sort()
    ranking.reverse()
    
    # Getting movie titles against the movie ids.
    recommendations = [get_movie_title_(movieid) for score,movieid in ranking]
    return recommendations[:10]

In [22]:
print(most_similar_users_(1,5))

[(0.875, 72), (0.8703882797784892, 66), (0.8660254037844402, 57), (0.8660254037844355, 40), (0.8164965809277261, 47)]


In [23]:
print(get_recommendation_(320))

['Contender, The ', 'Requiem for a Dream ', 'Bamboozled ', 'Invisible Man, The ', 'Creature From the Black Lagoon, The ', 'Hellraiser ', 'Almost Famous ', 'Way of the Gun, The ', 'Shane ', 'Naked Gun 2 1/2: The Smell of Fear, The ']
