In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# First we load the dataset
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv("ratings.csv")
tag_relevance = pd.read_csv('tag_relevance.csv')
user_info = pd.read_csv('user_info.csv')
genres = pd.read_csv('genres.csv')

In [3]:
# Helper functions which we will use later

# Drops columns of a dataframe 
def drop_df_columns(df, columns_to_drop, axis = 1, inplace=True):
    return df.drop(columns_to_drop, axis=axis, inplace=inplace)

# Returns genre of a movie based on its title 
def add_genres(genre, movie_genres):
    if genre in movie_genres.split('|'): return 1
    else: return 0

# Returns released year of the movie    
def add_year(movie_title):
    year = movie_title[-5:-1]
    if year.isnumeric() == True: return int(year)
    else: return int(1899)

# Returns year group of a movie    
def group_by_year(year):
    if (year < 1900): return 0
    elif (1900 <= year <= 1975): return 1
    elif (1976 <= year <= 1995): return 2
    elif (1996 <= year <= 2003): return 3
    elif (2004 <= year <= 2009): return 4
    elif (2010 <= year): return 5
    else: return 0
    
# Set columns of a dataframe
def set_columns(df, new_columns):
    df.columns = new_columns
    return df

# Returns the rating group of movies based on rating count 
def group_by_ratings(count):
    if (count <= 1): return 0
    elif (2 <= count <= 10): return 1
    elif (11 <= count <= 100): return 2
    elif (101 <= count <= 1000): return 3
    elif (1001 <= count <= 5000): return 4
    elif (count >= 5001): return 5
    else: return 0
    
# Returns similar movies based on input movie    
def get_similar_movies(similarity_data, movieId):
    df = similarity_data.loc[similarity_data.index == movieId].reset_index(). \
    melt(id_vars='movieId', var_name='similar_movieId', value_name='relevance'). \
    sort_values('relevance', axis=0, ascending=False)[1:6]
    return df

# Returns a dataframe with genres of a given movie
def get_genres_by_movie(movies_genres_data, movieId):
    movie = movies_genres_data[movies_genres_data['movieId']==movieId]
    genres = movie['genres'].tolist()
    df = pd.DataFrame([b for a in [genre.split('|') for genre in genres] for b in a], columns=['genres'])
    df.insert(loc=0, column='movieId', value=movieId)
    return df

# Returns favorite genre for a given user
def get_favorite_genre(genres_by_user, userId):
    user = genres_by_user[genres_by_user['userId']==userId]
    genres = user['genres'].tolist()
    movie_list = [b for a in [genre.split('|') for genre in genres] for b in a]
    counter = Counter(movie_list)
    return counter.most_common(1)[0][0]

# Exports the data to a csv
def export_to_csv(df, file_name_path, sep=',', header=True, index=False):
    df.to_csv(file_name_path, sep=sep, header=header, index=index)
    
def group_by_age(age):
    if age in range(1,11): return 0
    elif age in range(11,21): return 1
    elif age in range(21,31): return 2
    elif age in range(31,41): return 3
    elif age in range(41,51): return 4
    elif (age >= 51): return 5
    else: return 0

In [4]:
# Create pivot table to compare the movies relevance based on tags
relevance_scores_pivot = tag_relevance.pivot_table(index = ["movieId"],columns = ["tagId"],values = "relevance").reset_index()

In [5]:
# Join the above table with actual movies dataset by movieId
movies_by_tag_relevance = movies.merge(relevance_scores_pivot, left_on='movieId', right_on='movieId', how='left')
movies_by_tag_relevance = movies_by_tag_relevance.fillna(0)
# movies_by_tag_relevance.head()

In [6]:
# Remove unnecessary columns
drop_df_columns(movies_by_tag_relevance, ['title','genres'])

In [7]:
movies_by_tag_relevance.set_index('movieId', inplace=True)
# movies_by_tag_relevance

In [8]:
# Create a duplicate of movies dataset to add generes to each movie
movies_by_genres = movies.copy()

# Now we will categorize movies by genres
for genre in genres['genres'].tolist():
    movies_by_genres[genre] = movies_by_genres.apply(lambda movie: add_genres(genre, movie['genres']), axis=1)

In [9]:
# Remove unnecessary columns
drop_df_columns(movies_by_genres, ['title','genres'])
movies_by_genres.set_index('movieId', inplace=True)

In [10]:
# We will create a separate column for year which the movie was released
# This information is already available in the movie title
# We just split it to get the year
key = 'year'
movies_by_year = movies.copy()
# We strip the movie title to remove an whitespaces and add released year to the dataframe
movies_by_year[key] = movies_by_year.apply(lambda movie: add_year(movie['title'].strip()), axis=1)
# movies_by_year

In [11]:
# We group movies by their year
movies_by_year['year_group'] = movies_by_year.apply(lambda movie: group_by_year(movie['year']), axis=1)

In [12]:
# We don't need the title, genres, and year anymore, so we will drop all of them
drop_df_columns(movies_by_year, ['title','genres','year'])

In [13]:
# We calculate the number of ratings alognside the ratings average for each movie
ratings_aggregate = ratings.groupby(['movieId']).agg({'rating': [np.size, np.mean]}).reset_index()
ratings_aggregate = set_columns(ratings_aggregate, ['movieId','#ratings', 'rating_avg'])

In [14]:
# We categorize movies into groups based on ratings count
ratings_aggregate['rating_group'] = ratings_aggregate.apply(lambda movie: group_by_ratings(int(movie['#ratings'])), axis=1)

In [15]:
# We don't need the ratings count anymore, so we will drop it
drop_df_columns(ratings_aggregate, ['#ratings'])

In [16]:
# Join the movies by year with ratings data
movies_by_ratings = movies_by_year.merge(ratings_aggregate, left_on='movieId', right_on='movieId', how='left')
movies_by_ratings = movies_by_ratings.fillna(0)
movies_by_ratings.set_index('movieId', inplace=True)

In [17]:
# We calculate the cosine similarity based on tag relevance values
tags_similarity = cosine_similarity(movies_by_tag_relevance.values)

In [18]:
# We calculate the cosine similarity based on genres
genres_similarity = cosine_similarity(movies_by_genres.values)

In [19]:
# We calculate the cosine similarity based on movie ratings
ratings_similarity = cosine_similarity(movies_by_ratings.values)

In [20]:
# We calculate the similarity by assigning some weight as shown below
similarity = (tags_similarity * 0.4) + (genres_similarity * 0.3) + (ratings_similarity * 0.3)

In [21]:
# Create a dataframe with the similarity score
similarity_data = pd.DataFrame(similarity, columns=movies_by_tag_relevance.index.values, index=movies_by_tag_relevance.index)

In [22]:
movies_similarity = pd.DataFrame(columns=['movieId','similar_movieId','relevance'])
for movie in similarity_data.index.tolist():
    movies_similarity = movies_similarity.append(get_similar_movies(similarity_data, movie))

In [23]:
# Recommends movie based on the input movie
def recommend_movies(movieId):
    df = similarity_data.loc[similarity_data.index == movieId].reset_index(). \
            melt(id_vars='movieId', var_name='similar_movieId', value_name='relevance'). \
            sort_values('relevance', axis=0, ascending=False)[1:6]
    df['similar_movieId'] = df['similar_movieId'].astype(int)
    similar_movies = movies.merge(df, left_on='movieId', right_on='similar_movieId', how='inner'). \
                sort_values('relevance', axis=0, ascending=False). \
                loc[: , ['movieId_y','title','genres']]. \
                rename(columns={ 'movieId_y': "movieId" })
    return similar_movies

In [24]:
recommend_movies(100)

Unnamed: 0,movieId,title,genres
0,100,"Juror, The (1996)",Drama|Thriller
3,100,Murder in the First (1995),Drama|Thriller
4,100,"Firm, The (1993)",Drama|Thriller
1,100,Disclosure (1994),Drama|Thriller
2,100,Death and the Maiden (1994),Drama|Thriller


In [25]:
# We extract the list of users from ratings provided
users = pd.DataFrame(ratings['userId'].unique(), columns=['userId'])

In [26]:
# We create a relationship between movies and ratings
movies_without_genres = movies.drop('genres', axis = 1)
ratings_aggregate_without_group = drop_df_columns(ratings_aggregate, ['rating_group'], 1, False)
movies_by_rating_avg = pd.DataFrame(columns=['movieId','title','rating_avg'])
movies_by_rating_avg = movies_without_genres.merge(ratings_aggregate_without_group, left_on='movieId', right_on='movieId', how='left')

In [27]:
# We will use ratings data to create a relationship between movies and users
movies_by_users = drop_df_columns(ratings, ['timestamp'], 1, False)

In [28]:
# Gets the genres of a movie
movies_genres_data = drop_df_columns(movies, ['title'], 1, False)
movies_genres=pd.DataFrame(columns=['movieId','genres'])
for movie in movies_genres_data['movieId'].tolist():
    movies_genres=movies_genres.append(get_genres_by_movie(movies_genres_data, movie))

In [29]:
# Gets genres by user so that we can calculate favorite genres of a user
genres_by_user = ratings.merge(movies, left_on='movieId', right_on='movieId', how='left')
drop_df_columns(genres_by_user, ['movieId','rating','timestamp','title'])

In [None]:
# Calculate user's favorite genre
users_favorite_genres = pd.DataFrame(columns=['userId','genre'])
for userId in genres_by_user['userId'].tolist():
    users_favorite_genres = users_favorite_genres.append(pd.DataFrame([[userId, get_favorite_genre(genres_by_user, userId)]], columns=['userId','genre']))
users_favorite_genres

In [39]:
# Now we format the user information for age
import math

user_info['age_group'] = user_info.apply(lambda user: group_by_age(math.floor(user['age'])), axis=1)
drop_df_columns(user_info, ['random', 'occupation'])

KeyError: "['random' 'occupation'] not found in axis"

In [None]:
user_ages_aggregate = user_info.groupby(['age_group']).agg({'age': [np.size, np.mean]}).reset_index()

user_ages_aggregate

In [None]:
# user_ages_aggregate.columns = ['age_group']

In [345]:
export_to_csv(users, './cleaned_data/users.csv')
export_to_csv(movies_by_year, './cleaned_data/movies.csv')
export_to_csv(movies_by_users, './cleaned_data/movies_by_users.csv')
export_to_csv(movies_genres, './cleaned_data/movies_genres.csv')
export_to_csv(users_favorite_genres, './cleaned_data/users_favorite_genres.csv')
export_to_csv(movies_similarity, './cleaned_data/movies_similarity.csv')
export_to_csv(user_info, './cleaned_data/users_by_age.csv')
export_to_csv(user_ages_aggregate, './cleaned_data/user_age_groups.csv')