In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Prepare data

In [2]:
data_path = "../Data/ml-25m/"
movies = pd.read_csv(data_path + "movies.csv")
ratings = pd.read_csv(data_path + "usable_ratings.csv")
genome_tags = pd.read_csv(data_path + "genome-tags.csv")
genome_scores = pd.read_csv(data_path + "genome-scores.csv")
genome_scores = pd.merge(genome_scores, genome_tags, on='tagId', how='inner')

print(f"# Movies = {ratings['movieId'].nunique()}")
print(f"# Users = {ratings['userId'].nunique()}")
print(f"# Ratings = {len(ratings)}")
print(f"# Tags = {len(genome_tags)}")

# Movies = 2613
# Users = 13671
# Ratings = 7762927
# Tags = 1128


## Set avg rating column for each movie in movies DF

In [3]:
# mean is about all users including those who did't vote for that movie
gr_movie_ratings = ratings.groupby(by="movieId")
no_users = ratings['userId'].nunique()
mean_rating_df = pd.DataFrame([], columns=["movieId", "meanRating"])
tmp_data = [[], []]

for name, group in gr_movie_ratings:
    mean_rating = group["rating"].sum() / no_users
    tmp_data[0].append(name)
    tmp_data[1].append(mean_rating)

mean_rating_df["movieId"] = tmp_data[0]
mean_rating_df["meanRating"] = tmp_data[1]

movies = pd.merge(movies, mean_rating_df, on="movieId", how="left")

## Extract base movie categories

In [4]:
categs_col_strs = list(movies["genres"])
categs_col_strs = [categ_str.split("|") for categ_str in categs_col_strs]
base_categs_list = []
for categs_str in categs_col_strs:
    base_categs_list += categs_str


base_categs_list = list(set(base_categs_list))
base_categs_list.remove("(no genres listed)")
base_categs_list.sort()
print("Number of usable categories = ", len(base_categs_list))

Number of usable categories =  19


## Prepare data for KNN the classifier

In [5]:
# piv_matrix = valid_ratings.pivot(index="movieId", columns="userId", values="rating")
piv_matrix = ratings.pivot(index="userId", columns="movieId", values="rating")
piv_matrix.fillna(0, inplace=True)

In [6]:
# fill NaN's with the mean

# for col in list(piv_matrix.columns):
#     mean = piv_matrix[col].mean()
#     piv_matrix[col].fillna(mean, inplace=True)

In [7]:
csr_data = csr_matrix(piv_matrix)
csr_data = csr_data.transpose() #! issue in col and row

In [8]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_data)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

# Recommendation

In [9]:
def get_movie_id(movie_name):
    try:
        ids = movies[movies['title'].str.lower().str.contains(movie_name.lower().strip())]["movieId"].values
        return ids
    except:
        # if does not exist reutrn []
        print("ERR")
        return []

## Rating Recommendation : Based similar ratings by all users (similar combination of ratings)

In [10]:
def recommend_based_on_similar_ratings(movie_id, no_of_similars = 5):
    # assumes a valid movie_id : int

    recommended_movies = []
    recommended_movies_with_dist = []
    list_of_columns = list(piv_matrix.columns)
    
    try:
        movie_index = list_of_columns.index(movie_id)
        distances , indices = knn.kneighbors(csr_data[movie_index],n_neighbors= no_of_similars + 1)
        tmp = zip(list(distances[0]), list(indices[0]))

        for item in tmp:
            recommended_movies_with_dist.append(item)
        
    except:
        # failed to find a movie
        # print("failed to find a movie")
        pass
            
    recommended_movies_with_dist = sorted(list(set(recommended_movies_with_dist)))

    for item in recommended_movies_with_dist:
        movie_idx = item[1]
        new_movie_id = list_of_columns[movie_idx]

        if new_movie_id == movie_id: continue

        recommended_movie_name = movies[movies["movieId"] == new_movie_id]["title"].values[0]
        recommended_movies.append({"id": new_movie_id, "name": recommended_movie_name})

    return recommended_movies

## Identify top categories of a movie

In [11]:
def get_top_categs_of_movie(movie_id, top_k = 5, is_dynamic_k=False):
    # assumes a valid movie_id : int
    # dynamic is for using a threshold for relevance not the static k
    # returns df(tag_id, tag, relevance)

    relevance_thr = 0.6 # used only in case of is_dynamic_k

    categs_sorted_by_relevance = genome_scores[genome_scores["movieId"] == movie_id].sort_values(by=['relevance'], ascending=False)\
                                 [["tagId", "tag", "relevance"]]
    
    base_categs = categs_sorted_by_relevance[categs_sorted_by_relevance["tag"].str.capitalize().isin(base_categs_list)]

        
    if is_dynamic_k == True:
        return base_categs[base_categs["relevance"] >= relevance_thr]

    else:
        top_k_categs = base_categs.iloc[:top_k]
        return top_k_categs

## Category Recommendation : Based similar categories

In [12]:
def recommend_based_on_similar_categs(movie_id, top_k= 5, is_dynamic_k=True
                                        , relevance_per_categ=0.8, min_mean_rating= 2.5
                                        , k_movies_per_categ=3, sorting_order=["meanRating", "relevance"]):
    # assumes a valid movie_id : int
    # return { categ1: [], categ2: [], .... }

    recommendations = {}

    top_categs = list(get_top_categs_of_movie(movie_id, top_k=top_k, is_dynamic_k=is_dynamic_k)["tag"])
    
    # make a DF with movies within each top_categs, each movie has relevance >= relevance_per_categ
    # sorted by the rating and relevance
    valid_movies_categ_relevanve_rating_name = genome_scores[(genome_scores["tag"].isin( top_categs )) & (genome_scores["relevance"] >= relevance_per_categ)]
    valid_movies_categ_relevanve_rating_name = pd.merge(valid_movies_categ_relevanve_rating_name
                                                    , movies[["movieId", "title", "meanRating"]]
                                                    , on="movieId")
    
    valid_movies_categ_relevanve_rating_name = valid_movies_categ_relevanve_rating_name.sort_values(by=sorting_order, ascending=False)

    # fetching and fill k_movies_per_categ
    gr_valid_movies = valid_movies_categ_relevanve_rating_name.groupby(by="tag")

    for tag, group in gr_valid_movies:
        current_movies = []

        for movie in group[:k_movies_per_categ][["movieId", "title"]].values:
            current_movies.append({"id": movie[0], "name": movie[1]})
        recommendations[tag] = current_movies

    return recommendations

In [14]:
# tmp = recommend_based_on_similar_categs(2, is_dynamic_k=False, relevance_per_categ=0.9)
# for key in tmp:
#     print("\n==== ",key," ====\n", tmp[key])

## General Recommendation : Based on similar ratings and similar categories

In [15]:
def general_recommendation(movie_id):
    # assumes a valid movie_id : int
    #* return { similar_movies: [], based_on_categs: { categ1: [], categ2: [], .... } }

    similar_movies = recommend_based_on_similar_ratings(movie_id)
    recommend_based_on_categs = recommend_based_on_similar_categs(movie_id)

    return { "similar_movies": similar_movies, "based_on_categs": recommend_based_on_categs }

## Given a user state, recommend!

In [16]:

#TODO liked list movies by our user
liked_movie_names = ["Toy story", "lion king"]

## Test

In [17]:
movie_name = "Toy story"
recommendations = general_recommendation(get_movie_id("Toy story")[0])

print(f"======= Movie: {movie_name} =======\n")
print("Similar movies based on ratings")
for movie in recommendations["similar_movies"]:
    print(movie)
print("-------------------------------")

print("Based on categories")
for category in recommendations["based_on_categs"]:
    print(category)
    for movie in recommendations["based_on_categs"][category]:
        print(movie)
    print("-------------------------------")

KeyError: "['meanRating'] not in index"

In [27]:
movies.to_csv(data_path + "movies.csv", encoding='utf-8', index=False)