In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the movie metadata and keep only the first row for each `id` value.
movie_metadata = (
    pd.read_csv('data/movies_metadata.csv', low_memory=False)
    .drop_duplicates(subset='id')
)
movie_metadata.shape

(45435, 24)

In [3]:
# Read the ratings table and discard its `timestamp` column.
ratings = (
    pd.read_csv('data/ratings.csv', low_memory=False)
    .drop(columns='timestamp')
)
ratings.shape

(26024289, 3)

In [4]:
# Side tables that are joined onto the movie metadata through the `id` column.
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')
credits['id'] = credits['id'].astype('int')
keywords['id'] = keywords['id'].astype('int')

# Remove three rows by index label, then every remaining row whose `id`
# is not made of digits only, so the column can be cast to int for the joins.
movie_metadata = movie_metadata.drop(index=[19730, 29503, 35587])
non_integer_rows = movie_metadata.loc[~movie_metadata['id'].astype(str).str.isdigit()]
movie_metadata = movie_metadata.drop(index=non_integer_rows.index)
movie_metadata['id'] = movie_metadata['id'].astype('int')

# Inner joins: only movies present in all three tables survive.
movie_metadata = (
    movie_metadata
    .merge(credits, on='id', how='inner')
    .merge(keywords, on='id', how='inner')
)
movie_metadata.shape

(46488, 27)

In [None]:
# These columns are parsed with `literal_eval`, turning each cell's text
# into the Python object it spells out.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    parsed_column = movie_metadata[feature].map(literal_eval)
    movie_metadata[feature] = parsed_column


def get_director(x):
    """Return the ``name`` of the first entry in ``x`` whose ``job`` is 'Director'.

    ``x`` is iterated as a sequence of dict-like crew entries. When no entry
    matches (including an empty sequence) ``np.nan`` is returned.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)


def get_list(x):
    """Return up to the first three ``name`` values from a list of dicts.

    Anything that is not a ``list`` yields an empty list.
    """
    if not isinstance(x, list):
        return []
    # Every entry's name is read before truncating to three.
    all_names = [entry['name'] for entry in x]
    return all_names[:3]


# Derive the director from the crew list, then reduce each list-valued
# feature to (at most) its first three names.
movie_metadata['director'] = movie_metadata['crew'].map(get_director)
features = ['cast', 'keywords', 'genres']
for feature in features:
    shortened = movie_metadata[feature].map(get_list)
    movie_metadata[feature] = shortened

preview_columns = ['id', 'title', 'cast', 'director', 'keywords', 'genres']
movie_metadata[preview_columns].head()

In [None]:
def clean_data(x):
    """Strip spaces and lowercase a string, or every string in a list.

    Returns a list for list input, a string for string input, and an empty
    string for any other input.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''


# Normalise every text feature (spaces removed, lowercased) via clean_data.
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    cleaned = movie_metadata[feature].map(clean_data)
    movie_metadata[feature] = cleaned

preview_columns = ['id', 'title', 'cast', 'director', 'keywords', 'genres']
movie_metadata[preview_columns].head()

In [None]:
def create_soup(x):
    """Build one space-separated string from a row's text features.

    ``x`` is indexed by the keys 'keywords', 'cast', 'director' and 'genres'.
    The list-valued fields are joined with spaces, and the four parts are
    concatenated in that order with a single space between them.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)


# One "soup" string per movie row, built by create_soup.
movie_metadata['soup'] = movie_metadata.apply(create_soup, axis='columns')
movie_metadata.loc[:, ['soup']].head()

In [None]:
# Token-count matrix over the soup strings: one row per movie row,
# with English stop words left out of the vocabulary.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movie_metadata.loc[:, 'soup'])
count_matrix.shape

In [None]:
# Pairwise cosine similarity between every pair of rows of count_matrix.
# With a single argument the rows are compared against themselves.
# NOTE(review): the result is square in the number of movie rows, so it is
# large in memory -- confirm the machine running this has room for it.
cosine_sim = cosine_similarity(count_matrix)
cosine_sim.shape

In [None]:
# Re-label the rows 0..n-1 so that a row's label equals its position, which
# is also its row in count_matrix / cosine_sim (built from the same row order).
movie_metadata = movie_metadata.reset_index()

# Lookup table: movie id -> row position.
indices = pd.Series(movie_metadata.index, index=movie_metadata['id'])
# The inner merges with credits/keywords leave some ids on more than one row.
# A label lookup on a repeated id returns a Series instead of one position,
# which breaks every `cosine_sim[indices[movie_id]]` lookup downstream, so
# only the first position of each id is kept.
indices = indices[~indices.index.duplicated(keep='first')]
indices = indices.sort_values(ascending=False)

In [None]:
def estimate(user_id, movies):
    """Predict the user's rating for candidate movies and return the ten best.

    For every distinct id in ``movies['id']`` that the user has not rated, the
    prediction is the similarity-weighted mean of the user's own ratings::

        sum(sim(candidate, rated) * rating) / sum(sim(candidate, rated))

    taken over the movies the user rated that also appear in
    ``movie_metadata``. The prediction is 0 when the similarities sum to zero
    (which includes the case of a user with no usable ratings).

    Reads the notebook globals ``ratings``, ``movie_metadata``, ``indices``
    and ``cosine_sim``.

    Parameters
    ----------
    user_id
        Value matched against ``ratings['userId']``.
    movies : pandas.DataFrame
        Must have an ``id`` column holding the candidate movie ids.

    Returns
    -------
    list of tuple
        Up to ten ``(movie_id, predicted_rating)`` pairs, highest prediction
        first.

    Raises
    ------
    KeyError
        If a candidate or rated id is missing from ``indices``.
    """
    def _position(movie_id):
        # A label lookup on `indices` returns a Series when the id sits on
        # several rows; the first position is used in that case. This replaces
        # the former hard-coded row-count check, which silently zeroed every
        # prediction whenever the data set had a different number of rows.
        hit = indices[movie_id]
        return int(hit.iloc[0] if isinstance(hit, pd.Series) else hit)

    # The user's ratings restricted to movies we have metadata for, one per id.
    own = ratings[ratings['userId'] == user_id]
    own = own[own['movieId'].isin(movie_metadata['id'])].drop_duplicates(subset='movieId')
    rated = dict(zip(own['movieId'].astype(int), own['rating'].astype(float)))

    # Resolved once, instead of once per (candidate, rated movie) pair.
    rated_positions = np.array([_position(movie_id) for movie_id in rated], dtype=int)
    rated_values = np.array(list(rated.values()), dtype=float)

    pred_series = []
    # dict.fromkeys de-duplicates the candidates while keeping their order, so
    # the same movie cannot take several of the ten result slots.
    for movie_id in dict.fromkeys(movies['id']):
        if movie_id in rated:
            continue
        similarities = np.asarray(cosine_sim[_position(movie_id)], dtype=float)[rated_positions]
        sim_total = float(similarities.sum())
        weighted_sum = float((similarities * rated_values).sum())
        predicted_rating = weighted_sum / sim_total if sim_total != 0 else 0
        pred_series.append((movie_id, predicted_rating))

    return sorted(pred_series, key=lambda pair: pair[1], reverse=True)[:10]


In [None]:
def get_recommendations_single_movie(id, rating, cosine_sim=cosine_sim):
    """Return the ten movies most similar to one movie, weighted by a rating.

    Parameters
    ----------
    id
        Movie id, looked up in the notebook global ``indices``. (The name
        shadows the builtin ``id``; it is kept because callers may pass it by
        keyword.)
    rating
        Factor every similarity is multiplied by.
    cosine_sim
        Similarity matrix indexed by row position. The default is the matrix
        that existed when this cell was executed, so re-run this cell after
        rebuilding ``cosine_sim``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``title`` and ``similarity_score`` (similarity times
        ``rating``) for up to ten rows of ``movie_metadata``, best first.

    Raises
    ------
    KeyError
        If ``id`` is not present in ``indices``.
    """
    # A label lookup yields one position, or several when the id sits on
    # several rows of movie_metadata; normalise both cases to a 1-D array.
    own_positions = np.atleast_1d(np.asarray(indices[id])).astype(int)
    idx = own_positions[0]

    weighted_scores = np.asarray(cosine_sim[idx], dtype=float) * rating
    # Stable descending order: equal scores stay in ascending row order,
    # matching Python's stable sort with reverse=True.
    ranked = np.argsort(-weighted_scores, kind='stable')
    # Drop the queried movie explicitly. Slicing off the first entry is only
    # right when the movie ranks first, which fails on ties or an all-zero row.
    top = ranked[~np.isin(ranked, own_positions)][:10]

    # .assign builds a new frame, so no column is written onto a slice.
    return (
        movie_metadata[['id', 'title']]
        .iloc[top]
        .assign(similarity_score=weighted_scores[top])
    )


In [None]:
def get_recommendations(user_id):
    """Recommend movies for a user from the user's ten highest-rated movies.

    Steps:
      1. Take the user's ratings for movies present in ``movie_metadata`` and
         keep the ten highest.
      2. Collect similar movies for each of them with
         ``get_recommendations_single_movie``.
      3. Score the collected candidates with ``estimate`` and attach titles.

    Reads the notebook globals ``ratings`` and ``movie_metadata``.

    Parameters
    ----------
    user_id
        Value matched against ``ratings['userId']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``predicted_rating`` and ``title``, one row per
        recommended movie. Empty (same columns) when the user has no rating
        for any movie in ``movie_metadata``.
    """
    result_columns = ['id', 'predicted_rating', 'title']

    user_ratings = ratings.loc[ratings['userId'] == user_id]
    # Membership filter rather than a merge: ids repeated in movie_metadata
    # would otherwise duplicate a rating and fill several of the ten slots.
    user_ratings = user_ratings[user_ratings['movieId'].isin(movie_metadata['id'])]
    top_rated = user_ratings.sort_values(by='rating', ascending=False).head(10)

    if top_rated.empty:
        # pd.concat raises on an empty list, so return an empty result instead.
        return pd.DataFrame(columns=result_columns)

    # zip keeps each column's own dtype; iterrows would turn movieId into a
    # float because it shares a row with the float rating.
    per_movie = [
        get_recommendations_single_movie(int(movie_id), rating)
        for movie_id, rating in zip(top_rated['movieId'], top_rated['rating'])
    ]
    candidates = pd.concat(per_movie).sort_values(by='similarity_score', ascending=False)

    predictions = pd.DataFrame(estimate(user_id, candidates),
                               columns=['id', 'predicted_rating'])
    # One title per id, so a repeated id cannot duplicate a recommendation row.
    titles = movie_metadata[['id', 'title']].drop_duplicates(subset='id')
    return pd.merge(predictions, titles, on='id', how='inner')

In [None]:
# Show the recommendations computed for the user with id 222.
get_recommendations(user_id=222)