In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader
from sklearn.model_selection import train_test_split
from evaluation import calculate_mse_and_rmse

In [2]:
movie_metadata = pd.read_csv('data/movies_metadata.csv', low_memory=False)
movie_metadata = movie_metadata.drop_duplicates(subset='id')
movie_metadata.shape

(45435, 24)

In [3]:
ratings = pd.read_csv('data/ratings_small.csv', low_memory=False)
ratings = ratings.drop('timestamp', axis=1)
reader = Reader()
ratings_by_users = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
ratings, ratings_test = train_test_split(ratings, test_size=0.2, random_state=42)
ratings.shape

(80003, 3)

In [4]:
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')

movie_metadata = movie_metadata.drop([19730, 29503, 35587])

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
non_integer_rows = movie_metadata[~movie_metadata['id'].astype(str).str.isdigit()]
movie_metadata = movie_metadata.drop(non_integer_rows.index)
movie_metadata['id'] = movie_metadata['id'].astype(int)
movie_metadata['id'] = movie_metadata['id'].astype('int')

movie_metadata = movie_metadata.merge(credits, on='id', how='inner')
movie_metadata = movie_metadata.merge(keywords, on='id', how='inner')
movie_metadata.shape

(46488, 27)

In [5]:
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    movie_metadata[feature] = movie_metadata[feature].apply(literal_eval)


def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan


def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        if len(names) > 3:
            names = names[:3]
        return names
    return []


movie_metadata['director'] = movie_metadata['crew'].apply(get_director)
features = ['cast', 'keywords', 'genres']
for feature in features:
    movie_metadata[feature] = movie_metadata[feature].apply(get_list)

movie_metadata[['id', 'title', 'cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,id,title,cast,director,keywords,genres
0,862,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,8844,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,15602,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"
3,31357,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devine]",Forest Whitaker,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]"
4,11862,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short]",Charles Shyer,"[baby, midlife crisis, confidence]",[Comedy]


In [6]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''


features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    movie_metadata[feature] = movie_metadata[feature].apply(clean_data)

movie_metadata[['id', 'title', 'cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,id,title,cast,director,keywords,genres
0,862,Toy Story,"[tomhanks, timallen, donrickles]",johnlasseter,"[jealousy, toy, boy]","[animation, comedy, family]"
1,8844,Jumanji,"[robinwilliams, jonathanhyde, kirstendunst]",joejohnston,"[boardgame, disappearance, basedonchildren'sbook]","[adventure, fantasy, family]"
2,15602,Grumpier Old Men,"[waltermatthau, jacklemmon, ann-margret]",howarddeutch,"[fishing, bestfriend, duringcreditsstinger]","[romance, comedy]"
3,31357,Waiting to Exhale,"[whitneyhouston, angelabassett, lorettadevine]",forestwhitaker,"[basedonnovel, interracialrelationship, single...","[comedy, drama, romance]"
4,11862,Father of the Bride Part II,"[stevemartin, dianekeaton, martinshort]",charlesshyer,"[baby, midlifecrisis, confidence]",[comedy]


In [7]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])


movie_metadata['soup'] = movie_metadata.apply(create_soup, axis=1)
movie_metadata[['soup']].head()

Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...
2,fishing bestfriend duringcreditsstinger walter...
3,basedonnovel interracialrelationship singlemot...
4,baby midlifecrisis confidence stevemartin dian...


In [8]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movie_metadata['soup'])
count_matrix.shape

(46488, 73881)

In [9]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim.shape

(46488, 46488)

In [10]:
movie_metadata = movie_metadata.reset_index()
indices = pd.Series(movie_metadata.index, index=movie_metadata['id'])
indices = indices.sort_values(ascending=False)

In [11]:
def estimate(user_id, movies):
    user_ratings = pd.merge(ratings[ratings['userId'] == user_id],
                            movie_metadata[['id', 'title']],
                            left_on='movieId',
                            right_on='id',
                            how='inner')
    user_ratings = user_ratings.drop('id', axis=1)
    user_ratings['movieId'] = user_ratings['movieId'].astype(int)

    reviewed_movie_ids = set(user_ratings['movieId'])
    pred_series = []
    for movie_id in movies['id']:
        if movie_id in reviewed_movie_ids:
            continue
        simTotal = weightedSum = 0
        sim_scores_row = list(enumerate(cosine_sim[indices[movie_id]]))
        for reviewed_id in reviewed_movie_ids:
            idx = indices[reviewed_id]
            if len(sim_scores_row) < 46488:
                continue
            if type(idx) != np.int64:
                continue
            sim_score = sim_scores_row[idx]
            simTotal += float(sim_score[1])
            weightedSum += float(sim_score[1]) * float(user_ratings[user_ratings['movieId'] == reviewed_id]['rating'])
        predictedRating = weightedSum / simTotal if simTotal != 0 else 0
        pred_series.append((movie_id, predictedRating))

    pred_series = sorted(pred_series, key=lambda x: x[1], reverse=True)[:10]
    return pred_series

In [20]:
def get_recommendations_single_movie(id, rating, cosine_sim=cosine_sim):
    idx = indices[id]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = [(i, score * rating) for i, score in sim_scores]
    sim_scores = sorted(sim_scores, key=lambda x: x[1].any(), reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    movie_metadata1 = movie_metadata[['id', 'title']].iloc[movie_indices]
    sim_scores_values = [i[1] for i in sim_scores]
    movie_metadata1['similarity_score'] = sim_scores_values
    return movie_metadata1

In [36]:
def get_recommendations(user_id):
    pred_series = []
    user_ratings = ratings.loc[ratings['userId'] == user_id]
    merged_ratings = pd.merge(user_ratings, movie_metadata[['id']], left_on='movieId', right_on='id', how='inner')
    top_10_user_ratings = merged_ratings.sort_values(by='rating', ascending=False).head(10)
    top_10_user_ratings['movieId'] = top_10_user_ratings['movieId'].astype(int)
    top_10_movie_ids = top_10_user_ratings[['movieId', 'rating']]
    for index, row in top_10_movie_ids.iterrows():
        movie_id = row['movieId']
        rating = row['rating']
        pred_series.append(get_recommendations_single_movie(movie_id, rating))
    df = pd.concat(pred_series)
    df = df.drop_duplicates(subset=['id'])
    df = df[pd.to_numeric(df['similarity_score'], errors='coerce').notnull()]
    df = df.sort_values(by='similarity_score', ascending=False)
    return pd.merge(pd.DataFrame(estimate(user_id, df),
                                 columns=['id', 'predicted_rating']), movie_metadata[['id', 'title']], on='id',
                    how='inner')

In [37]:
get_recommendations(222)

Unnamed: 0,id,predicted_rating,title
0,688,4.706212,The Bridges of Madison County
1,146599,4.643835,Once Upon a Time... When We Were Colored
2,139408,4.640522,The Stars Fell on Henrietta
3,10858,4.637478,Nixon
4,16420,4.609466,Othello
5,451,4.60314,Leaving Las Vegas
6,34444,4.585038,Even Cowgirls Get the Blues
7,9087,4.573957,The American President
8,4584,4.571015,Sense and Sensibility
9,524,4.549924,Casino


In [15]:
data = []
ratings_test = pd.merge(ratings_test, movie_metadata[['id']], left_on='movieId', right_on='id', how='inner')

for index, row in ratings_test.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    predicted = estimate(user_id, movie_metadata[movie_metadata['id'] == movie_id])[0][1]
    data.append((rating, predicted))

In [16]:
mse, rmse = calculate_mse_and_rmse(data)
print("MSE:", mse)
print("RMSE:", rmse)

MSE: 1.1192448209297041
RMSE: 1.0579436756886937


In [40]:
user_ids = ratings_test['userId'].unique()
ratings_test = pd.merge(ratings_test, movie_metadata[['id']], left_on='movieId', right_on='id', how='inner')
total, hit = 0, 0
for user_id in user_ids:
    first_row = get_recommendations(user_id).iloc[0] 
    predicted_rating = first_row['predicted_rating'] 
    total = total + 1
    if predicted_rating > 3.5:
        hit = hit + 1
print("HitRatio: ", hit / total)
print("Hit:", hit)
print("Total: ", total)

  ratings_test = pd.merge(ratings_test, movie_metadata[['id']], left_on='movieId', right_on='id', how='inner')


HitRatio:  0.8757668711656442
Hit: 571
Total:  652
