In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
movie_metadata = pd.read_csv('data/movies_metadata.csv', low_memory=False)
movie_metadata.shape

(45466, 24)

In [4]:
ratings = pd.read_csv('data/ratings.csv', low_memory=False)
ratings = ratings.drop('timestamp', axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [5]:
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')

movie_metadata = movie_metadata.drop([19730, 29503, 35587])

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
movie_metadata['id'] = movie_metadata['id'].astype('int')

movie_metadata = movie_metadata.merge(credits, on='id')
movie_metadata = movie_metadata.merge(keywords, on='id')

In [6]:
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    movie_metadata[feature] = movie_metadata[feature].apply(literal_eval)

def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        if len(names) > 3:
            names = names[:3]
        return names
    return []

movie_metadata['director'] = movie_metadata['crew'].apply(get_director)
features = ['cast', 'keywords', 'genres']
for feature in features:
    movie_metadata[feature] = movie_metadata[feature].apply(get_list)

movie_metadata[['id', 'title', 'cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,id,title,cast,director,keywords,genres
0,862,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,8844,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,15602,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"
3,31357,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devine]",Forest Whitaker,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]"
4,11862,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short]",Charles Shyer,"[baby, midlife crisis, confidence]",[Comedy]


In [7]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    movie_metadata[feature] = movie_metadata[feature].apply(clean_data)

movie_metadata[['id', 'title', 'cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,id,title,cast,director,keywords,genres
0,862,Toy Story,"[tomhanks, timallen, donrickles]",johnlasseter,"[jealousy, toy, boy]","[animation, comedy, family]"
1,8844,Jumanji,"[robinwilliams, jonathanhyde, kirstendunst]",joejohnston,"[boardgame, disappearance, basedonchildren'sbook]","[adventure, fantasy, family]"
2,15602,Grumpier Old Men,"[waltermatthau, jacklemmon, ann-margret]",howarddeutch,"[fishing, bestfriend, duringcreditsstinger]","[romance, comedy]"
3,31357,Waiting to Exhale,"[whitneyhouston, angelabassett, lorettadevine]",forestwhitaker,"[basedonnovel, interracialrelationship, single...","[comedy, drama, romance]"
4,11862,Father of the Bride Part II,"[stevemartin, dianekeaton, martinshort]",charlesshyer,"[baby, midlifecrisis, confidence]",[comedy]


In [8]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

movie_metadata['soup'] = movie_metadata.apply(create_soup, axis=1)
movie_metadata[['soup']].head()

Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...
2,fishing bestfriend duringcreditsstinger walter...
3,basedonnovel interracialrelationship singlemot...
4,baby midlifecrisis confidence stevemartin dian...


In [9]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movie_metadata['soup'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [10]:
movie_metadata = movie_metadata.reset_index()
indices = pd.Series(movie_metadata.index, index=movie_metadata['id'])
indices = indices.sort_values(ascending=False)

In [11]:
def get_recommendations_single_movie(id, rating, user_ratings, cosine_sim=cosine_sim):
    idx = indices[id]
    reviewed_movie_indices = user_ratings['movieId'].tolist()
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = [(i, score * rating) for i, score in sim_scores if i not in reviewed_movie_indices]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    movie_metadata1 = movie_metadata[['id', 'title']].iloc[movie_indices]
    sim_scores_values = [i[1] for i in sim_scores]
    movie_metadata1['similarity_score'] = sim_scores_values
    return movie_metadata1


In [12]:
def get_recommendations(user_id):
    pred_series= []
    user_ratings = ratings.loc[ratings['userId'] == user_id]
    merged_ratings = pd.merge(user_ratings, movie_metadata[['id']], left_on='movieId', right_on='id', how='inner')
    top_20_user_ratings = merged_ratings.sort_values(by='rating', ascending=False).head(20)
    top_20_user_ratings['movieId'] = top_20_user_ratings['movieId'].astype(int)
    top_20_movie_ids = top_20_user_ratings[['movieId', 'rating']]
    for index, row in top_20_movie_ids.iterrows():
        movie_id =row['movieId']
        rating = row['rating']
        pred_series.append(get_recommendations_single_movie(movie_id, rating, user_ratings))

    df = pd.concat(pred_series)
    return df.sort_values(by='similarity_score', ascending=False).head(10)

In [13]:
get_recommendations(1)

Unnamed: 0,id,title,similarity_score
2316,1367,Rocky II,3.5
2318,1374,Rocky IV,3.5
2317,1371,Rocky III,3.5
29547,26569,The 7 Grandmasters,3.27395
2319,1375,Rocky V,3.162278
1865,1366,Rocky,3.162278
27893,46897,The Mark of Cain,2.886751
2331,9489,You've Got Mail,2.635231
32405,351964,The Escort,2.5
442,50463,The Favor,2.390457
