In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations
import seaborn as sns

# All three inputs are tab-separated, latin-1 encoded files; only the columns
# listed in `usecols` are loaded.

# One row per (user, movie) rating event.
ratings = pd.read_csv(
    'ratings.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['user_id', 'movie_id', 'rating'],
)

# One row per user, with demographic descriptors.
users = pd.read_csv(
    'users.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'],
)

# One row per movie; `genres` is parsed downstream as a '|'-separated string.
movies = pd.read_csv(
    'movies.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['movie_id', 'title', 'genres'],
)


In [8]:
def weighted_average_score(df, k=0.8):
    """
    Rank movies by a blend of normalised mean rating and normalised view count.

    Parameters
    ----------
    df : pd.DataFrame
        One row per rating event; must contain the columns 'movie_id',
        'title', 'genres' and 'rating'.
    k : float
        Weight given to the normalised mean rating; the normalised number of
        ratings ("views") receives the remaining ``1 - k``.

    Returns
    -------
    pd.DataFrame
        One row per movie, indexed by 'movie_id', with columns 'title',
        'genres', 'rating' (mean rating) and 'views' (number of ratings),
        ordered from the highest blended score to the lowest.
    """
    # sort=False keeps movies in order of first appearance, so the positional
    # ordering computed below lines up with the aggregated frame.
    per_movie = df.groupby('movie_id', sort=False)
    view_counts = per_movie.movie_id.count()
    mean_ratings = per_movie.rating.mean()

    # Both components are scaled by their maximum before being blended.
    popularity = view_counts / view_counts.max()
    quality = mean_ratings / mean_ratings.max()
    blended = (1 - k) * popularity + k * quality

    # Row positions, best score first.
    order = blended.to_numpy().argsort()[::-1]

    summary = per_movie.agg({'title': 'first',
                             'genres': 'first',
                             'rating': 'mean'})
    return summary.assign(views=view_counts).iloc[order]

In [9]:
# Attach movie metadata and the rating user's profile to every rating.
# The join keys are spelled out; they are the only columns the frames share.
df = (
    movies
    .merge(ratings, on='movie_id')
    .merge(users, on='user_id')
)
# Ten best movies by the blended rating/popularity score.
weighted_average_score(df).head(10)

Unnamed: 0_level_0,title,genres,rating,views
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2858,American Beauty (1999),Comedy|Drama,4.272414,580
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,4.451143,481
1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War,4.325153,489
527,Schindler's List (1993),Drama|War,4.552846,369
2028,Saving Private Ryan (1998),Action|Drama|War,4.405034,437
1198,Raiders of the Lost Ark (1981),Action|Adventure,4.454545,407
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,4.361364,440
318,"Shawshank Redemption, The (1994)",Drama,4.531609,348
593,"Silence of the Lambs, The (1991)",Drama|Thriller,4.33012,415
2762,"Sixth Sense, The (1999)",Thriller,4.394737,380


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Why the built-in word n-gram analyzer is the wrong tool for genre strings:
# it lower-cases and re-tokenises the text ("Children's" comes out as
# 'children') and its bigrams only pair tokens that are adjacent in the
# string, so 'animation comedy' is never produced.
s = "Animation Children's Comedy"
tf_wrong = TfidfVectorizer(analyzer='word', ngram_range=(1,2))
tf_wrong.fit([s])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it and returns an ndarray, so convert to a plain list for display.
tf_wrong.get_feature_names_out().tolist()
# ['animation', 'animation children', 'children', 'children comedy', 'comedy']

['animation', 'animation children', 'children', 'children comedy', 'comedy']

In [4]:
# Whitespace-split tokens of `s`, combined r at a time for every r in
# range(1, 2) -- i.e. only the single-token combinations.
[combo
 for size in range(1, 2)
 for combo in combinations(s.split(), r=size)]

[('Animation',), ("Children's",), ('Comedy',)]

In [5]:
def _genre_combinations(genres):
    """Return a generator over every 1-, 2- and 3-genre combination (as tuples)
    of a '|'-separated genre string."""
    labels = genres.split('|')
    return (combo
            for size in range(1, 4)
            for combo in combinations(labels, r=size))


# A callable analyzer makes the vectorizer treat each genre combination as one
# feature, instead of tokenising the raw string itself.
tf = TfidfVectorizer(analyzer=_genre_combinations)
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

(3883, 353)

In [6]:
# Peek at a random 10-movie x 5-feature corner of the TF-IDF matrix.
# - toarray() yields a plain ndarray (todense() returns the discouraged np.matrix).
# - get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
pd.DataFrame(tfidf_matrix.toarray(), columns=tf.get_feature_names_out(), index=movies.title).sample(5, axis=1).sample(10, axis=0)

Unnamed: 0_level_0,"(Comedy, Drama, War)","(Action, Romance, War)","(Adventure, Comedy, War)","(Action, Children's, Fantasy)","(Animation, Children's, Thriller)"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Annie Hall (1977),0.0,0.0,0.0,0.0,0.0
"Fast, Cheap & Out of Control (1997)",0.0,0.0,0.0,0.0,0.0
Dances with Wolves (1990),0.0,0.0,0.0,0.0,0.0
Blood Simple (1984),0.0,0.0,0.0,0.0,0.0
"Commitments, The (1991)",0.0,0.0,0.0,0.0,0.0
Harriet the Spy (1996),0.0,0.0,0.0,0.0,0.0
Quatermass II (1957),0.0,0.0,0.0,0.0,0.0
"I Love You, I Love You Not (1996)",0.0,0.0,0.0,0.0,0.0
Never Talk to Strangers (1995),0.0,0.0,0.0,0.0,0.0
Free Willy 2: The Adventure Home (1995),0.0,0.0,0.0,0.0,0.0


In [10]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the TF-IDF rows (one row per movie);
# with no second argument the matrix is compared against itself.
cosine_sim = cosine_similarity(X=tfidf_matrix)

In [11]:
# Label both axes of the similarity matrix with movie titles so that rows and
# columns can be looked up by name.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=movies['title'],
    columns=movies['title'],
)
print('Shape:', cosine_sim_df.shape)
# Preview: five randomly chosen movie columns, rounded to two decimals.
cosine_sim_df.sample(n=5, axis=1).round(decimals=2)

Shape: (3883, 3883)


title,Night on Earth (1991),Cyrano de Bergerac (1990),King Kong (1933),Crocodile Dundee II (1988),Big Fella (1937)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Toy Story (1995),0.08,0.00,0.00,0.05,0.00
Jumanji (1995),0.00,0.00,0.07,0.14,0.00
Grumpier Old Men (1995),0.18,0.15,0.00,0.13,0.00
Waiting to Exhale (1995),1.00,0.06,0.00,0.14,0.09
Father of the Bride Part II (1995),0.45,0.00,0.00,0.32,0.00
...,...,...,...,...,...
Meet the Parents (2000),0.45,0.00,0.00,0.32,0.00
Requiem for a Dream (2000),0.39,0.16,0.00,0.00,0.23
Tigerland (2000),0.39,0.16,0.00,0.00,0.23
Two Family House (2000),0.39,0.16,0.00,0.00,0.23


In [12]:
def genre_recommendations(i, M, items, k=10):
    """
    Recommends movies based on a similarity dataframe

    Parameters
    ----------
    i : str
        Movie (index of the similarity dataframe)
    M : pd.DataFrame
        Similarity dataframe, symmetric, with movies as indices and columns
    items : pd.DataFrame
        Contains both the title and some other features used to define similarity
    k : int
        Amount of recommendations to return

    Returns
    -------
    pd.DataFrame
        Up to `k` rows of `items`, ordered from the most to the least similar
        movie, with `i` itself removed when it is among the candidates. Fewer
        than `k` rows are returned when `M` holds fewer than `k + 1` movies.
    """
    similarities = M.loc[:, i].to_numpy()
    # One candidate more than requested, so that k remain once `i` itself is
    # dropped; capped at the number of movies so that every kth passed to
    # argpartition is in range.
    n_candidates = min(k + 1, similarities.size)
    # argpartition only guarantees the positions listed in kth, so every
    # position read by the slice below has to be listed.
    ix = similarities.argpartition(np.arange(-n_candidates, 0))
    # Read the guaranteed tail backwards: most similar first.
    closest = M.columns[ix[-1:-(n_candidates + 1):-1]]
    closest = closest.drop(i, errors='ignore')
    # The merge joins on the columns shared with `items` and keeps the
    # similarity ordering of the left frame.
    return pd.DataFrame(closest).merge(items).head(k)

In [13]:
# Metadata row of the movie used as the recommendation seed.
movies.loc[movies['title'] == '2001: A Space Odyssey (1968)']

Unnamed: 0,movie_id,title,genres
912,924,2001: A Space Odyssey (1968),Drama|Mystery|Sci-Fi|Thriller


In [14]:
# Movies whose genre profile is closest to the seed movie's.
genre_recommendations(
    i='2001: A Space Odyssey (1968)',
    M=cosine_sim_df,
    items=movies[['title', 'genres']],
)

Unnamed: 0,title,genres
0,"X-Files: Fight the Future, The (1998)",Mystery|Sci-Fi|Thriller
1,"Client, The (1994)",Drama|Mystery|Thriller
2,"Talented Mr. Ripley, The (1999)",Drama|Mystery|Thriller
3,Communion (1989),Drama|Sci-Fi|Thriller
4,Gattaca (1997),Drama|Sci-Fi|Thriller
5,"Thirteenth Floor, The (1999)",Drama|Sci-Fi|Thriller
6,Event Horizon (1997),Action|Mystery|Sci-Fi|Thriller
7,2010 (1984),Mystery|Sci-Fi
8,Stalker (1979),Mystery|Sci-Fi
9,Deep Impact (1998),Action|Drama|Sci-Fi|Thriller
