In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/metadata_clean.csv', index_col=0)
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,Animation,81.0,7.7,5415.0,1995.0
0,Toy Story,Comedy,81.0,7.7,5415.0,1995.0
0,Toy Story,Family,81.0,7.7,5415.0,1995.0
1,Jumanji,Adventure,104.0,6.9,2413.0,1995.0
1,Jumanji,Fantasy,104.0,6.9,2413.0,1995.0


In [3]:
orig_df = pd.read_csv('data/movies_metadata.csv', low_memory=False)

In [4]:
df[['overview', 'id']] = orig_df[['overview', 'id']]
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,Animation,81.0,7.7,5415.0,1995.0,"Led by Woody, Andy's toys live happily in his ...",862
0,Toy Story,Comedy,81.0,7.7,5415.0,1995.0,"Led by Woody, Andy's toys live happily in his ...",862
0,Toy Story,Family,81.0,7.7,5415.0,1995.0,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,Adventure,104.0,6.9,2413.0,1995.0,When siblings Judy and Peter discover an encha...,8844
1,Jumanji,Fantasy,104.0,6.9,2413.0,1995.0,When siblings Judy and Peter discover an encha...,8844


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
tfid = TfidfVectorizer(stop_words='english')

In [7]:
df['overview'] = df['overview'].fillna('')

In [30]:
df = df.drop_duplicates(subset=['id'])

In [31]:
tfid_matrix = tfid.fit_transform(df['overview'])
tfid_matrix.shape

(45436, 75827)

In [32]:
from sklearn.metrics.pairwise import linear_kernel

In [33]:
tfid_matrix[:2]

<2x75827 sparse matrix of type '<class 'numpy.float64'>'
	with 58 stored elements in Compressed Sparse Row format>

In [34]:
cosine_sim = linear_kernel(tfid_matrix, tfid_matrix)

In [35]:
indicies = pd.Series(df.index, index=df['title'])

indicies = indicies.drop_duplicates()

indicies

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Subdue                         45461
Century of Birthing            45462
Betrayal                       45463
Satan Triumphant               45464
Queerama                       45465
Length: 45436, dtype: int64

In [36]:
def content_recommender(title, cosine_sim=cosine_sim, df=df, indicies=indicies):
    idx = indicies[title]
    
    # print(idx)
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # print(sim_scores[:5])
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1])
    
    print(sim_scores)
    
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

In [37]:
content_recommender('The Lion King')

[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.0), (12, 0.0), (13, 0.0), (15, 0.0), (16, 0.0), (17, 0.0), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (28, 0.0), (30, 0.0), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.0), (37, 0.0), (38, 0.0), (39, 0.0), (40, 0.0), (41, 0.0), (43, 0.0), (44, 0.0), (45, 0.0), (46, 0.0), (47, 0.0), (48, 0.0), (49, 0.0), (50, 0.0), (51, 0.0), (53, 0.0), (54, 0.0), (56, 0.0), (57, 0.0), (59, 0.0), (60, 0.0), (61, 0.0), (62, 0.0), (63, 0.0), (64, 0.0), (66, 0.0), (67, 0.0), (68, 0.0), (69, 0.0), (70, 0.0), (71, 0.0), (72, 0.0), (73, 0.0), (74, 0.0), (75, 0.0), (76, 0.0), (77, 0.0), (78, 0.0), (79, 0.0), (80, 0.0), (81, 0.0), (82, 0.0), (85, 0.0), (87, 0.0), (90, 0.0), (91, 0.0), (92, 0.0), (93, 0.0), (95, 0.0), (96, 0.0), (98, 0.0), (99, 0.0), (100, 0.0), (101, 0.0), (102, 0.0), (103, 0.0), (104, 0.0), (106, 0.0), (107, 0.0), (108, 0.0), (109, 0.0)

1                         Jumanji
2                Grumpier Old Men
3               Waiting to Exhale
4     Father of the Bride Part II
5                            Heat
6                         Sabrina
8                    Sudden Death
9                       GoldenEye
10         The American President
11    Dracula: Dead and Loving It
Name: title, dtype: object