In [1]:
import pandas as pd
import numpy as np

In [2]:
movies=pd.read_csv('data/movie.csv')
ratings=pd.read_csv('data/rating.csv')
genome=pd.read_csv('data/genome_scores.csv')
tags=pd.read_csv('data/tag.csv')

In [3]:
movies.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [4]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [5]:
genome.columns

Index(['movieId', 'tagId', 'relevance'], dtype='object')

In [6]:
tags.columns

Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')

In [7]:
a=ratings['userId'].value_counts()

In [8]:
b=ratings['userId'].value_counts()>200

In [9]:
new_users=list(a[b].index)

In [10]:
len(new_users)

26599

In [11]:
ratings=ratings[ratings['userId'].isin(new_users)]

In [12]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
541,7,3,3.0,2002-01-16 19:14:23
542,7,7,3.0,2002-01-16 19:10:20
543,7,11,4.0,2002-01-16 19:04:49


In [13]:
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [14]:
movies_refine = movies.copy()
movies_refine['year'] = ((movies_refine['title'].str.extract(r'\((....)\) *$'))[0].astype('float32'))
movies_refine = movies_refine.dropna()
movies_refine['year'] = movies_refine['year'].astype('int32')
movies_refine['title'] = (movies_refine['title'].str.extract(r'^(.*) \(....\) *$'))[0]
movies_refine['genres'] = movies_refine['genres'].str.split('|')

movies_refine

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995
...,...,...,...,...
27273,131254,Kein Bund für's Leben,[Comedy],2007
27274,131256,"Feuer, Eis & Dosenbier",[Comedy],2002
27275,131258,The Pirates,[Adventure],2014
27276,131260,Rentun Ruusu,[(no genres listed)],2001


In [15]:
ratings_with_moviename = ratings.merge(movies_refine, on='movieId')

In [16]:
ratings_with_moviename.shape

(12426038, 7)

In [17]:
ratings_with_moviename.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,7,3,3.0,2002-01-16 19:14:23,Grumpier Old Men,"[Comedy, Romance]",1995
1,91,3,3.0,2005-03-22 22:43:12,Grumpier Old Men,"[Comedy, Romance]",1995
2,96,3,4.0,2013-06-13 02:18:47,Grumpier Old Men,"[Comedy, Romance]",1995
3,116,3,2.0,2005-11-23 06:40:58,Grumpier Old Men,"[Comedy, Romance]",1995
4,156,3,2.0,2002-12-02 03:53:45,Grumpier Old Men,"[Comedy, Romance]",1995


In [18]:
genre_count = dict()
for index, series in movies_refine.iterrows():
    for genre in series['genres']:
        genre_count[genre] = genre_count.get(genre, 0) + 1
print(genre_count)

{'Adventure': 2329, 'Animation': 1026, 'Children': 1139, 'Comedy': 8369, 'Fantasy': 1411, 'Romance': 4127, 'Drama': 13337, 'Action': 3518, 'Crime': 2938, 'Thriller': 4177, 'Horror': 2611, 'Mystery': 1513, 'Sci-Fi': 1740, 'IMAX': 196, 'Documentary': 2467, 'War': 1194, 'Musical': 1034, 'Western': 676, 'Film-Noir': 330, '(no genres listed)': 237}


In [19]:
len(genre_count)

20

In [20]:
movie_num_rating=ratings_with_moviename.groupby('title')['rating'].count().reset_index()

In [21]:
movie_num_rating.rename(columns={'rating':'num_ratings'},inplace=True)

In [22]:
movie_num_rating=movie_num_rating[movie_num_rating['num_ratings']>=50]

In [23]:
final_rating=ratings_with_moviename.merge(movie_num_rating,on='title')

In [24]:
final_rating

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year,num_ratings
0,7,3,3.0,2002-01-16 19:14:23,Grumpier Old Men,"[Comedy, Romance]",1995,4665
1,91,3,3.0,2005-03-22 22:43:12,Grumpier Old Men,"[Comedy, Romance]",1995,4665
2,96,3,4.0,2013-06-13 02:18:47,Grumpier Old Men,"[Comedy, Romance]",1995,4665
3,116,3,2.0,2005-11-23 06:40:58,Grumpier Old Men,"[Comedy, Romance]",1995,4665
4,156,3,2.0,2002-12-02 03:53:45,Grumpier Old Men,"[Comedy, Romance]",1995,4665
...,...,...,...,...,...,...,...,...
12278100,134567,69088,3.5,2011-12-07 12:54:03,"War Game, The","[Documentary, Drama, War]",1965,51
12278101,137202,69088,4.0,2012-01-03 04:19:44,"War Game, The","[Documentary, Drama, War]",1965,51
12278102,137425,69088,3.0,2010-11-28 21:25:04,"War Game, The","[Documentary, Drama, War]",1965,51
12278103,137996,69088,4.0,2011-08-29 19:59:35,"War Game, The","[Documentary, Drama, War]",1965,51


In [25]:
final_rating=final_rating.drop_duplicates(subset=['userId','title'])

In [26]:
final_rating.shape

(12215635, 8)

In [27]:
movie_pivot=final_rating.pivot_table(columns='userId',index='title',values='rating')

In [28]:
movie_pivot.shape

(9777, 26599)

In [29]:
movie_pivot=movie_pivot.fillna(0)

In [30]:
movie_pivot

userId,7,11,14,24,31,53,54,58,69,82,...,138456,138457,138459,138464,138467,138472,138474,138477,138483,138493
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Round Midnight,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'burbs, The",0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,3.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
'night Mother,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
loudQUIETloud: A Film About the Pixies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xXx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
xXx: State of the Union,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
¡Three Amigos!,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
from scipy.sparse import csr_matrix
movie_matrix=csr_matrix(movie_pivot)
movie_matrix

<9777x26599 sparse matrix of type '<class 'numpy.float64'>'
	with 12215635 stored elements in Compressed Sparse Row format>

In [32]:
from sklearn.neighbors import NearestNeighbors

In [33]:
model=NearestNeighbors(algorithm='brute')

In [34]:
model.fit(movie_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [35]:
movie_pivot

userId,7,11,14,24,31,53,54,58,69,82,...,138456,138457,138459,138464,138467,138472,138474,138477,138483,138493
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Round Midnight,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'burbs, The",0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,3.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
'night Mother,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
loudQUIETloud: A Film About the Pixies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xXx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
xXx: State of the Union,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
¡Three Amigos!,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
movie_pivot.iloc[237,:].values.reshape(1,-1)

array([[0., 0., 0., ..., 0., 0., 0.]])

In [38]:
distance,suggestion=model.kneighbors(movie_pivot.iloc[1,:].values.reshape(1,-1),n_neighbors=5)

In [39]:
suggestion

array([[   1, 4075, 6757, 8655, 3695]], dtype=int64)

In [40]:
movie_pivot.index[184]

'Ace in the Hole (Big Carnival, The)'

In [47]:
for i in suggestion[0]:
    print(movie_pivot.index[i])

'Salem's Lot
Horrors of Spider Island (Ein Toter Hing im Netz)
Piranha 3DD (a.k.a. Piranha DD)
The Pumaman
Grudge 3, The


In [42]:
np.where(movie_pivot.index=='Ace in the Hole (Big Carnival, The)')[0][0]

184

In [43]:
def recommend(moviename):
    b_id=np.where(movie_pivot.index==moviename)[0][0]
    distance,suggestion=model.kneighbors(movie_pivot.iloc[b_id,:].values.reshape(1,-1),n_neighbors=5)
    for i in suggestion[0]:
        print(movie_pivot.index[i])

In [45]:
recommend('The Pumaman')

The Pumaman
Horrors of Spider Island (Ein Toter Hing im Netz)
Giant Spider Invasion, The
Mitchell
Beast of Yucca Flats, The
