In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

# Load Data

In [2]:
df_links = pd.read_csv('../data/ml-latest-small/links.csv')
df_movies = pd.read_csv('../data/ml-latest-small/movies.csv')
df_ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')
df_tags = pd.read_csv('../data/ml-latest-small/tags.csv')

In [3]:
df_links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


These IDs are not useful for clustering. They're arithmetic difference is not indicative of anything meaningful.

In [4]:
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


The genres column needs to be split up. There should be a column for each genre and a 1 to indicate the movie falls in that genre and a 0 to indicate it doesn't.

# Transform Data

In [5]:
genres = []
d_genres = dict()
for index, row in df_movies.iterrows():
    genre = row['genres']
    movieid = row['movieId']
    split = genre.split('|')
    d_genres[movieid] = split
    for s in split:
        if s not in genres:
            genres.append(s)
genres

['Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Fantasy',
 'Romance',
 'Drama',
 'Action',
 'Crime',
 'Thriller',
 'Horror',
 'Mystery',
 'Sci-Fi',
 'War',
 'Musical',
 'Documentary',
 'IMAX',
 'Western',
 'Film-Noir',
 '(no genres listed)']

In [6]:
df_complete = df_movies['movieId'].copy()
g = [[0]*len(genres) for i in range(len(df_complete))]
for index,row in df_movies.iterrows():
    movieId = row['movieId']
    current_genres = d_genres[movieId]
    for genre in current_genres:
        i = genres.index(genre)
        g[index][i] = 1
df_complete = pd.concat([df_complete,pd.DataFrame(g,columns=genres)],axis=1)
df_complete

Unnamed: 0,movieId,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,0,1,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9738,193583,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9739,193585,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9740,193587,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


The ratings need to be coalesced to get an average rating for each movie. This can be merged with the movie dataframe to add more data to cluster on.

In [8]:
len(df_ratings['movieId'].unique())

9724

There are 9742 movies and only 9724 rated movies so the 18 unrated movies will need to be dropped.

In [9]:
d_ratings = dict()
for index, row in df_ratings.iterrows():
    movieid = row['movieId']
    rating = row['rating']
    if movieid in d_ratings:
        d_ratings[movieid].append(rating)
    else:
        d_ratings[int(movieid)] = [rating]
ratings = []
for key in d_ratings.keys():
    ratings.append(round(sum(d_ratings[key])/len(d_ratings[key]),2))
ids = list(d_ratings.keys())
df_temp = pd.DataFrame(list(zip(ids,ratings)),columns=['movieId','rating'])
df_temp

Unnamed: 0,movieId,rating
0,1,3.92
1,3,3.26
2,6,3.95
3,47,3.98
4,50,4.24
...,...,...
9719,160341,2.50
9720,160527,4.50
9721,160836,3.00
9722,163937,3.50


I tried many different ways to use pd.merge to merge the dataframes by movieId and it always returned the df_compete dataframe unchanged. This is a less correct but working way to remove all the ids that are not shared by both dataframes and then merge the ratings into df_complete

In [10]:
ratings_ids = df_temp['movieId']
complete_ids = df_complete['movieId']
ratings_ids_to_remove = []
complete_ids_to_remove = []
for ri in ratings_ids:
    if ri not in complete_ids:
        ratings_ids_to_remove.append(ri)
for ci in complete_ids:
    if ci not in ratings_ids:
        complete_ids_to_remove.append(ci)

I tried using pandas merge but it only ever returned the first dataframe so I came up with this solution. Having done more research since I know that another way to do it now would be to use insert based on the movieId and then drop the rows with NaN values but this worked so I left it.

In [11]:
for ri in ratings_ids_to_remove:
    df_temp = df_temp.where(df_temp.movieId != ri)
df_temp = df_temp.dropna()
df_temp

Unnamed: 0,movieId,rating
0,1.0,3.92
1,3.0,3.26
2,6.0,3.95
3,47.0,3.98
4,50.0,4.24
...,...,...
9568,6095.0,3.50
9569,7202.0,3.00
9570,7562.0,2.50
9571,7984.0,4.00


In [12]:
for ci in complete_ids_to_remove:
    df_complete = df_complete.where(df_complete.movieId != ci)
df_complete = df_complete.dropna()
df_complete['rating'] = df_temp['rating']
df_complete

Unnamed: 0,movieId,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),rating
0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92
1,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.26
2,3.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.95
3,4.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.98
4,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5396,9004.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.38
5397,9005.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.50
5398,9008.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.25
5399,9010.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.33


In [13]:
df_tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


Make all the tags lowercase

In [14]:
tags = df_tags.tag.unique()
t_ids = df_tags.movieId.unique()
comp_ids = df_complete.movieId.unique()
shared_ids = 0
for i in range(len(t_ids)):
    if float(t_ids[i]) in comp_ids:
        shared_ids += 1
print(len(tags),len(t_ids), shared_ids)

1589 1572 1261


There are 1475 unique tags for 1572. This is probably not a good set to use because it is likely sparsely connected.

# Build Clustering Model

I chose to use a very low min_sample value with the DBScan cluster model because movies vary very widely and it would make sense that there should be a large number of clusters.

In [72]:
df_complete = df_complete.replace([np.inf, -np.inf], np.nan).dropna()
data = np.array(df_complete.drop(columns=['movieId']))
model = DBSCAN(min_samples=5)
model.fit(data)

DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=5, n_jobs=1, p=None)

# Test Model

In [73]:
print(model.labels_)
toy_story_class = model.labels_[0]
toy_story_class

[-1  0  1 ... 56 11 -1]


-1

In [74]:
non_toy_story_labels = model.labels_[1:]
cluster_values = np.where(non_toy_story_labels == toy_story_class)
first_class_index = cluster_values[0][0] + 1
print(first_class_index)
movie_id = df_complete['movieId'][first_class_index]
print(movie_id)
df_movies.loc[df_movies['movieId'] == movie_id]

12
13.0


Unnamed: 0,movieId,title,genres
12,13,Balto (1995),Adventure|Animation|Children


For someone who liked Toy Story my clustering model has recommend watching Balto.