#### Dumb recommender system: use TfidfVectorizer to build a numerical representation of each movie's text, then use cosine similarity to recommend movies with similar characteristics

In [4]:
import pandas as pd

# Load the movie metadata CSV and report its (rows, columns) shape.
MOVIES_CSV = '../data/tmdb_5000_movies.csv'
df = pd.read_csv(MOVIES_CSV)
print(df.shape)

(4803, 20)


In [5]:
# Preview the first five rows to see which columns are available.
print(df.head(n=5))

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [7]:
# For each movie, compose one big string from its genres and keywords (names only, ids dropped), ready for tokenization
import json

# Text corpus for the vectorizer: one string per movie, appended in df row order.
inputs = []

def processDataMovie(rowObj):
  """Build the text blob for one movie from its ``genres`` and ``keywords`` fields.

  Args:
    rowObj: Row exposing ``.items()`` (a ``pandas.Series`` row or a plain
      ``dict``). Every key other than ``"genres"`` and ``"keywords"`` is
      ignored.

  Returns:
    A string in which every emitted token is wrapped in single spaces, e.g.
    ``" Action  Adventure "``. Empty when neither field is present.

  String values are expected to hold a JSON list of objects such as
  ``[{"id": 28, "name": "Action"}]``; every value except the one under
  ``"id"`` is emitted. A string that is not valid JSON, or whose decoded shape
  is not a list of objects, is emitted verbatim instead. Non-string values are
  emitted through ``str()``.
  """
  wanted_keys = ("genres", "keywords")
  pieces = []
  for key, value in rowObj.items():
    if key not in wanted_keys:
      continue
    if not isinstance(value, str):
      pieces.append(f" {str(value)} ")
      continue
    try:
      # Collect into a temporary list so a payload that fails midway does not
      # leave half of its tokens behind next to the raw-text fallback.
      tokens = [
        f" {v} "
        for entry in json.loads(value)
        for k, v in entry.items()
        if k != "id"
      ]
    except (ValueError, TypeError, AttributeError):
      # ValueError: not JSON. TypeError: decoded value is not iterable.
      # AttributeError: an entry is not an object. Anything else propagates.
      tokens = [f" {value} "]
    pieces.extend(tokens)
  return "".join(pieces)



# Turn every DataFrame row into its text blob, keeping row order.
inputs.extend(processDataMovie(movie_row) for _, movie_row in df.iterrows())

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap on the vocabulary size used for the TF-IDF matrix.
MAX_FEATURES = 2000

vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
# Learn the vocabulary from the corpus and vectorize it in one pass.
X = vectorizer.fit_transform(raw_documents=inputs)
print(X.shape)

(4803, 2000)


In [9]:
# Map each title to the positional row of that movie. Positions (not index
# labels) are stored because the lookups that follow index X and df.iloc.
movie2idx = pd.Series(range(len(df)), index=df['title'])
# Titles are not guaranteed to be unique; keeping only the first occurrence
# makes movie2idx[title] always return one integer instead of a Series.
movie2idx = movie2idx[~movie2idx.index.duplicated(keep='first')]

In [10]:
# Look up the query movie's row once, then take its TF-IDF vector
queryTitleName = "Scream 3"
query_row = movie2idx[queryTitleName]
print(query_row)

X_q = X[query_row]
print(df.iloc[query_row])

1164
budget                                                           40000000
genres                  [{"id": 27, "name": "Horror"}, {"id": 9648, "n...
homepage                                                              NaN
id                                                                   4234
keywords                [{"id": 2546, "name": "mask"}, {"id": 2573, "n...
original_language                                                      en
original_title                                                   Scream 3
overview                A murdering spree begins to happen again, this...
popularity                                                      24.992057
production_companies    [{"name": "Konrad Pictures", "id": 85}, {"name...
production_countries    [{"iso_3166_1": "US", "name": "United States o...
release_date                                                   2000-02-03
revenue                                                         161834276
runtime                          

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Similarity of every row of X to the query vector.
cosine_sim = cosine_similarity(X, X_q)
scores = cosine_sim.ravel()

# Candidates are the movies with a non-zero similarity. The query is removed by
# its own row index rather than by dropping the top-ranked row: another movie
# with an identical vector also scores 1.0 and may sort ahead of the query.
query_idx = movie2idx[queryTitleName]
non_zero_indices = np.flatnonzero(scores != 0)
non_zero_indices = non_zero_indices[non_zero_indices != query_idx]
non_zero_elements = scores[non_zero_indices]
# A stable sort keeps tied scores in row order, so reruns give the same ranking.
sorted_order = np.argsort(-non_zero_elements, kind="stable")
ranked_indices = non_zero_indices[sorted_order][:5]


In [12]:
# Print the title of each recommended movie, best match first
recommended_titles = df['title'].iloc[ranked_indices]
for recommended_title in recommended_titles:
  print(recommended_title)

The Calling
Felicia's Journey
Friday the 13th: A New Beginning
Mindhunters
The Glimmer Man


In [13]:
def recommendMovie(title, top_n=5):
  """Print the titles of the movies most similar to ``title``.

  Similarity is the cosine similarity between the TF-IDF row of the query
  movie and every row of the module-level matrix ``X``. Movies with zero
  similarity are never recommended, so fewer than ``top_n`` titles (possibly
  none) may be printed.

  Args:
    title: Title of the query movie; must be a key of ``movie2idx``.
    top_n: Maximum number of recommendations to print. Defaults to 5.

  Returns:
    None. The titles are printed one per line, best match first.

  Raises:
    KeyError: If ``title`` is not present in ``movie2idx``.
  """
  query_idx = movie2idx[title]
  if isinstance(query_idx, pd.Series):
    # A duplicated title makes the lookup return several rows; use the first.
    query_idx = query_idx.iloc[0]

  scores = cosine_similarity(X, X[query_idx]).ravel()

  # Remove the query by its own row index instead of dropping the top-ranked
  # row: a movie with an identical vector also scores 1.0 and could sort ahead
  # of the query, which would leak the query into its own recommendations.
  candidates = np.flatnonzero(scores != 0)
  candidates = candidates[candidates != query_idx]
  # A stable sort keeps tied scores in row order, so the output is deterministic.
  order = np.argsort(-scores[candidates], kind="stable")
  ranked_indices = candidates[order][:max(top_n, 0)]

  for idx in ranked_indices:
    print(df.iloc[idx]['title'])


In [14]:
# Try the recommender on another query title.
recommendMovie(title="Interstellar")

Silent Running
Armageddon
About Time
The Astronaut's Wife
Lost in Space
