In [62]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import requests

# Fetch the TMDB movie dataset from GitHub into the working directory,
# then load it into the module-level DataFrame `df`.
file_name = 'tmdb-movies.csv'
url = f"https://raw.githubusercontent.com/aczzi/similar-movies/refs/heads/main/{file_name}"
# requests has no default timeout; without one a stalled connection would
# block this cell forever. 30 s is generous for a single CSV file.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    # Binary mode: write the payload exactly as received.
    with open(file_name, 'wb') as file:
        file.write(response.content)
else:
    raise Exception('Failed to download file')
file_path = os.path.join(os.getcwd(), file_name)
df = pd.read_csv(file_path)

# Replace missing values with empty strings, one column at a time, so the
# string handling further down never has to deal with NaN.
for column in list(df.columns):
  filled = df[column].fillna('')
  df[column] = filled

def title_lower(row):
  """Return the lower-cased ``original_title`` field of ``row``."""
  original = row["original_title"]
  return original.lower()

# Lower-cased copy of the title, used for case-insensitive lookups.
# The vectorised .str accessor replaces the row-by-row
# df.apply(title_lower, axis=1); it yields the same strings without
# building a Series object for every row.
df['title'] = df['original_title'].str.lower()

print("Data is loaded")

Data is loaded


In [65]:
# Feature preparation: the columns below are the movie attributes that get
# merged into one text "document" per movie.
column_keys = [
  'original_title',
  'director',
  'keywords',
  'genres',
  'release_year',
]

def combine(row):
  """Join the ``column_keys`` fields of ``row`` into one '#'-separated string.

  Every value is passed through ``str`` first, so non-string fields are
  accepted as well.
  """
  parts = []
  for column in column_keys:
    parts.append(str(row[column]))
  return "#".join(parts)

def movie_dataframe(index):
  """Return the row(s) of ``df`` whose index equals ``index``.

  The result is a new DataFrame restricted to the ``column_keys`` columns;
  it is empty when no row carries that index.
  """
  row_mask = df.index == index
  selected = df[row_mask]
  return pd.DataFrame(selected, columns=column_keys)

def pretty_print(index):
  """Format the movie at ``index`` as a multi-line "column: value" string.

  One line is produced per entry of ``column_keys``; the lines are joined
  with a single space, so every line after the first starts with a space.

  Raises:
    Exception: if no movie exists at ``index``.
  """
  movie = movie_dataframe(index)
  if movie.empty:
    raise Exception('Movie not found')
  lines = []
  for column in column_keys:
    value = movie[column].values[0]
    lines.append(f"{column}: {value}\n")
  return " ".join(lines)

def get_index_from_title(title):
    """Look up a movie by title, ignoring case, and return its index in ``df``.

    The comparison is made against the lower-cased ``title`` column. When
    several rows match, the index of the first one is returned.

    Raises:
        Exception: if no row matches ``title``.
    """
    mask = df['title'] == title.lower()
    matches = df.loc[mask]
    if not matches.empty:
      return matches.index[0]
    raise Exception('Movie not found')

# Build one text "document" per movie from the columns listed in column_keys.
df['combined'] = df.apply(combine, axis=1)
# Bag-of-words model: fit_transform learns the vocabulary and counts tokens.
vector = CountVectorizer()
matrix = vector.fit_transform(df['combined'])
# `matrix` holds raw token counts (not normalized) in sparse form,
# one row per movie. Sample of its printed representation:
# Coords	Values
# (0, 7900)	1
# (0, 16466)	1
# (0, 3072)	1
# (0, 15407)	1
cosine = cosine_similarity(matrix)
# Pairwise cosine similarity between all rows of `matrix`:
# cosine[i] lists the similarity of movie i to every movie (1.0 with itself).
# [[1.         0.31311215 0.26666667 ... 0.         0.0860663  0.        ]]

In [73]:
# Interactive lookup: ask for a title, then print that movie followed by the
# movies whose combined text is most similar to it.
TOP_N = 10  # number of recommendations to print

print("Choose a movie title:")
# Strip surrounding whitespace so "Batman " still matches "batman".
user_input = input().strip()

try:
  movie_index = get_index_from_title(user_input)
except Exception as e:
  movie_index = None
  print(f"Movie not found : {e}")

print("")

if movie_index is not None:
  # Exclude the queried movie by index instead of assuming it sorts first:
  # another movie with an identical vector also scores 1.0 and, with a lower
  # index, would otherwise take its place.
  # NOTE: positions in cosine[movie_index] are compared with index labels;
  # this relies on df keeping the default 0..n-1 index from read_csv.
  similar = [
    (idx, score)
    for idx, score in enumerate(cosine[movie_index])
    if idx != movie_index
  ]
  # Keep TOP_N real recommendations (the old [1:10] slice printed only nine).
  sorted_movies = sorted(similar, key=lambda x: x[1], reverse=True)[:TOP_N]

  print(pretty_print(movie_index))
  print(f'Top {TOP_N} Similar movies found:')
  for movie in sorted_movies:
    print(pretty_print(movie[0]))

Choose a movie title:
Batman

original_title: Batman
 director: Tim Burton
 keywords: double life|dc comics|dual identity|chemical|crime fighter
 genres: Fantasy|Action
 release_year: 1989

Top 10 Similar movies found:
original_title: Batman Returns
 director: Tim Burton
 keywords: holiday|corruption|double life|dc comics|crime fighter
 genres: Action|Crime|Fantasy|Science Fiction|Thriller
 release_year: 1992

original_title: Batman & Robin
 director: Joel Schumacher
 keywords: double life|dc comics|dual identity|crime fighter|fictional place
 genres: Action|Crime|Fantasy|Science Fiction
 release_year: 1997

original_title: Batman Begins
 director: Christopher Nolan
 keywords: martial arts|dc comics|crime fighter|secret identity|undercover
 genres: Action|Crime|Drama
 release_year: 2005

original_title: The Dark Knight
 director: Christopher Nolan
 keywords: dc comics|crime fighter|secret identity|scarecrow|sadism
 genres: Drama|Action|Crime|Thriller
 release_year: 2008

original_title