In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import requests

# Dataset location: the CSV is fetched from GitHub and stored in the current
# working directory under the same file name.
file_name = 'tmdb-movies.csv'
url = f"https://raw.githubusercontent.com/aczzi/similar-movies/refs/heads/main/{file_name}"
# Columns that are combined into the text used for the similarity computation
# and that are shown when a movie is printed.
column_keys = ['original_title','director','keywords','genres', 'release_year']
file_path = os.path.join(os.getcwd(), file_name)

# Download only when there is no local copy yet, so re-running the cell does
# not hit the network again. Delete the local file to force a fresh download.
if not os.path.exists(file_path):
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download file: {url} returned HTTP {response.status_code}"
        )
    with open(file_path, 'wb') as file:
        file.write(response.content)

df = pd.read_csv(file_path)

def combine(row):
  """Build one text blob for a row.

  The value of every column listed in column_keys is converted with str()
  and the pieces are joined with '#', in column_keys order.
  """
  pieces = []
  for key in column_keys:
    pieces.append(str(row[key]))
  return "#".join(pieces)

def movie_dataframe(index):
  """Return the rows of df whose index label equals index.

  Only the columns named in column_keys are kept. The result is an empty
  DataFrame when no row carries that label.
  """
  label_matches = df.index == index
  selected_rows = df[label_matches]
  return pd.DataFrame(selected_rows, columns=column_keys)

def pretty_print(index):
  """Format one movie as text, one "column: value" line per column_keys entry.

  Args:
    index: index label of the movie in df.

  Returns:
    A string in which every line ends with a newline.

  Raises:
    LookupError: if no row with that index label exists. LookupError is an
      Exception subclass, so callers catching Exception keep working.
  """
  ldf = movie_dataframe(index)
  if ldf.empty:
    raise LookupError('Movie not found')
  # Each piece already ends with "\n", so the pieces are concatenated directly;
  # joining them with " " would indent every line after the first by one space.
  return "".join(f"{column}: {ldf[column].values[0]}\n" for column in column_keys)

def get_movie_by_title(input):
    """Return the index label of the first movie whose title matches input.

    The lookup is case-insensitive: input is lower-cased and compared with
    df['title'], which is expected to hold lower-cased titles. If the exact
    text does not match, the lookup is retried with surrounding whitespace
    removed, since typed input often carries stray spaces.

    Args:
        input: title text to search for. The name shadows the builtin inside
            this function; it is kept so existing keyword callers still work.

    Returns:
        The index label of the first matching row of df.

    Raises:
        LookupError: if no title matches. LookupError is an Exception
            subclass, so callers catching Exception keep working.
    """
    query = input.lower()
    ldf = df[df['title'] == query]
    stripped = query.strip()
    if ldf.empty and stripped != query:
        # Exact text is tried first so a title that really contains
        # surrounding whitespace is still reachable.
        ldf = df[df['title'] == stripped]
    if ldf.empty:
        raise LookupError('Movie not found')
    return ldf.index[0]

# cleaning data
def title_lower(row):
  """Return the row's "original_title" value converted to lower case."""
  original = row["original_title"]
  return original.lower()

# Replace missing values with '' in every column in one call, so the text
# handling below never has to deal with NaN.
df = df.fillna('')

# Lower-cased title used for case-insensitive lookups. The vectorized string
# accessor avoids one Python-level function call per row.
df['title'] = df['original_title'].astype(str).str.lower()

print(f"{len(df)} Movies loaded")

# One '#'-joined text blob per movie, built from the column_keys columns.
df['combined'] = df.apply(combine, axis=1)
vector = CountVectorizer()
# Token-count matrix of the combined text, one row per movie (sparse).
# Example of its printed form:
# Coords	Values
# (0, 7900)	1
# (0, 16466)	1
# (0, 3072)	1
# (0, 15407)	1
matrix = vector.fit_transform(df['combined'])
# Pairwise cosine similarity between all movies: row i holds the similarity of
# movie i to every movie, e.g.
# [[1.         0.31311215 0.26666667 ... 0.         0.0860663  0.        ]]
# NOTE(review): with sklearn's default dense_output=True this is a dense
# n_movies x n_movies array, so memory grows quadratically with the dataset.
cosine = cosine_similarity(matrix)

print("Cosine loaded")

10866 Movies loaded
Cosine loaded


In [24]:
# Number of similar movies to list.
TOP_N = 10

print("Choose a movie title:")
# Surrounding whitespace in typed input is never part of the intended title.
user_input = input().strip()

try:
  movie_index = get_movie_by_title(user_input)
except Exception as e:
  movie_index = None
  # Show what was searched for next to the reason, instead of repeating
  # "Movie not found" twice.
  print(f"Lookup failed for {user_input!r}: {e}")

print("")

if movie_index is not None:
  similar=list(enumerate(cosine[movie_index]))
  # Exclude the query movie by index rather than assuming it sorts first.
  # A duplicate or identically described row also scores 1.0, and because
  # sorted() is stable it would come first if it appears earlier in the frame.
  sorted_movies=sorted(
    (pair for pair in similar if pair[0] != movie_index),
    key=lambda x:x[1],
    reverse=True,
  )[:TOP_N]

  print(pretty_print(movie_index)) #movie itself
  print(f'Top {TOP_N} Similar movies found:')
  for movie in sorted_movies:
    print(pretty_print(movie[0]))

Choose a movie title:
alien

original_title: Alien
 director: Ridley Scott
 keywords: android|self-destruction|countdown|space marine|space suit
 genres: Horror|Action|Thriller|Science Fiction
 release_year: 1979

Top 10 Similar movies found:
original_title: Moonraker
 director: Lewis Gilbert
 keywords: venice|mass murder|space marine|space suit|marcus square
 genres: Action|Adventure|Thriller|Science Fiction
 release_year: 1979

original_title: Aliens
 director: James Cameron
 keywords: android|extraterrestrial technology|space marine|spaceman|cryogenics
 genres: Horror|Action|Thriller|Science Fiction
 release_year: 1986

original_title: Lifeforce
 director: Tobe Hooper
 keywords: space marine|vampire|flying saucer|comet|alien
 genres: Action|Science Fiction|Thriller|Horror|Fantasy
 release_year: 1985

original_title: Planet of the Apes
 director: Tim Burton
 keywords: gorilla|space marine|space suit|revolution|chimp
 genres: Thriller|Science Fiction|Action|Adventure
 release_year: 20