In [1]:
import pandas as pd

def read_dataset(path: str = "./TMDB_movie_dataset_v11.csv") -> pd.DataFrame:
    """Load the movie dataset CSV into a pandas DataFrame.

    Args:
        path: Location of the CSV file. Defaults to the TMDB export that sits
            next to the notebook, so existing ``read_dataset()`` calls behave
            exactly as before.

    Returns:
        The parsed CSV as a DataFrame, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(path)

def clean_dataset(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Fill missing overviews and drop rows whose title was already seen.

    The frame is modified in place (as before) and is now also returned, so
    both ``clean_dataset(df)`` and ``df = clean_dataset(df)`` work.

    Args:
        dataframe: Frame with at least ``overview`` and ``title`` columns.

    Returns:
        The same DataFrame object, with missing ``overview`` values replaced
        by ``''`` and only the first row kept for each distinct ``title``.
    """
    dataframe['overview'] = dataframe['overview'].fillna('')
    # With inplace=True, drop_duplicates returns None; returning that call's
    # result made this function always return None. Return the frame instead.
    dataframe.drop_duplicates(subset='title', keep='first', inplace=True)
    return dataframe

# Allow up to 500 rows to be shown when a frame is displayed
pd.set_option('display.max_rows', 500)

# Load the dataset, keeping only the first row encountered for each title
dataframe = read_dataset().drop_duplicates(subset='title', keep='first')

# Replace missing values with empty strings in the text columns used later
for text_column in ('overview', 'genres', 'keywords', 'original_language'):
    dataframe[text_column] = dataframe[text_column].fillna('')


In [2]:
# Show the column labels, then the keywords entry of the first row
print(dataframe.columns)
print(dataframe['keywords'].head(1))


Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')
0    rescue, mission, dream, airplane, paris, franc...
Name: keywords, dtype: object


In [3]:
# Build one text field per row: overview, genres, keywords, original language,
# then vote average and vote count (as strings), each separated by one space
feature_parts = [
    dataframe['overview'],
    dataframe['genres'],
    dataframe['keywords'],
    dataframe['original_language'],
    dataframe['vote_average'].astype(str),
    dataframe['vote_count'].astype(str),
]
combined = feature_parts[0]
for part in feature_parts[1:]:
    combined = combined + ' ' + part
dataframe['combined_features'] = combined

# Drop the temporaries so no extra names are left in the notebook namespace
del feature_parts, combined, part

# Show the combined text of the first row
print(dataframe['combined_features'].iloc[0])

Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person's idea into a target's subconscious. Action, Science Fiction, Adventure rescue, mission, dream, airplane, paris, france, virtual reality, kidnapping, philosophy, spy, allegory, manipulation, car crash, heist, memory, architecture, los angeles, california, dream world, subconscious en 8.364 34495


In [4]:
# Vectorise the combined text with TF-IDF, ignoring English stop words
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    stop_words='english',
)
tfidf_matrix = tfidf.fit_transform(
    dataframe['combined_features'],
)

# Print the dimensions of the resulting TF-IDF matrix
print(tfidf_matrix.shape)

(898302, 387358)


In [5]:
import numbers

from sklearn.metrics.pairwise import cosine_similarity  # unused below; kept in case other cells call it
from sklearn.preprocessing import normalize


class _LazyCosineSimilarity:
    """Cosine similarities computed one row (or batch of rows) at a time.

    A dense all-pairs matrix for the 898,302 rows reported by the TF-IDF cell
    would hold about 8.1e11 float64 values (~6.5 TB), so
    ``cosine_similarity(tfidf_matrix, tfidf_matrix)`` cannot complete on
    ordinary hardware. This wrapper keeps the ``cosine_sim[i]`` lookup but
    only materialises the rows that are actually requested.
    """

    def __init__(self, matrix):
        """Store an L2-normalised copy of ``matrix`` (rows are samples)."""
        # Normalising once turns every later lookup into a sparse dot product.
        # copy=True leaves the caller's matrix untouched.
        self._unit_rows = normalize(matrix, norm='l2', axis=1, copy=True)
        n_rows = matrix.shape[0]
        self.shape = (n_rows, n_rows)

    def __len__(self):
        return self.shape[0]

    def __getitem__(self, rows):
        """Return similarities of the requested row(s) against every row.

        An integer gives a 1-D array of length ``shape[1]``; a slice, list or
        array of indices gives a 2-D array with one row per requested index.
        ``cosine_sim[i, j]`` is answered by computing row(s) ``i`` and then
        indexing the last axis with ``j``.
        """
        if isinstance(rows, tuple):
            row_key, col_key = rows
            return self[row_key][..., col_key]
        is_single = isinstance(rows, numbers.Integral)
        # Wrap a lone index in a list so the selection is always 2-D.
        selected = self._unit_rows[[rows]] if is_single else self._unit_rows[rows]
        scores = (selected @ self._unit_rows.T).toarray()
        return scores[0] if is_single else scores


# Same name and row-indexing behaviour as the dense matrix, without ever
# allocating it.
cosine_sim = _LazyCosineSimilarity(tfidf_matrix)


: 