In [387]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import json
from sklearn.metrics.pairwise import euclidean_distances    

In [388]:
# Load the TMDB 5000 movies metadata and preview the first two rows.
movies_data = pd.read_csv(filepath_or_buffer='tmdb_5000_movies.csv')
movies_data.head(n=2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [389]:
# Summarise column dtypes and non-null counts to spot missing values.
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [390]:
# The "genres" and "keywords" columns hold JSON-encoded strings; decode them
# into Python objects so the individual entries can be inspected later.
movies_data["genres"] = movies_data["genres"].map(json.loads)
movies_data["keywords"] = movies_data["keywords"].map(json.loads)


In [391]:
# Drop rows where BOTH genres and keywords are empty (a row is kept when at
# least one of the two lists has an entry).
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning on a slice of the original data.
has_genres = movies_data["genres"].map(len) > 0
has_keywords = movies_data["keywords"].map(len) > 0
movies_data = movies_data[has_genres | has_keywords].copy()

In [392]:
def get_genres_keywords(data):
    """Build one space-separated string from a row's genre and keyword names.

    Args:
        data: Mapping-like row exposing "genres" and "keywords", each a list
            of dicts that carry a "name" entry.

    Returns:
        The "name" values of all genres followed by all keywords, joined by
        single spaces (an empty string when both lists are empty).
    """
    return " ".join(entry["name"] for entry in data["genres"] + data["keywords"])


In [393]:
# Row-wise: combine each movie's genre and keyword names into one text field.
movies_data["keywords_genres"] = movies_data.apply(
    get_genres_keywords,
    axis="columns",
)

In [394]:
# Fit TF-IDF on the combined genre/keyword text of every movie, then encode
# the movie with index label 0 as the query vector using the same vocabulary.
vectorizer = TfidfVectorizer()
vector_data = vectorizer.fit_transform(movies_data["keywords_genres"]).toarray()
test_movie_data = vectorizer.transform(
    [movies_data["keywords_genres"].loc[0]]
).toarray()
vectorizer.get_feature_names_out()


array(['11', '15th', '16th', ..., '卧底肥妈', '绝地奶霸', '超级妈妈'], dtype=object)

In [395]:


# Distance from every movie vector to the query vector, paired with the
# movie's index label in movies_data; keep the five smallest distances.
nearest_movies = pd.DataFrame(
    {
        "distance": euclidean_distances(vector_data, test_movie_data)[:, 0],
        "Moive_index": movies_data.index,
    }
)
nearest_movies = nearest_movies.sort_values(by="distance").iloc[:5, :]


In [396]:
# Recommended movies based on the test movie.
# "Moive_index" stores index *labels* taken from movies_data.index, and those
# labels are no longer contiguous once rows have been filtered out, so rows
# must be selected by label (.loc) rather than by position (.iloc).
movies_data.loc[list(nearest_movies["Moive_index"])]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,keywords_genres
0,237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.avatarmovie.com/,19995,"[{'id': 1463, 'name': 'culture clash'}, {'id':...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Action Adventure Fantasy Science Fiction cultu...
278,100000000,"[{'id': 53, 'name': 'Thriller'}, {'id': 878, '...",,869,"[{'id': 690, 'name': 'gorilla'}, {'id': 1826, ...",en,Planet of the Apes,After a spectacular crash-landing on an unchar...,51.188633,"[{""name"": ""Twentieth Century Fox Film Corporat...",...,2001-07-25,362211740,119.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,You'll be sorry you were ever born human,Planet of the Apes,5.6,1243,Thriller Science Fiction Action Adventure gori...
2403,18500000,"[{'id': 27, 'name': 'Horror'}, {'id': 28, 'nam...",,679,"[{'id': 803, 'name': 'android'}, {'id': 1603, ...",en,Aliens,When Ripley's lifepod is found by a salvage cr...,67.66094,"[{""name"": ""Twentieth Century Fox Film Corporat...",...,1986-07-18,183316455,137.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,This Time It's War,Aliens,7.7,3220,Horror Action Thriller Science Fiction android...
838,50000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",,8077,"[{'id': 378, 'name': 'prison'}, {'id': 803, 'n...",en,Alien³,After escaping with Newt and Hicks from the al...,45.856409,"[{""name"": ""Twentieth Century Fox Film Corporat...",...,1992-05-22,159773545,114.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The bitch is back.,Alien³,6.2,1633,Science Fiction Action Horror prison android s...
541,75000000,"[{'id': 28, 'name': 'Action'}, {'id': 10752, '...",http://www.wb-soldier.com/,9425,"[{'id': 1826, 'name': 'space marine'}, {'id': ...",en,Soldier,Sergeant Todd is a veteran soldier for an elit...,11.873856,"[{""name"": ""Impact Pictures"", ""id"": 248}, {""nam...",...,1998-10-23,14567883,99.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Left for dead on a remote planet for obsolete ...,Soldier,6.1,221,Action War Science Fiction space marine dystop...
