In [1]:
import zipfile
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# Archive path and the name of the CSV member to read from it.
zip_path = 'archive (1).zip'
csv_filename = 'tmdb_5000_movies.csv'

# Read the CSV member directly from the archive into a DataFrame;
# both handles are closed when the with-block exits.
with zipfile.ZipFile(zip_path) as z, z.open(csv_filename) as f:
    df = pd.read_csv(f)

print(df.head())

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [16]:
# Replace missing overviews with empty strings so no NaN value is left in the column.
df['overview'] = df['overview'].fillna(value='')

In [17]:
# Fit a TF-IDF model on the overview column, with English stop words removed.
# fit_transform learns the vocabulary and returns one matrix row per row of df,
# in the same order.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['overview'])

# Report (number of documents, vocabulary size).
print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (4803, 20978)


In [11]:
def recommend_movies(query: str, vectorizer, tfidf_matrix, df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """
    Rank movies by cosine similarity between a free-text query and each overview.

    The query is projected into the vectorizer's TF-IDF space, compared against
    every row of ``tfidf_matrix``, and the ``top_n`` highest-scoring rows of
    ``df`` are returned, best match first.

    Args:
        query (str): User input describing movie preferences.
        vectorizer: Fitted vectorizer exposing ``transform``; it must be the one
            that produced ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix of movie overviews. Row ``i`` must correspond
            to row ``i`` of ``df`` by position, because results are looked up
            with ``df.iloc``.
        df (pd.DataFrame): Original movie DataFrame; must contain a ``title`` column.
        top_n (int): Number of top recommendations to return. Values of zero or
            below yield an empty result; values larger than the number of rows
            return every row.

    Returns:
        pd.DataFrame: Columns ``title`` and ``similarity``, sorted by descending
        similarity and keeping the original index labels of ``df``. Rows with a
        similarity of 0 can appear when fewer than ``top_n`` overviews share any
        term with the query.
    """
    # Transform the query to TF-IDF vector space
    query_vec = vectorizer.transform([query])

    # One similarity score per movie, flattened from shape (1, n_movies)
    scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Reverse first, then take a prefix. The previous form `argsort()[-top_n:]`
    # turned into `[-0:]` (the whole array) for top_n == 0 and returned every
    # movie; slicing `[:k]` yields the same ordering for positive values and a
    # correctly empty selection otherwise.
    k = max(top_n, 0)
    top_indices = scores.argsort()[::-1][:k]

    # Copy so that adding the score column never touches the caller's frame
    recommendations = df.iloc[top_indices].copy()
    recommendations['similarity'] = scores[top_indices]

    return recommendations[['title', 'similarity']]

In [13]:
# Example: a free-text description of the kind of movie wanted
query = "I love thrilling action movies set in space, with a comedic twist."

# Score every overview against the query and keep the five best matches
recommendations = recommend_movies(
    query=query,
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
    df=df,
    top_n=5,
)

print("Recommendations for query:", query)
display(recommendations)

Recommendations for query: I love thrilling action movies set in space, with a comedic twist.


Unnamed: 0,title,similarity
880,Grindhouse,0.179283
603,Hard Rain,0.157646
1054,Scary Movie 2,0.156413
1709,Space Pirate Captain Harlock,0.153771
4453,Crying with Laughter,0.14972


In [19]:
# Example: a short keyword-style query
query = "22nd century"

# Score every overview against the query and keep the five best matches
recommendations = recommend_movies(
    query=query,
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
    df=df,
    top_n=5,
)

print("Recommendations for query:", query)
display(recommendations)

Recommendations for query: 22nd century


Unnamed: 0,title,similarity
0,Avatar,0.364566
634,The Matrix,0.362916
1341,The Inhabited Island,0.339946
775,Supernova,0.259156
1274,Just Visiting,0.198323


In [21]:
# Show the overview text of the row(s) whose original_title is "The Matrix".
# .loc selects rows and column in a single step and raises KeyError if the
# column is absent, whereas DataFrame.get would silently return None.
df.loc[df['original_title'] == 'The Matrix', 'overview']

634    Set in the 22nd century, The Matrix tells the ...
Name: overview, dtype: object