In [2]:
import ast
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Folder holding the downloaded datasets. It defaults to the original download
# location, but can be redirected with the MOVIES_DATA_DIR environment variable
# so the notebook is not tied to one machine.
DATA_DIR = Path(os.environ.get("MOVIES_DATA_DIR", r"C:\Users\zaids\Downloads"))

# Each CSV lives inside a folder that carries the same name as the file.
movies = pd.read_csv(
    DATA_DIR / "movies_metadata.csv" / "movies_metadata.csv",
    low_memory=False,  # parse the file in one pass instead of in chunks
)
ratings = pd.read_csv(DATA_DIR / "ratings_small.csv" / "ratings_small.csv")



In [3]:
# Column labels of both tables: movie metadata first, then ratings.
print(movies.columns, ratings.columns, sep="\n")

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')
Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')


In [4]:
# dtypes of the two key columns, one per line (title, then movieId).
print(movies['title'].dtype, ratings['movieId'].dtype, sep="\n")

object
int64


In [5]:
# Normalise the key columns to strings. Missing titles are deliberately kept
# as missing values: a plain .astype(str) can turn NaN into the literal text
# "nan", which dropna(subset=['title']) is then unable to detect.
movies['title'] = movies['title'].astype(str).where(movies['title'].notna())
ratings['movieId'] = ratings['movieId'].astype(str)

In [6]:
# Re-check the dtypes of the two key columns (title, then movieId).
print(movies['title'].dtype, ratings['movieId'].dtype, sep="\n")

object
object


In [7]:
# Overview of the movie metadata: dimensions, column names, then a sample.
frame_dimensions = movies.shape
column_names = list(movies.columns)
print("Dataset shape:", frame_dimensions)
print("\nColumns:", column_names)
print("\nFirst few rows:\n")
movies.head(n=5)

Dataset shape: (45466, 24)

Columns: ['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count']

First few rows:



Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [8]:
# Clean and prepare the data
def clean_data(df):
    """Return a cleaned copy of the movie metadata with text features added.

    Rows lacking an ``overview`` or a ``title`` are dropped, then three
    columns are appended:

    * ``genres_list`` - genre names parsed from the ``genres`` column
    * ``genres_str`` - those names joined with single spaces
    * ``combined_features`` - overview, genre names and tagline joined with
      single spaces; a missing part contributes an empty string

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``overview``, ``title``, ``genres`` and
        ``tagline``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.
    """
    # Remove rows with missing essential data - use .copy() to avoid warning
    df = df.dropna(subset=['overview', 'title']).copy()

    def parse_genres(genre_str):
        """Turn the textual list of genre dicts into a list of genre names."""
        try:
            return [genre['name'] for genre in ast.literal_eval(genre_str)]
        except (ValueError, SyntaxError, TypeError, KeyError):
            # Missing or malformed entry: treat it as "no genres". Only
            # parsing problems are caught, so unrelated errors (for example a
            # missing import) surface instead of silently emptying every row.
            return []

    df['genres_list'] = df['genres'].apply(parse_genres)
    df['genres_str'] = df['genres_list'].apply(' '.join)

    # Create a combined feature column
    df['combined_features'] = (
        df['overview'].fillna('') + ' ' +
        df['genres_str'].fillna('') + ' ' +
        df['tagline'].fillna('')
    )

    return df

# Run the cleaning step on the raw metadata and report the resulting size.
movies_clean = movies.pipe(clean_data)
print(f"Cleaned dataset shape: {movies_clean.shape}")


Cleaned dataset shape: (44512, 27)


In [9]:
class MovieRecommender:
    """Content-based movie recommender built on TF-IDF text features.

    The constructor vectorises the ``combined_features`` column once.
    Similarities are then computed per query against that matrix, never for
    all pairs of movies at once.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Needs the columns ``title``, ``combined_features``, ``genres_str``,
        ``vote_average``, ``overview`` and ``release_date``.
    max_features : int, default 1000
        Vocabulary size cap handed to ``TfidfVectorizer``.
    """

    def __init__(self, movies_df, max_features=1000):
        # Renumber rows from 0 so a row label equals its TF-IDF matrix row.
        self.movies = movies_df.reset_index(drop=True)
        self.tfidf_vectorizer = None
        self.tfidf_matrix = None
        self.max_features = max_features
        self._build_model()

    def _build_model(self):
        """Fit the TF-IDF vectoriser on ``combined_features`` and keep the matrix."""
        # Use fewer features to reduce memory usage
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=self.max_features,  # vocabulary size cap
            stop_words='english',
            ngram_range=(1, 1),  # Only unigrams
            min_df=2,  # Ignore terms that appear in less than 2 documents
            max_df=0.8  # Ignore terms that appear in more than 80% of documents
        )

        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(
            self.movies['combined_features']
        )
        print(f"TF-IDF matrix shape: {self.tfidf_matrix.shape}")
        print("Model built successfully!")

    def get_recommendations(self, title, num_recommendations=10):
        """Return the movies most similar to ``title``.

        Parameters
        ----------
        title : str
            Movie title; matched exactly but case-insensitively. When several
            rows share the title, the first one is used as the query.
        num_recommendations : int, default 10
            Maximum number of rows to return.

        Returns
        -------
        pandas.DataFrame
            Columns ``title``, ``genres_str``, ``vote_average``, ``overview``,
            ``release_date`` and ``similarity_score``, best match first. An
            empty frame is returned (after printing title suggestions) when
            the title is not found.
        """
        # Find the movie index
        try:
            idx = self.movies[self.movies['title'].str.lower() == title.lower()].index[0]
        except IndexError:
            print(f"Movie '{title}' not found in the dataset.")
            return self._search_similar_titles(title)

        # Calculate similarity only for the selected movie (not full matrix)
        movie_vector = self.tfidf_matrix[idx:idx+1]
        cosine_similarities = cosine_similarity(movie_vector, self.tfidf_matrix).flatten()

        # Rank best-first, then remove the query row explicitly. Skipping
        # "the first hit" is not reliable: with tied scores (e.g. movies whose
        # feature text is identical) the query is not guaranteed to rank first.
        ranked = cosine_similarities.argsort()[::-1]
        similar_indices = ranked[ranked != idx][:max(num_recommendations, 0)]

        # Return recommended movies with details
        recommendations = self.movies.iloc[similar_indices][
            ['title', 'genres_str', 'vote_average', 'overview', 'release_date']
        ].copy()

        recommendations['similarity_score'] = cosine_similarities[similar_indices]

        return recommendations

    def _search_similar_titles(self, title):
        """Print up to five titles containing ``title``; return an empty frame."""
        # regex=False: the text comes from the user and must be matched
        # literally. Characters such as "(" or "*" would otherwise be read as
        # a regular expression and could raise or match the wrong titles.
        similar_titles = self.movies[
            self.movies['title'].str.contains(title, case=False, na=False, regex=False)
        ]['title'].head(5).tolist()

        if similar_titles:
            print("Did you mean one of these movies?")
            for i, movie in enumerate(similar_titles, 1):
                print(f"{i}. {movie}")
        else:
            print("No similar titles found.")

        return pd.DataFrame()

# Build the TF-IDF recommender on the cleaned movies (vocabulary capped at 1000 terms)
recommender = MovieRecommender(movies_df=movies_clean, max_features=1000)


TF-IDF matrix shape: (44512, 1000)
Model built successfully!


In [10]:
class OnDemandRecommender:
    """TF-IDF recommender that scores similarity only when a query arrives.

    The TF-IDF matrix over ``combined_features`` is fitted in the constructor
    (vocabulary capped at 2000 terms, English stop words removed, unigrams
    only). No pairwise similarity matrix is stored.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Needs the columns ``title``, ``combined_features``, ``genres_str``
        and ``vote_average``.
    """

    def __init__(self, movies_df):
        # Renumber rows from 0 so a row label equals its TF-IDF matrix row.
        self.movies = movies_df.reset_index(drop=True)
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=2000,
            stop_words='english',
            ngram_range=(1, 1)
        )
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(
            self.movies['combined_features']
        )
        print("Model initialized without pre-computing similarity matrix!")

    def get_recommendations(self, title, num_recommendations=10):
        """Return the movies most similar to ``title``.

        Parameters
        ----------
        title : str
            Movie title; matched exactly but case-insensitively. When several
            rows share the title, the first one is used as the query.
        num_recommendations : int, default 10
            Maximum number of rows to return.

        Returns
        -------
        pandas.DataFrame
            Columns ``title``, ``genres_str``, ``vote_average`` and
            ``similarity_score``, best match first. An empty frame is
            returned when the title is not found.
        """
        try:
            idx = self.movies[self.movies['title'].str.lower() == title.lower()].index[0]
        except IndexError:
            print(f"Movie '{title}' not found.")
            return pd.DataFrame()

        # Calculate similarity on-demand for just this movie; the slice keeps
        # the query two-dimensional (one row).
        query_vector = self.tfidf_matrix[idx:idx+1]
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()

        # Rank best-first and drop the query row explicitly. Assuming the
        # query is always the single top-scoring entry breaks on tied scores
        # (e.g. another movie with identical feature text).
        ranked = similarities.argsort()[::-1]
        top_indices = ranked[ranked != idx][:max(num_recommendations, 0)]

        recommendations = self.movies.iloc[top_indices].copy()
        recommendations['similarity_score'] = similarities[top_indices]

        return recommendations[['title', 'genres_str', 'vote_average', 'similarity_score']]

# Instantiate the recommender that scores similarity per query
efficient_recommender = OnDemandRecommender(movies_df=movies_clean)


Model initialized without pre-computing similarity matrix!


In [None]:
# Ask for a movie title and show the five closest matches
print("=== Testing Recommendations ===")
# Strip surrounding whitespace: the lookup is an exact (case-insensitive) title
# comparison, so a stray leading or trailing space would report "not found".
User_Movie_Response = input("Enter Movie Name").strip()
recommendations = efficient_recommender.get_recommendations(User_Movie_Response, num_recommendations=5)

# An empty frame means the title was not found (the recommender already said so).
if not recommendations.empty:
    print(f"\nMovies similar to '{User_Movie_Response}':")
    for idx, row in recommendations.iterrows():
        print(f"• {row['title']} (Similarity: {row['similarity_score']:.3f})")
        print(f"  Genres: {row['genres_str']}")
        print(f"  Rating: {row['vote_average']}/10\n")


=== Testing Recommendations ===


In [12]:
def clean_and_filter_data():
    # NOTE(review): only a `pass` stub is visible here, so as written this
    # takes no arguments and does nothing - TODO implement or remove.
    pass