# RECOMMENDATION ENGINE USING COSINE SIMILARITY

**IMPORTING THE LIBRARIES**

In [52]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import warnings
warnings.filterwarnings('ignore')


**READING IN THE DATA**

In [54]:
# 1. DATA LOADING AND PREPROCESSING

# Load the dataset
df = pd.read_csv(r'C:\Users\USER\Desktop\PDF\movies.csv')
print(f" Dataset loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns")
        


 Dataset loaded successfully with 9999 rows and 9 columns


**LIST OF THE DATA**

In [55]:

# Display column names for reference
print(f"üìã Available columns: {list(df.columns)}")


üìã Available columns: ['MOVIES', 'YEAR', 'GENRE', 'RATING', 'ONE-LINE', 'STARS', 'VOTES', 'RunTime', 'Gross']


**COPYING THE DATASET NEEDED**

In [56]:
# Select relevant columns for our recommendation system
# Assuming the dataset has columns: MOVIES, GENRE, and ONE-LINE
df = df[['MOVIES', 'GENRE', 'ONE-LINE']].copy()
df


Unnamed: 0,MOVIES,GENRE,ONE-LINE
0,Blood Red Sky,"\nAction, Horror, Thriller",\nA woman with a mysterious illness is forced ...
1,Masters of the Universe: Revelation,"\nAnimation, Action, Adventure",\nThe war for Eternia begins again in what may...
2,The Walking Dead,"\nDrama, Horror, Thriller",\nSheriff Deputy Rick Grimes wakes up from a c...
3,Rick and Morty,"\nAnimation, Adventure, Comedy",\nAn animated series that follows the exploits...
4,Army of Thieves,"\nAction, Crime, Horror","\nA prequel, set before the events of Army of ..."
...,...,...,...
9994,The Imperfects,"\nAdventure, Drama, Fantasy",\nAdd a Plot\n
9995,Arcane,"\nAnimation, Action, Adventure",\nAdd a Plot\n
9996,Heart of Invictus,"\nDocumentary, Sport",\nAdd a Plot\n
9997,The Imperfects,"\nAdventure, Drama, Fantasy",\nAdd a Plot\n


**CHECKING FOR MISSING VALUES**

In [57]:
# Check for missing values
print(f"üîç Missing values before cleaning:")
print(df.isnull().sum())


üîç Missing values before cleaning:
MOVIES       0
GENRE       80
ONE-LINE     0
dtype: int64


**DROPPING MISSING VALUES**

In [58]:
df.dropna(inplace=True)


**CHECKING AND DROPPING DUPLICATES**

In [59]:
# Check for duplicates
print(f"üîç Number of duplicate rows: {df.duplicated().sum()}")
df.drop_duplicates(inplace=True)


üîç Number of duplicate rows: 847


**CHECKING AFTER DATA CLEANING**

In [60]:
# Check for duplicates
print(f"üîç Number of duplicate rows: {df.duplicated().sum()}")
        
        # Basic data information
print(f" Dataset info after preprocessing:")
print(f" Total movies: {len(df)}")
print(f" Unique genres: {df['GENRE'].nunique()}")
    

üîç Number of duplicate rows: 0
 Dataset info after preprocessing:
 Total movies: 9072
 Unique genres: 510


**DATA PREPROCESSING**

In [61]:
# =============================================================================
# 2. TEXT PREPROCESSING AND FEATURE ENGINEERING
# =============================================================================
def preprocess_text(text):

    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove specific unwanted phrases
    if text == 'add a plot' :
        df.drop(df[df['ONE-LINE'] == 'add a plot'].index, inplace=True)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


**APPLYING PREPROCESSING STEPS**

In [62]:
for col in ['MOVIES', 'GENRE', 'ONE-LINE']:
    df[col] = df[col].apply(preprocess_text)
    


In [63]:

# Combine genre and one-line description to create content for recommendations
df['content'] = df['GENRE'] + " " + df['ONE-LINE']

print(" the sample of created content features:")
df.head()


 the sample of created content features:


Unnamed: 0,MOVIES,GENRE,ONE-LINE,content
0,blood red sky,action horror thriller,a woman with a mysterious illness is forced in...,action horror thriller a woman with a mysterio...
1,masters of the universe revelation,animation action adventure,the war for eternia begins again in what may b...,animation action adventure the war for eternia...
2,the walking dead,drama horror thriller,sheriff deputy rick grimes wakes up from a com...,drama horror thriller sheriff deputy rick grim...
3,rick and morty,animation adventure comedy,an animated series that follows the exploits o...,animation adventure comedy an animated series ...
4,army of thieves,action crime horror,a prequel set before the events of army of the...,action crime horror a prequel set before the e...


**INITIALIZING VECTORIZER**

In [64]:
# Initialize TF-IDF Vectorizer with specific parameters
# - stop_words="english": removes common English words like "the", "and", etc.
# - max_df=0.7: ignores terms that appear in more than 70% of documents
# - min_df=2: ignores terms that appear in fewer than 2 documents
# - ngram_range=(1, 2): considers both single words and word pairs
# - max_features=10000: limits the number of features to manage memory
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.7,
    min_df=2,
    ngram_range=(1, 2),
    max_features=10000
)

# Fit and transform the text content into TF-IDF vectors
tfidf_matrix = vectorizer.fit_transform(df['content'])

print(f"‚úÖ TF-IDF matrix created with shape: {tfidf_matrix.shape}")
print(f"   Vocabulary size: {len(vectorizer.get_feature_names_out())}")


‚úÖ TF-IDF matrix created with shape: (9072, 10000)
   Vocabulary size: 10000


**COSINE SIMILARITY**

In [41]:
# =============================================================================
# 4. COSINE SIMILARITY CALCULATION
# =============================================================================
print("\n" + "=" * 60)
print(" CALCULATING COSINE SIMILARITY")
print("=" * 60)

# Compute cosine similarity between all movies based on their TF-IDF vectors
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

print(f"‚úÖ Cosine similarity matrix computed with shape: {cosine_sim.shape}")



 CALCULATING COSINE SIMILARITY
‚úÖ Cosine similarity matrix computed with shape: (8662, 8662)


**CREATING INDEX MAPPING**

In [67]:
print("\n" + "=" * 60)
print("üéØ BUILDING RECOMMENDATION ENGINE")
print("=" * 60)

# Reset index for easier lookup and create mapping from movie titles to indices
df = df.reset_index(drop=True) # Reset index after dropping rows and prevents the use of old indexes
indices = pd.Series(df.index, index=df['MOVIES']).drop_duplicates()

print(f"‚úÖ Created index mapping for {len(indices)} movies")




üéØ BUILDING RECOMMENDATION ENGINE
‚úÖ Created index mapping for 9072 movies


In [68]:
# ==========================================================================
# 6. RECOMMENDATION FUNCTION
# =============================================================================
# Define the recommendation function
def recommend_movies(movie_title, n=5, genre_filter=None):
    
    # Check if movie exists in the dataset
    if movie_title not in indices:
        # Try to find similar titles in case of typo or case mismatch
        similar_titles = [title for title in indices.index if movie_title.lower() in title.lower()]
        if similar_titles:
            return f" Movie '{movie_title}' not found. Did you mean: {', '.join(similar_titles[:3])}?"
        else:
            return f" Movie '{movie_title}' not found in dataset!"
    
    # Get the index of the movie
    idx = indices[movie_title]
    
    # Get pairwise similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get scores of the top n+5 most similar movies (including itself)
    sim_scores = sim_scores[1:n+6]  # Get extra in case we need to filter by genre
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Get the similarity scores for recommended movies
    similarity_values = [i[1] for i in sim_scores]
    
    # Create a DataFrame with the recommendations
    recommendations = df[['MOVIES', 'GENRE']].iloc[movie_indices].copy()
    recommendations['Similarity_Score'] = similarity_values
    
    # Apply genre filter if specified
    if genre_filter:
        genre_filter = genre_filter.lower()
        recommendations = recommendations[
            recommendations['GENRE'].str.lower().str.contains(genre_filter)
        ]
    
    # Return top n recommendations
    return recommendations.head(n)

print("‚úÖ Recommendation function created successfully!")


‚úÖ Recommendation function created successfully!


In [69]:

# =============================================================================
# 7. TESTING THE RECOMMENDATION SYSTEM
# =============================================================================
print("\n" + "=" * 60)
print(" TESTING THE RECOMMENDATION SYSTEM")
print("=" * 60)

# Test with a movie title
test_movie = "blood red sky"  # Change this to any movie in your dataset

print(f"\n Recommendations for '{test_movie}':")
recommendations = recommend_movies(test_movie, n=5)



# Demonstrate with genre filtering
print(f"\n Recommendations for '{test_movie}' in similar genre:")
genre_filtered_recommendations = recommend_movies(test_movie, n=5, genre_filter="horror")

if isinstance(genre_filtered_recommendations, pd.DataFrame):
    print(genre_filtered_recommendations.to_string(index=False))
else:
    print(genre_filtered_recommendations)

# Show sample of available movies
print(f"\n Sample of available movies in dataset:")
df['MOVIES'].head(10).to_string(index=False)

print("\n" + "=" * 60)
print(" MOVIE RECOMMENDATION SYSTEM READY!")
print("=" * 60)



 TESTING THE RECOMMENDATION SYSTEM

 Recommendations for 'blood red sky':

 Recommendations for 'blood red sky' in similar genre:
       MOVIES                 GENRE  Similarity_Score
       flight    drama horror scifi          0.276163
the bad batch action horror mystery          0.180477

 Sample of available movies in dataset:

 MOVIE RECOMMENDATION SYSTEM READY!
