In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
movies=pd.read_csv('data/movies.csv')
credits=pd.read_csv('data/credits.csv')
print("Datasets loaded successfully.")

Datasets loaded successfully.


here we just need to check for missing values and doing a mindful check of the dataset.

In [3]:
movies.columns
movies_clean=movies.drop(['homepage','runtime','budget','revenue','release_date','status','original_language','original_title','production_countries','spoken_languages','production_companies'],axis=1,inplace=False)
print("Done")

Done


In [4]:

movies_clean.dropna(inplace=True)
movies_clean['overview'] = movies_clean['overview'].fillna('')
movies_clean['tagline'] = movies_clean['tagline'].fillna('')
movies_clean.isnull().sum()

genres          0
id              0
keywords        0
overview        0
popularity      0
tagline         0
title           0
vote_average    0
vote_count      0
dtype: int64

In [5]:
import json
def extract_names(text):
    try:
        data=json.loads(text)
        return " ".join([d['name'] for d in data])
    except:
        return ""
movies_clean['genres']=movies_clean['genres'].apply(extract_names)
movies_clean['keywords']=movies_clean['keywords'].apply(extract_names)
print("Extracted names from genres and keywords.")

Extracted names from genres and keywords.


In [6]:
movies_clean.isnull().sum()
movies_clean['title']

0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
3                          The Dark Knight Rises
4                                    John Carter
                          ...                   
4795                                        Bang
4796                                      Primer
4798                                 El Mariachi
4799                                   Newlyweds
4801                            Shanghai Calling
Name: title, Length: 3959, dtype: object

In [7]:
movies_clean['tags']=movies_clean['title']+" "+movies_clean['overview']+" "+movies_clean['genres']+" "+movies_clean['keywords']
movies_final=movies_clean.drop(['overview','genres','keywords'],axis=1,inplace=False)
movies_final['tags']=movies_final['tags'].apply(lambda x:x.lower())
print("Done")

Done


In [8]:
movies_final.head()

Unnamed: 0,id,popularity,tagline,title,vote_average,vote_count,tags
0,19995,150.437577,Enter the World of Pandora.,Avatar,7.2,11800,"avatar in the 22nd century, a paraplegic marin..."
1,285,139.082615,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,pirates of the caribbean: at world's end capta...
2,206647,107.376788,A Plan No One Escapes,Spectre,6.3,4466,spectre a cryptic message from bond’s past sen...
3,49026,112.31295,The Legend Ends,The Dark Knight Rises,7.6,9106,the dark knight rises following the death of d...
4,49529,43.926995,"Lost in our world, found in another.",John Carter,6.1,2124,"john carter john carter is a war-weary, former..."


In [9]:
movies_final['tags']

0       avatar in the 22nd century, a paraplegic marin...
1       pirates of the caribbean: at world's end capta...
2       spectre a cryptic message from bond’s past sen...
3       the dark knight rises following the death of d...
4       john carter john carter is a war-weary, former...
                              ...                        
4795    bang a young woman in l.a. is having a bad day...
4796    primer friends/fledgling entrepreneurs invent ...
4798    el mariachi el mariachi just wants to play his...
4799    newlyweds a newlywed couple's honeymoon is upe...
4801    shanghai calling when ambitious new york attor...
Name: tags, Length: 3959, dtype: object

Now that the data is ready for the sentence transformer it's time to encode it into sentence transformer for further processing

In [10]:
from sentence_transformers import SentenceTransformer
model=SentenceTransformer('all-MiniLM-L6-v2')

print("Model loaded successfully.")
print("Generating embeddings for movie tags...")
vectors=model.encode(movies_final['tags'].tolist())

print("Embeddings generated successfully.")
print("Vector shape:", vectors.shape)


Model loaded successfully.
Generating embeddings for movie tags...
Embeddings generated successfully.
Vector shape: (3959, 384)


In [11]:
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity(vectors)
print("Cosine similarity matrix computed.")
print("Similarity matrix shape:", similarity.shape)

Cosine similarity matrix computed.
Similarity matrix shape: (3959, 3959)


In [12]:
def recommend(movie):
    indices = pd.Series(movies_final.index, index=movies_final['title']).drop_duplicates() # this makes the dataframe indexes and titles as the index for the dataframe
    idx=indices[movie] # find the movie for a given index

    similarity_scores=list(enumerate(similarity[idx])) # (1,89.0) 1-> enumeration and 0.89-> similarity score
    similarity_scores=sorted(similarity_scores,key=lambda x:x[1],reverse=True) # sort in descending order
    similarity_scores=similarity_scores[1:6] # display top 5 

    movie_indices=[i[0] for i in similarity_scores] # find the index of the movie
    return movies_final['title'].iloc[movie_indices] # find the title of the movie based on the index

In [13]:
recommend('Avatar')

61                      Jupiter Ascending
2403                               Aliens
539                            Titan A.E.
168     Final Fantasy: The Spirits Within
1294                             Serenity
Name: title, dtype: object

In [14]:
recommend('Avengers: Age of Ultron')

26    Captain America: Civil War
16                  The Avengers
68                      Iron Man
79                    Iron Man 2
31                    Iron Man 3
Name: title, dtype: object

In [15]:
recommend('The Dark Knight Rises')

65      The Dark Knight
1359             Batman
428      Batman Returns
119       Batman Begins
299      Batman Forever
Name: title, dtype: object

In [16]:
import pickle

with open('recommender_model.pkl', 'wb') as f:
    pickle.dump(similarity, f)

print('Model Saved')

Model Saved



In [17]:
pickle.dump(movies_final, open('final_df.pkl', 'wb'))
print("done")

done
