In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pickle

In [2]:
# Load the cleaned movie dataset; the path is relative to the notebook's
# working directory.
data = pd.read_csv("clean_movie_data.csv")
# Preview the first rows to check the columns that were read.
data.head()

Unnamed: 0,Movie Name,Year of Release,Watch Time,Genre,Movie Rating,Metascore of movie,Director,Cast,Votes,Description
0,Oppenheimer,2023,180 min,"\nBiography, Drama, History",8.6,88.0,Christopher Nolan,"Cillian Murphy,Emily Blunt,Matt Damon,Robert D...",368392,"The story of American scientist, J. Robert Opp..."
1,Barbie,2023,114 min,"\nAdventure, Comedy, Fantasy",7.4,80.0,Greta Gerwig,"Margot Robbie,Ryan Gosling,Issa Rae,Kate McKinnon",246653,Barbie suffers a crisis that leads her to ques...
2,Heart of Stone,2023,122 min,"\nAction, Crime, Thriller",5.7,44.0,Tom Harper,"Gal Gadot,Jamie Dornan,Alia Bhatt,Jing Lusi",29166,An intelligence operative for a shadowy global...
3,"Red, White & Royal Blue",2023,118 min,"\nComedy, Romance",7.1,62.0,Matthew López,"Taylor Zakhar Perez,Nicholas Galitzine,Uma Thu...",22778,When the feud between the son of the American ...
4,Blue Beetle,2023,127 min,"\nAction, Adventure, Sci-Fi",6.8,61.0,Angel Manuel Soto,"Xolo Maridueña,Bruna Marquezine,Becky G,Damián...",11680,An alien scarab chooses college graduate Jaime...


In [3]:
# Use the shorter column name 'Title' for the movie name from here on.
data = data.rename(columns={'Movie Name': 'Title'})

In [4]:
# Preview again to confirm the column is now called 'Title'.
data.head()

Unnamed: 0,Title,Year of Release,Watch Time,Genre,Movie Rating,Metascore of movie,Director,Cast,Votes,Description
0,Oppenheimer,2023,180 min,"\nBiography, Drama, History",8.6,88.0,Christopher Nolan,"Cillian Murphy,Emily Blunt,Matt Damon,Robert D...",368392,"The story of American scientist, J. Robert Opp..."
1,Barbie,2023,114 min,"\nAdventure, Comedy, Fantasy",7.4,80.0,Greta Gerwig,"Margot Robbie,Ryan Gosling,Issa Rae,Kate McKinnon",246653,Barbie suffers a crisis that leads her to ques...
2,Heart of Stone,2023,122 min,"\nAction, Crime, Thriller",5.7,44.0,Tom Harper,"Gal Gadot,Jamie Dornan,Alia Bhatt,Jing Lusi",29166,An intelligence operative for a shadowy global...
3,"Red, White & Royal Blue",2023,118 min,"\nComedy, Romance",7.1,62.0,Matthew López,"Taylor Zakhar Perez,Nicholas Galitzine,Uma Thu...",22778,When the feud between the son of the American ...
4,Blue Beetle,2023,127 min,"\nAction, Adventure, Sci-Fi",6.8,61.0,Angel Manuel Soto,"Xolo Maridueña,Bruna Marquezine,Becky G,Damián...",11680,An alien scarab chooses college graduate Jaime...


In [5]:
# Summarise column dtypes and non-null counts to see which columns have gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 906 entries, 0 to 905
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               906 non-null    object 
 1   Year of Release     906 non-null    int64  
 2   Watch Time          880 non-null    object 
 3   Genre               906 non-null    object 
 4   Movie Rating        862 non-null    float64
 5   Metascore of movie  826 non-null    float64
 6   Director            906 non-null    object 
 7   Cast                906 non-null    object 
 8   Votes               862 non-null    object 
 9   Description         906 non-null    object 
dtypes: float64(2), int64(1), object(7)
memory usage: 70.9+ KB


In [6]:
# Rename the column labelled 'Unnamed: 0' to 'movie_id'.
# NOTE(review): the data.info() listings in this notebook show neither an
# 'Unnamed: 0' nor a 'movie_id' column, so this rename appears to match
# nothing and leave the frame unchanged — confirm whether a 'movie_id'
# column is actually needed downstream.
data.rename(columns={'Unnamed: 0': 'movie_id'}, inplace=True)

In [9]:
# Text columns that are combined into the per-movie feature string below.
columns=['Cast','Director','Genre','Title','Description']

In [10]:
# True if any of the selected text columns has a missing value. It must be
# False, because these columns are concatenated as plain strings further down.
data[columns].isna().to_numpy().any()

False

In [11]:
def get_important_features(data):
    """Build one combined text string per movie.

    For every row the Title, Director, Genre, Description and Cast values are
    joined, in that order, with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the string columns 'Title', 'Director', 'Genre',
        'Description' and 'Cast'.

    Returns
    -------
    list of str
        One combined string per row, in row order. Empty list for an empty
        frame.

    Raises
    ------
    KeyError
        If one of the five columns is missing.
    TypeError
        If any of the values is not a string (e.g. a missing value).
    """
    # Iterate over the rows positionally so the result does not depend on the
    # frame having a default 0..n-1 index, and so that each movie gets its
    # *own* cast string rather than the whole 'Cast' column.
    rows = zip(
        data['Title'],
        data['Director'],
        data['Genre'],
        data['Description'],
        data['Cast'],
    )
    return [' '.join(fields) for fields in rows]

In [12]:
# Store the combined per-movie text returned by get_important_features in a
# new 'important_features' column.
data['important_features']=get_important_features(data)

In [13]:
# Vectorise the plot descriptions with TF-IDF, ignoring English stop words.
# NOTE(review): only 'Description' is vectorised; the 'important_features'
# column built above is not used by this cell — confirm that is intended.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['Description'])
# Print the matrix dimensions: (number of movies, vocabulary size).
print(tfidf_matrix.shape)

(906, 5330)


In [14]:
# Pairwise similarity of every movie with every other movie (one row and one
# column per movie). linear_kernel is a plain dot product; it equals cosine
# similarity provided the TF-IDF rows are L2-normalised, which is
# TfidfVectorizer's documented default (norm='l2').
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [15]:
# Reverse lookup: movie title -> row number in `data` (and therefore row in
# `cosine_sim`).
# Series.drop_duplicates() compares the *values* — the row numbers, which are
# already unique — so it never removed repeated titles. De-duplicate on the
# index (the titles) instead, keeping the first occurrence, so that
# indices[title] is always a single integer and never a Series.
title_to_row = pd.Series(data.index, index=data['Title'])
indices = title_to_row[~title_to_row.index.duplicated(keep='first')]

In [16]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title as stored in ``data['Title']``.
    cosine_sim : array-like, optional
        Square similarity matrix whose rows and columns follow the row order
        of ``data``. Defaults to the module-level ``cosine_sim``.
    top_n : int, optional
        Maximum number of recommendations to return (default 5).

    Returns
    -------
    pandas.DataFrame
        Columns ``Movies`` (title) and ``SimilarityScore``, ordered from most
        to least similar, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if title not in indices.index:
        raise KeyError(f"Title not found in the dataset: {title!r}")

    idx = indices[title]
    # A title that occurs more than once makes the label lookup return a
    # Series; fall back to the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every other movie with its similarity to the query movie. The query
    # row is excluded explicitly instead of assuming it sorts first: a movie
    # with an identical score could otherwise take its place and the query
    # movie would show up in its own recommendations.
    scored = [
        (row, score)
        for row, score in enumerate(cosine_sim[idx])
        if row != idx
    ]
    # Stable sort, highest similarity first.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top_matches = scored[:top_n]

    movie_indices = [row for row, _ in top_matches]
    return pd.DataFrame({
        "Movies": data['Title'].iloc[movie_indices].reset_index(drop=True),
        "SimilarityScore": [score for _, score in top_matches],
    })

In [17]:
# Example query; the title must match an entry of data['Title'] exactly.
get_recommendations('Spider-Man: Far from Home')

Unnamed: 0,Movies,SimilarityScore
0,A Quiet Place Part II,0.188271
1,Avengers: Endgame,0.177398
2,Joker: Folie à Deux,0.155667
3,Spider-Man: Into the Spider-Verse,0.153504
4,Asteroid City,0.141503


In [18]:
# Second example query, for a title from a different genre.
get_recommendations('Barbie')

Unnamed: 0,Movies,SimilarityScore
0,Passages,0.139777
1,American Beauty,0.138918
2,Harry Potter and the Deathly Hallows: Part 1,0.129069
3,Mother!,0.120637
4,She Came to Me,0.119428


In [19]:
# Re-inspect the frame, then build the slimmed-down table that gets pickled.
data.info()
# Going by the info() listing, dropping these nine columns leaves only
# 'Title' and 'important_features' in `new`.
new = data.drop(columns=['Year of Release','Watch Time','Genre','Movie Rating',
                         'Metascore of movie','Director','Cast','Votes','Description'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 906 entries, 0 to 905
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               906 non-null    object 
 1   Year of Release     906 non-null    int64  
 2   Watch Time          880 non-null    object 
 3   Genre               906 non-null    object 
 4   Movie Rating        862 non-null    float64
 5   Metascore of movie  826 non-null    float64
 6   Director            906 non-null    object 
 7   Cast                906 non-null    object 
 8   Votes               862 non-null    object 
 9   Description         906 non-null    object 
 10  important_features  906 non-null    object 
dtypes: float64(2), int64(1), object(8)
memory usage: 78.0+ KB


In [20]:
# Persist the movie table and the similarity matrix. The context managers
# guarantee each file is flushed and closed, even if pickling raises; the bare
# open(...) calls used before left the handles open.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(cosine_sim, similarity_file)