In [42]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval

In [22]:
ratings = pd.read_csv('ratings_small.csv')
keywords = pd.read_csv('keywords.csv')
movies_metadata = pd.read_csv('movies_metadata.csv', low_memory=False)
credits = pd.read_csv('credits.csv')

In [26]:
# Displaying the first few rows of each dataset to understand their structure
ratings_head = ratings.head()
keywords_head = keywords.head()
movies_metadata_head = movies_metadata.head()
credits_head = credits.head()

In [28]:
# Displaying the column information for each dataset
ratings_info = ratings.info()
keywords_info = keywords.info()
movies_metadata_info = movies_metadata.info()
credits_info = credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_c

In [30]:
ratings_head, keywords_head, movies_metadata_head, credits_head

(   userId  movieId  rating   timestamp
 0       1       31     2.5  1260759144
 1       1     1029     3.0  1260759179
 2       1     1061     3.0  1260759182
 3       1     1129     2.0  1260759185
 4       1     1172     4.0  1260759205,
       id                                           keywords
 0    862  [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...
 1   8844  [{'id': 10090, 'name': 'board game'}, {'id': 1...
 2  15602  [{'id': 1495, 'name': 'fishing'}, {'id': 12392...
 3  31357  [{'id': 818, 'name': 'based on novel'}, {'id':...
 4  11862  [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...,
    adult                              belongs_to_collection    budget  \
 0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
 1  False                                                NaN  65000000   
 2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
 3  False                                                NaN  16000000   
 4  False  {'id': 9687

In [32]:
# Convert 'id' column to integer in movies_metadata for consistent merging
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')

# Merge the datasets on the 'id' column
merged_data = pd.merge(movies_metadata[['id', 'title', 'genres', 'overview', 'vote_average']],
                       keywords,
                       on='id',
                       how='left')

In [34]:
# Function to extract genres from the JSON-like string in the 'genres' column
def extract_genres(genres_str):
    try:
        genres_list = literal_eval(genres_str)
        return [genre['name'] for genre in genres_list]
    except:
        return []

In [36]:
# Function to extract keywords from the JSON-like string in the 'keywords' column
def extract_keywords(keywords_str):
    try:
        keywords_list = literal_eval(keywords_str)
        return [keyword['name'] for keyword in keywords_list]
    except:
        return []

In [38]:
# Apply the extraction functions
merged_data['genres'] = merged_data['genres'].apply(extract_genres)
merged_data['keywords'] = merged_data['keywords'].apply(extract_keywords)

# Display the first few rows of the merged data
merged_data_head = merged_data.head()
merged_data_head

Unnamed: 0,id,title,genres,overview,vote_average,keywords
0,862.0,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",7.7,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844.0,Jumanji,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,6.9,"[board game, disappearance, based on children'..."
2,15602.0,Grumpier Old Men,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,6.5,"[fishing, best friend, duringcreditsstinger, o..."
3,31357.0,Waiting to Exhale,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",6.1,"[based on novel, interracial relationship, sin..."
4,11862.0,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...,5.7,"[baby, midlife crisis, confidence, aging, daug..."


In [62]:
# Selecting three "query" movies
query_movies = ['Toy Story', 'Jumanji', 'Father of the Bride Part II']

# Filtering the dataset to retain only the necessary columns and rows
filtered_data = merged_data[['title', 'genres', 'keywords']].dropna()

# Converting genres and keywords lists into strings for TF-IDF vectorization
filtered_data['genres'] = filtered_data['genres'].apply(lambda x: ' '.join(x))
filtered_data['keywords'] = filtered_data['keywords'].apply(lambda x: ' '.join(x))

# Combining genres and keywords into a single string for vectorization
filtered_data['combined_features'] = filtered_data['genres'] + ' ' + filtered_data['keywords']

In [64]:
# TF-IDF vectorization of the combined features
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_data['combined_features'])

# Finding indices of the query movies in the filtered dataset
query_indices = filtered_data[filtered_data['title'].isin(query_movies)].index

In [66]:
# Calculating cosine similarity for each query movie
similarity_results = {}
for idx in query_indices:
    cosine_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
    similar_indices = cosine_sim.argsort()[-11:-1][::-1]  # Top 10 similar movies (excluding the movie itself)
    similar_movies = filtered_data.iloc[similar_indices]['title'].tolist()
    similarity_results[filtered_data.iloc[idx]['title']] = similar_movies

# Displaying the top 10 similar movies for each query
similarity_results

{'Toy Story': ['Toy Story That Time Forgot',
  'Small Soldiers',
  'Barbie and the Three Musketeers',
  'Toy Story 3',
  'Dolls',
  "Child's Play",
  "Child's Play 2",
  'The Transformers: The Movie',
  'The Indian in the Cupboard',
  'Ted'],
 'Jumanji': ['The Games Maker',
  'Mostly Ghostly',
  'Stung',
  'Middle School: The Worst Years of My Life',
  'Karlsson on the Roof',
  'Mostly Ghostly: Have You Met My Ghoulfriend?',
  'Where the Wild Things Are',
  'In the Name of the King III',
  'Clue',
  'Mostly Ghostly 3: One Night in Doom House'],
 'Father of the Bride Part II': ['Blueberry Hill',
  'Wedding Doll',
  'In Good Company',
  'Equinox Flower',
  'Sherrybaby',
  'On Golden Pond',
  'Julia Misbehaves',
  'My Only Sunshine',
  'Gas Food Lodging',
  'Peppermint Soda']}