# NLP Section - Movies recommendation system (Capstone project)

### Responsible team member: Rene Ortiz

In [None]:
#!pip install ydata-profiling

In [None]:
import pandas as pd
import numpy as np

#EDA Profiling library
#from ydata_profiling import ProfileReport

In [None]:
# Load the movie dataset from Google Drive using pandas.
# This cell relies on the google.colab package, so it is meant to run inside a Colab runtime.

from google.colab import drive
drive.mount('/content/drive')

# Read the raw movies CSV into `df`; the model sections below all start from this frame.
df = pd.read_csv('/content/drive/MyDrive/Capstone_Project/movies.csv')

# Preview the first three rows.
df.head(3)

In [None]:
df.dtypes

In [None]:
# I created this function after the initial training runs, once I realized that using the raw JSON string is not a good strategy: the JSON needs to be parsed for better results
import ast

# clean function for genre and keyword fields
def extract_names(json_str):
    """Return the ``name`` values of a serialized list of dicts, space-joined.

    The genres/keywords columns hold strings such as
    ``'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}]'``.
    The text is parsed as JSON first (so JSON-only literals like ``null`` or
    ``true`` are understood) and, if that fails, as a Python literal.

    Args:
        json_str: The serialized list. Non-string values (``None``, NaN, ...)
            are treated as "no data".

    Returns:
        The names joined with single spaces, or ``""`` when the input is not
        a string, cannot be parsed, or is not a list of dicts.
    """
    import json  # local import so the cell does not depend on another import cell

    if not isinstance(json_str, str):
        return ""

    try:
        items = json.loads(json_str)
    except ValueError:
        # Not valid JSON (e.g. single-quoted Python repr) -> try a Python literal.
        try:
            items = ast.literal_eval(json_str)
        except (ValueError, SyntaxError, TypeError):
            return ""

    # A bare number/string/dict parses fine but is not the expected list.
    if not isinstance(items, (list, tuple)):
        return ""

    # Skip entries that are not dicts or whose name is missing / not text,
    # instead of letting them raise inside the join.
    return " ".join(
        item['name']
        for item in items
        if isinstance(item, dict) and isinstance(item.get('name'), str)
    )



In [None]:
# NOTE(review): in the cells visible above, `ProfileReport` is only imported in the
# commented-out line `#from ydata_profiling import ProfileReport` (and the matching
# `pip install` is commented out too), so this cell raises NameError unless those
# lines are re-enabled first.
profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)
# Show the generated report inside the notebook.
profile.to_notebook_iframe()

## *Recommendation Models Section :  TF-IDF , BERT and LSTM*

# Term Frequency-Inverse Document Frequency (TF-IDF) recommendation system, using the following logic:

- Combining features like genres, keywords, and overview text into a single string for each movie.

- Converting text into vectors using these techniques: TF-IDF (Term Frequency-Inverse Document Frequency) and CountVectorizer.

- Calculating similarity between movies using cosine similarity.

- Returning the top-N most similar movies for a given input movie.

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Download NLTK's stop-word corpus, then keep the English list as a set;
# clean_text() tests each token for membership in it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [None]:
# Keep only the columns the models use and drop rows with any missing value.
df_clean = df[['title', 'overview', 'genres', 'keywords', 'popularity', 'release_date']].dropna()

# Turn the JSON strings in genres and keywords into space-separated names
df_clean['genres'] = df_clean['genres'].apply(extract_names)
df_clean['keywords'] = df_clean['keywords'].apply(extract_names)

# Text-only frame used by the TF-IDF and clustering cells. It is an explicit,
# independent copy: later cells add columns to df_text, and doing that (or an
# in-place dropna) on a column slice of df_clean triggers pandas'
# SettingWithCopyWarning and is not guaranteed to write where intended.
df_text = df_clean[['title', 'overview', 'genres', 'keywords']].dropna().copy()


In [None]:
df_text.head()

In [None]:
# Combine text into a single feature
def combine_features(row):
    """Join a row's overview, genres and keywords into one space-separated string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``'overview'``, ``'genres'`` and ``'keywords'``.

    Returns:
        The three values formatted as text, separated by single spaces.
    """
    parts = (row['overview'], row['genres'], row['keywords'])
    return "{} {} {}".format(*parts)

df_text['combined_text'] = df_text.apply(combine_features, axis=1)

In [None]:
df_text.head()

In [None]:
df_text['combined_text'][0]

In [None]:
# Remove stop words and Word Cloud for 5 movies (reference purposes only)
def clean_text(text, stopword_set=None):
    """Lower-case ``text`` and keep only alphabetic, non-stop-word tokens.

    Tokens are split on whitespace and stripped of leading/trailing ASCII
    punctuation before being tested, so a word such as ``"moon."`` or
    ``"hero,"`` is kept as ``"moon"`` / ``"hero"`` instead of being discarded
    because of the attached punctuation. Tokens that still contain
    non-letters (digits, inner hyphens, ...) are dropped.

    Args:
        text: The string to clean.
        stopword_set: Optional collection of words to remove. Defaults to the
            notebook-level ``stop_words`` set.

    Returns:
        The surviving tokens joined with single spaces.
    """
    import string  # local import: only needed for the punctuation table

    if stopword_set is None:
        stopword_set = stop_words

    kept = []
    for raw_token in text.lower().split():
        word = raw_token.strip(string.punctuation)
        if word.isalpha() and word not in stopword_set:
            kept.append(word)
    return " ".join(kept)

df_text['clean_text'] = df_text['combined_text'].apply(clean_text)

# Generate a word cloud for (up to) the first 5 movies
for i in range(min(5, len(df_text))):
    movie_text = df_text['clean_text'].iloc[i]
    if not movie_text.strip():
        # Nothing left after cleaning; WordCloud cannot draw an empty text.
        continue
    wc = WordCloud(width=600, height=400, background_color='white').generate(movie_text)
    plt.figure(figsize=(6, 4))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    # Take the title from df_text, the same frame (and row position) the text
    # came from. `df` still contains the rows removed by dropna(), so its i-th
    # row is not necessarily the same movie.
    plt.title(df_text['title'].iloc[i])
    plt.show()

# Converting the cleaned text to vectors using TF-IDF

In [None]:
# Turn each movie's cleaned text into a TF-IDF vector; max_features=5000 caps the vocabulary size.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df_text['clean_text'])

In [None]:
tfidf_matrix.indices

In [None]:
df_clean.head()

In [None]:
# Replace the release_date values with the release year (unparseable dates become NaN).
# NOTE(review): release_date is overwritten in place, so re-running this cell feeds
# year numbers (not date strings) back into pd.to_datetime — rebuild df_clean first.
df_clean['release_date'] = pd.to_datetime(df_clean['release_date'], errors='coerce').dt.year
# Integer year column; missing years are stored as 0.
df_clean['release_year'] = df_clean['release_date'].fillna(0).astype(int)
# Coerce popularity to a number; values that cannot be converted become 0.
df_clean['popularity'] = pd.to_numeric(df_clean['popularity'], errors='coerce').fillna(0)
# metadata: release year and popularity
metadata = df_clean[['release_year', 'popularity']].fillna(0)

In [None]:
metadata.head()

In [None]:
# normalize the metadata
scaler = MinMaxScaler()
normalized_metadata = scaler.fit_transform(metadata[['release_year', 'popularity']])

In [None]:
# Convert the sparse TF-IDF matrix to a dense array so it can be stacked with the metadata.
# NOTE(review): the original note says this step needs a GPU; toarray() is presumably a
# CPU/memory-bound densification — confirm where the time is actually spent.
tfidf_dense = tfidf_matrix.toarray()

In [None]:
# stack features
hybrid_features_tfidf = np.hstack([tfidf_dense, normalized_metadata])

In [None]:
# Compute Cosine Similarity
#cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim = cosine_similarity(hybrid_features_tfidf)

In [None]:
cosine_sim

In [None]:
# Similarity matrix for the hybrid TF-IDF + metadata model.
# `cosine_sim`, computed in the previous cell, is exactly
# cosine_similarity(hybrid_features_tfidf); reuse it rather than computing and
# holding a second identical N x N matrix in memory.
similarity_matrix = cosine_sim

# Functions to get recommendations, similarity scores and genres, and to query the TMDB API (posters and cast)

In [None]:
import requests
from IPython.display import Image, display

# API key
# SECURITY: the fallback key below is committed in the notebook. Prefer supplying
# the key through the TMDB_API_KEY environment variable and rotating the
# committed one.
import os
api_key = os.environ.get("TMDB_API_KEY", "b400409e22d456acb002b98fa90b2c2d") # I got this key by registering on TMDB website

# get poster URL from TMDb
def get_poster_url(movie_title):
    """Look up a movie on TMDB and return the URL of its poster image.

    Args:
        movie_title: Title to search for. It is sent as a query parameter, so
            characters such as ``&``, ``#`` or spaces are encoded correctly.

    Returns:
        The ``w300`` poster URL of the first search result, or ``None`` when
        there is no result, the result has no poster, or the request fails
        (in which case the error is printed).
    """
    try:
        response = requests.get(
            "https://api.themoviedb.org/3/search/movie",
            params={"api_key": api_key, "query": movie_title},
            timeout=10,  # never hang the notebook on a stalled connection
        )
        response.raise_for_status()
        results = response.json().get("results") or []
        if results and results[0].get("poster_path"):
            poster_path = results[0]["poster_path"]
            return f"https://image.tmdb.org/t/p/w300{poster_path}"
    except Exception as e:
        # Best effort: a missing poster must not abort the recommendation output.
        print(f"Error fetching poster for {movie_title}: {e}")
    return None

In [None]:
# Recommendation function with poster display
def recommend_movies(title, top_n=5):
    """Print and return the ``top_n`` movies most similar to ``title``.

    Similarities are read from the notebook-level ``similarity_matrix``, whose
    rows follow the row order of ``df_clean``.

    Args:
        title: Movie title to look up (case-insensitive exact match). When
            several rows match, the first one is used.
        top_n: Number of recommendations to return.

    Returns:
        A DataFrame with title, genres, keywords, overview and
        ``similarity_score``, sorted by score (highest first), or ``None``
        when the title is not found.
    """
    # Row *positions* of the matching titles. df_clean keeps the original index
    # labels after dropna(), so its labels must not be used to index the rows
    # of similarity_matrix.
    matches = np.flatnonzero((df_clean['title'].str.lower() == title.lower()).to_numpy())
    if matches.size == 0:
        print("Movie not found.")
        return

    query_pos = matches[0]
    scores = np.asarray(similarity_matrix[query_pos])

    # Stable descending order; the query movie is removed explicitly rather
    # than assumed to be the first entry (ties/duplicates can outrank it).
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != query_pos][:max(top_n, 0)]

    recommendations = df_clean[['title', 'genres', 'keywords', 'overview']].iloc[ranked].copy()
    recommendations['similarity_score'] = scores[ranked]

    # Display posters and details
    for _, row in recommendations.iterrows():
        rec_title = row['title']
        poster_url = get_poster_url(rec_title)
        print(f"\n {rec_title} (Similarity Score: {row['similarity_score']:.3f})")
        print(f"Genres: {row['genres']}")
        print(f"Keywords: {row['keywords']}")
        if poster_url:
            display(Image(url=poster_url))
        else:
            print("Poster not found.")

    return recommendations.sort_values(by='similarity_score', ascending=False)

In [None]:
def explain_recommendation(input_title, recommended_df):
    """List the genre/keyword terms each recommendation shares with the input movie.

    Args:
        input_title: Title of the movie the recommendations were made for
            (case-insensitive exact match against ``df_clean``).
        recommended_df: DataFrame with ``title``, ``genres``, ``keywords`` and
            ``similarity_score`` columns, or ``None``.

    Returns:
        A DataFrame with the columns ``title``, ``similarity_score``,
        ``shared_genres`` and ``shared_keywords``. It is empty when the input
        title is not found or ``recommended_df`` is ``None``.
    """
    columns = ['title', 'similarity_score', 'shared_genres', 'shared_keywords']

    matches = df_clean[df_clean['title'].str.lower() == input_title.lower()]
    if recommended_df is None or matches.empty:
        return pd.DataFrame(columns=columns)

    def _terms(text):
        # extract_names() joins names with spaces, so splitting on commas only
        # would yield the whole field as a single term. Accept both separators.
        return set(str(text).replace(',', ' ').split())

    input_row = matches.iloc[0]
    input_genres = _terms(input_row['genres'])
    input_keywords = _terms(input_row['keywords'])

    explanations = []
    for _, row in recommended_df.iterrows():
        common_genres = input_genres & _terms(row['genres'])
        common_keywords = input_keywords & _terms(row['keywords'])
        explanations.append({
            'title': row['title'],
            'similarity_score': row['similarity_score'],
            # sorted() makes the displayed order deterministic
            'shared_genres': ', '.join(sorted(common_genres)),
            'shared_keywords': ', '.join(sorted(common_keywords)),
        })

    return pd.DataFrame(explanations, columns=columns)


In [None]:
recs = recommend_movies("Superman", top_n=5)
explanations = explain_recommendation("Superman", recs)
display(explanations)

# Movie Clustering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Re-fit TF-IDF for clustering, this time on the uncleaned combined text with
# scikit-learn's built-in English stop-word list. This rebinds `vectorizer`.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df_text['combined_text'])

In [None]:
num_clusters = 5  # We can try less or more depending how we want to present this on the project
# Fit K-Means on the TF-IDF matrix and store each movie's cluster label in df_text.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df_text['cluster'] = kmeans.fit_predict(X)

In [None]:
# Reduce the (densified) TF-IDF matrix to two principal components for plotting.
pca = PCA(n_components=2, random_state=42)
reduced = pca.fit_transform(X.toarray())

# Keep the 2-D coordinates next to each movie.
df_text['pca1'] = reduced[:, 0]
df_text['pca2'] = reduced[:, 1]

In [None]:
# Scatter plot of the movies in PCA space, coloured by K-Means cluster.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x="pca1", y="pca2", hue="cluster", palette="tab10", data=df_text, s=60, alpha=0.7
)
# For reference, label one randomly sampled movie per cluster (fixed seed)
sample_titles = df_text.groupby('cluster').apply(lambda x: x.sample(1, random_state=42))
for _, row in sample_titles.iterrows():
    plt.text(row['pca1'], row['pca2'], row['title'], fontsize=9)
plt.title("Movie Clusters Based on Content (TF-IDF + KMeans + PCA)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend(title="Cluster")
plt.grid(True)
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary of the TF-IDF vectorizer fitted for clustering
terms = vectorizer.get_feature_names_out()
# For every cluster centre, the feature indices ordered from largest to smallest weight
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

# Print the 10 highest-weighted terms of each cluster centre
print("\nTop terms per cluster:")
for i in range(num_clusters):
    print(f"\nCluster {i}:")
    for j in range(10):
        print(f"  {terms[order_centroids[i, j]]}")

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE embedding of the (densified) TF-IDF matrix.
# NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` argument to
# `max_iter` — check this call against the installed version.
tsne = TSNE(n_components=2, perplexity=50, n_iter=300, random_state=42)
X_embedded = tsne.fit_transform(X.toarray())

# Store both t-SNE coordinates on df_text
df_text['tsne1'], df_text['tsne2'] = X_embedded[:,0], X_embedded[:,1]

# Plot with t-SNE, coloured by the K-Means cluster labels
plt.figure(figsize=(10,6))
sns.scatterplot(x='tsne1', y='tsne2', hue='cluster', data=df_text, palette='tab10', alpha=0.7)
plt.title("Movie Clusters Based on Content (TF-IDF + KMeans + t-SNE)")
plt.grid(True)
plt.show()


# Hybrid Approach - BERT

This next section will combine multiple text and numeric features:

- Textual features (overview, keywords)

- Metadata (genre, release year, cast)

- Ratings / popularity scores

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings("ignore")

In [None]:
import torch
print(torch.cuda.is_available())

In [None]:
# Keep only the columns the models use and drop rows with any missing value.
# The original index labels are kept on purpose (no reset_index), so df_clean
# rows can still be matched to df rows by label.
df_clean = df[['title', 'overview', 'genres', 'keywords', 'popularity', 'release_date']].dropna()

# Turn the JSON strings in genres and keywords into space-separated names
df_clean['genres'] = df_clean['genres'].apply(extract_names)
df_clean['keywords'] = df_clean['keywords'].apply(extract_names)

# Text-only frame. It is an explicit, independent copy: an in-place dropna or
# column assignment on a column slice of df_clean triggers pandas'
# SettingWithCopyWarning and is not guaranteed to write where intended.
df_text = df_clean[['title', 'overview', 'genres', 'keywords']].dropna().copy()


In [None]:
df_clean.head()

In [None]:
# Text fields for BERT input. Build the text from df_clean, whose genres and
# keywords were already converted to plain names; df still holds the raw JSON
# strings (ids, braces, quotes), which would otherwise be fed to the encoder.
df_clean['combined_text'] = df_clean['overview'] + " " + df_clean['genres'] + " " + df_clean['keywords']

# release date to year (unparseable dates become NaN, then 0 in release_year)
df_clean['release_date'] = pd.to_datetime(df_clean['release_date'], errors='coerce').dt.year
df_clean['release_year'] = df_clean['release_date'].fillna(0).astype(int)
df_clean['popularity'] = pd.to_numeric(df_clean['popularity'], errors='coerce').fillna(0)
# metadata: release year and popularity
metadata = df_clean[['release_year', 'popularity']].fillna(0)

In [None]:
# Load the sentence-embedding model on the GPU when one is available and fall
# back to the CPU otherwise (a hard-coded 'cuda' fails on CPU-only runtimes).
import torch
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
# Due to previous errors, replace NaN or non-string values with an empty string
df_clean['combined_text'] = df_clean['combined_text'].fillna('').astype(str)
# Encode the combined text of every df_clean row, in df_clean row order
bert_embeddings = model.encode(df_clean['combined_text'].tolist(), show_progress_bar=True)

In [None]:
# Numerical movie metadata: `metadata` (release_year, popularity) was built in an earlier cell.
#metadata = df_clean[['release_date', 'popularity']].fillna(0)

# Scale each metadata column to the [0, 1] range
scaler = MinMaxScaler()
normalized_metadata = scaler.fit_transform(metadata)

# BERT + Metadata: append the scaled metadata columns to the text embeddings
hybrid_features = np.hstack([bert_embeddings, normalized_metadata])

In [None]:
# Compute the pairwise cosine similarity between all hybrid (BERT + metadata) feature vectors.
# This rebinds `similarity_matrix`, which recommend_movies() reads.
similarity_matrix = cosine_similarity(hybrid_features)

In [None]:
# updated recommendation movies w/ similarity scores, genre and keywords
def recommend_movies(title, top_n=10):
    """Print and return the ``top_n`` movies most similar to ``title``.

    Similarities are read from the notebook-level ``similarity_matrix``, whose
    rows follow the row order of ``df_clean``. For every recommendation the
    title, year, score, genres, keywords and (when found) poster are shown.

    Args:
        title: Movie title to look up (case-insensitive exact match). When
            several rows match, the first one is used.
        top_n: Number of recommendations to return.

    Returns:
        A DataFrame with title, release_year, genres, keywords, overview and
        ``similarity_score``, sorted by score (highest first), or ``None``
        when the title is not found.
    """
    # Row *positions* of the matching titles. df_clean keeps the original index
    # labels after dropna(), so its labels must not be used to index the rows
    # of similarity_matrix.
    matches = np.flatnonzero((df_clean['title'].str.lower() == title.lower()).to_numpy())
    if matches.size == 0:
        print("Movie not found.")
        return

    query_pos = matches[0]
    scores = np.asarray(similarity_matrix[query_pos])

    # Stable descending order; the query movie is removed explicitly rather
    # than assumed to be the first entry (ties/duplicates can outrank it).
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != query_pos][:max(top_n, 0)]

    recommendations = df_clean[['title', 'release_year', 'genres', 'keywords', 'overview']].iloc[ranked].copy()
    recommendations['similarity_score'] = scores[ranked]

    for _, row in recommendations.iterrows():
        movie_title = row['title']
        release_year = row['release_year']
        print(f"\n {movie_title} ({release_year}) — Similarity Score: {row['similarity_score']:.3f}")
        print(f"Genres: {row['genres']}")
        print(f"Keywords: {row['keywords']}")
        poster_url = get_poster_url(movie_title)
        if poster_url:
            display(Image(url=poster_url))
        else:
            print("Poster not found.")
        print("-" * 60)

    return recommendations.sort_values(by='similarity_score', ascending=False)

In [None]:
import requests
from IPython.display import Image, display

# API key
# SECURITY: the fallback key below is committed in the notebook. Prefer supplying
# the key through the TMDB_API_KEY environment variable and rotating the
# committed one.
import os
api_key = os.environ.get("TMDB_API_KEY", "b400409e22d456acb002b98fa90b2c2d") # I got this key by registering on TMDB website

# get poster URL from TMDb
def get_poster_url(movie_title):
    """Look up a movie on TMDB and return the URL of its poster image.

    Args:
        movie_title: Title to search for. It is sent as a query parameter, so
            characters such as ``&``, ``#`` or spaces are encoded correctly.

    Returns:
        The ``w300`` poster URL of the first search result, or ``None`` when
        there is no result, the result has no poster, or the request fails
        (in which case the error is printed).
    """
    try:
        response = requests.get(
            "https://api.themoviedb.org/3/search/movie",
            params={"api_key": api_key, "query": movie_title},
            timeout=10,  # never hang the notebook on a stalled connection
        )
        response.raise_for_status()
        results = response.json().get("results") or []
        if results and results[0].get("poster_path"):
            poster_path = results[0]["poster_path"]
            return f"https://image.tmdb.org/t/p/w300{poster_path}"
    except Exception as e:
        # Best effort: a missing poster must not abort the recommendation output.
        print(f"Error fetching poster for {movie_title}: {e}")
    return None

In [None]:
recommend_movies("Toy Story", top_n=5)

# K-means clustering for the BERT recommendation system

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [None]:
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# bert_embeddings has one row per df_clean row, and df_clean can be shorter than
# df (rows with missing values were dropped). Assigning the raw label array to df
# would then fail with a length mismatch, so align the labels on df_clean's index;
# df rows that were dropped get NaN.
df['cluster'] = pd.Series(kmeans.fit_predict(bert_embeddings), index=df_clean.index)

In [None]:
# Project the BERT embeddings onto two principal components for plotting.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(bert_embeddings)
# pca_result has one row per df_clean row, which can be fewer than df's rows;
# align on df_clean's index so the coordinates land on the right movies
# (df rows that were dropped from df_clean get NaN).
df['pca1'] = pd.Series(pca_result[:, 0], index=df_clean.index)
df['pca2'] = pd.Series(pca_result[:, 1], index=df_clean.index)

In [None]:
# Scatter plot of the movies in PCA space, coloured by K-Means cluster.
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x='pca1', y='pca2', hue='cluster', palette='tab10', alpha=0.6)

# Add labels for a few example movies (1 randomly sampled per cluster, fixed seed)
sample_titles = df.groupby('cluster').apply(lambda x: x.sample(1, random_state=11)).reset_index(drop=True)
for _, row in sample_titles.iterrows():
    plt.text(row['pca1'], row['pca2'], row['title'], fontsize=9)

plt.title("BERT-based Movie Clusters")
plt.legend(title='Cluster')
plt.show()

In [None]:
print("\nTop representative movies per BERT-based cluster:")
for i in range(n_clusters):
    print(f"\nCluster {i}:")

    # index labels (in df) of the movies assigned to this cluster
    cluster_indices = df[df['cluster'] == i].index

    # bert_embeddings is ordered like df_clean, so df labels must be translated
    # into df_clean row positions before indexing the array. Labels that are
    # not in df_clean (position -1) have no embedding and are skipped.
    positions = df_clean.index.get_indexer(cluster_indices)
    has_embedding = positions >= 0
    cluster_indices = cluster_indices[has_embedding]
    positions = positions[has_embedding]
    if len(positions) == 0:
        continue

    # get the centroid of the cluster
    centroid = kmeans.cluster_centers_[i].reshape(1, -1)

    # compute cosine similarity to the centroid
    cluster_embeddings = bert_embeddings[positions]
    sims = cosine_similarity(cluster_embeddings, centroid).flatten()

    # get top 5 most representative movies (highest similarity first)
    top_indices = cluster_indices[np.argsort(sims)[-5:][::-1]]
    for idx in top_indices:
        print(f"  {df.loc[idx, 'title']} - {df.loc[idx, 'genres']}")


# Unsupervised LSTM Model (recommendation system).

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load dataframe (df) from google drive running the top cells

# clean and combine text features as with TF-IDF and BERT
df_clean = df[['title', 'overview', 'genres', 'keywords', 'popularity', 'release_date']].dropna()

# Combine the text fields into one string per movie. genres/keywords still hold
# raw JSON at this point, so reduce them to plain names for the text. The
# columns themselves are left untouched here because a later cell converts
# them in place.
df_clean['combined_text'] = (
    df_clean['title'] + " " + df_clean['overview'] + " "
    + df_clean['genres'].apply(extract_names) + " "
    + df_clean['keywords'].apply(extract_names)
)

# normalize popularity and release date (release_date becomes the year, 0 when unparseable)
df_clean['release_date'] = pd.to_datetime(df_clean['release_date'], errors='coerce').dt.year.fillna(0).astype(int)
scaler = MinMaxScaler()
df_clean[['popularity', 'release_date']] = scaler.fit_transform(df_clean[['popularity', 'release_date']])



In [None]:
# example of combined_text for reference purposes
df_clean['combined_text'][1]

In [None]:
df_clean.head()

In [None]:
# Convert the JSON strings in genres and keywords to space-separated names
df_clean['genres'] = df_clean['genres'].apply(extract_names)
df_clean['keywords'] = df_clean['keywords'].apply(extract_names)

In [None]:
df_clean.head()

In [None]:
# Text preprocessing and tokenization: vocabulary limited via num_words=10000,
# words outside it are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df_clean['combined_text'])

# Convert each text to a sequence of word ids, then pad/truncate at the end to exactly 100 ids
sequences = tokenizer.texts_to_sequences(df_clean['combined_text'])
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

# metadata as additional input (both columns were min-max scaled in an earlier cell)
metadata_features = df_clean[['popularity', 'release_date']].values

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate

# Inputs for the model: 100 word ids per movie plus the 2 metadata values
text_input = Input(shape=(100,), name="text_input")
meta_input = Input(shape=(2,), name="meta_input")

# LSTM on text: 64-dimensional word embeddings fed to a 64-unit LSTM.
# input_dim matches the tokenizer's num_words (10000).
embedding = Embedding(input_dim=10000, output_dim=64, input_length=100)(text_input)
lstm_out = LSTM(64)(embedding)

# combine LSTM and metadata
merged = Concatenate()([lstm_out, meta_input])
dense = Dense(64, activation='relu')(merged)
output = Dense(32, activation='relu')(dense)  # This becomes the embedding vector for recommendations

# define model (trained with mean squared error against 32-dimensional targets)
model = Model(inputs=[text_input, meta_input], outputs=output)
model.compile(optimizer='adam', loss='mse')
model.summary()

In [None]:
# dummy output to learn identity (FYI each movie vector is like a label)
X_text = padded_sequences
X_meta = metadata_features

# Random targets for training embeddings: one 32-dimensional uniform random vector per movie.
# NOTE(review): these targets are unseeded noise unrelated to the inputs, so the run is
# not reproducible and it is unclear that the learned vectors reflect content
# similarity — confirm the intended training objective.
y = np.random.rand(len(df_clean), 32)

# train the model
model.fit([X_text, X_meta], y, epochs=100, batch_size=32)

In [None]:
# Get learned embeddings for all movies
movie_embeddings = model.predict([X_text, X_meta])

from sklearn.metrics.pairwise import cosine_similarity

def recommend_lstm(movie_title, top_n=5):
    """Return the ``top_n`` movies whose LSTM embeddings are closest to ``movie_title``.

    Uses the notebook-level ``movie_embeddings`` array, whose rows follow the
    row order of ``df_clean``.

    Args:
        movie_title: Movie title to look up (case-insensitive exact match).
            When several rows match, the first one is used.
        top_n: Number of recommendations to return.

    Returns:
        A DataFrame with title, genres, keywords, overview and
        ``similarity_score``, sorted by score (highest first), or ``None``
        when the title is not found.
    """
    # Row *positions* of the matching titles. df_clean keeps the original index
    # labels after dropna(), so its labels must not be used to index the rows
    # of movie_embeddings.
    matches = np.flatnonzero((df_clean['title'].str.lower() == movie_title.lower()).to_numpy())
    if matches.size == 0:
        print("Movie not found.")
        return

    query_pos = matches[0]
    sim_scores = cosine_similarity([movie_embeddings[query_pos]], movie_embeddings)[0]

    # Descending order; the query movie is removed explicitly rather than
    # assumed to be the first entry (identical embeddings can outrank it).
    ranked = np.argsort(-sim_scores, kind='stable')
    top_indices = ranked[ranked != query_pos][:max(top_n, 0)]

    recommendations = df_clean.iloc[top_indices][['title', 'genres', 'keywords', 'overview']].copy()
    recommendations['similarity_score'] = sim_scores[top_indices]

    return recommendations.sort_values(by='similarity_score', ascending=False)

In [None]:
recommend_lstm("Superman")

# GLOVE + LSTM Model w/ additional numeric features (Supervised training)

## First, load `df` by running the data-loading cell at the top of the notebook (Google Drive)

In [None]:
import re
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Combine the text fields. genres/keywords hold JSON strings in df, so reduce
# them to plain names with extract_names (it returns "" for missing or
# unparseable values); otherwise tokens such as "id", "name" and the numeric
# ids end up in the vocabulary.
df['combined_text'] = df['overview'].fillna('') + " " + \
                      df['genres'].apply(extract_names) + " " + \
                      df['keywords'].apply(extract_names)

In [None]:
# function to clean text
def clean_text(text):
    """Remove every character that is not a letter, digit or whitespace, then lower-case.

    Args:
        text: The string to normalize.

    Returns:
        The filtered, lower-cased string; whitespace is left as it was.
    """
    return re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()

#apply to the combined text
df['combined_text'] = df['combined_text'].apply(clean_text)

In [None]:
# Tokenize the text: vocabulary limited via num_words=10000
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['combined_text'])
# Convert each text to word ids and pad/truncate to 300 ids (default padding and truncation sides)
sequences = tokenizer.texts_to_sequences(df['combined_text'])
X_text = pad_sequences(sequences, maxlen=300)

In [None]:
# Normalize all numerical features.
# Release year as an integer; unparseable or missing dates become 0.
df['release_year'] = pd.to_datetime(df['release_date'], errors='coerce').dt.year.fillna(0).astype(int)
# Missing numeric values are replaced by 0 (this updates df in place).
df[['popularity', 'vote_average', 'vote_count', 'runtime']] = df[['popularity', 'vote_average', 'vote_count', 'runtime']].fillna(0)
numerical = df[['release_year', 'popularity', 'vote_average', 'vote_count', 'runtime']]
# Scale every numeric column to the [0, 1] range
scaler = MinMaxScaler()
X_num = scaler.fit_transform(numerical)

In [None]:
# Language encoding: each language string becomes an integer code; missing values count as 'unknown'.
# NOTE(review): the integer codes are appended unscaled next to the [0, 1]-scaled
# numeric features and imply an ordering between languages — confirm this is intended.
df['original_language'] = df['original_language'].fillna('unknown')
le = LabelEncoder()
X_lang = le.fit_transform(df['original_language']).reshape(-1, 1)

# numerical + language, side by side (one row per movie)
X_meta = np.hstack((X_num, X_lang))

# model inputs: [text sequences, metadata]
X_final = [X_text, X_meta]

In [None]:
X_final

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Similarity scores based on vote average (one value per movie, as a column vector).
# NOTE(review): `y_similarity` is not used by any later cell visible here, and with a
# single feature per row the cosine similarity presumably only reflects sign/zero,
# not how close two ratings are — confirm whether this N x N matrix is needed.
vote_scores = df['vote_average'].values.reshape(-1, 1)
y_similarity = cosine_similarity(vote_scores)

In [None]:
# download GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d /content/

In [None]:
# Load the GloVe vectors: each line is a word followed by its coefficients.
embedding_index = {}
with open('/content/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Build the embedding matrix for the tokenizer vocabulary: row i holds the GloVe
# vector of the word with id i; words without a GloVe vector keep an all-zero row.
embedding_dim = 100
word_index = tokenizer.word_index
embedding_matrix = np.zeros((10000, embedding_dim))

for word, i in word_index.items():
    # the matrix only has 10000 rows, so ids beyond that are skipped
    if i >= 10000:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Dropout

# Text input: 300 word ids per movie, embedded with the GloVe matrix
# (trainable=False keeps those weights fixed), then a 64-unit LSTM.
text_input = Input(shape=(300,))
embed = Embedding(input_dim=10000, output_dim=100, weights=[embedding_matrix], input_length=300, trainable=False)(text_input)
lstm_out = LSTM(64)(embed)

# Metadata input: one value per column of X_meta, passed through a 32-unit dense layer
meta_input = Input(shape=(X_meta.shape[1],))
meta_dense = Dense(32, activation='relu')(meta_input)

# Combine both branches, apply dropout, and end in a single sigmoid unit
combined = Concatenate()([lstm_out, meta_dense])
combined = Dropout(0.3)(combined)
output = Dense(1, activation='sigmoid')(combined)

# Binary classifier trained with binary cross-entropy
model = Model(inputs=[text_input, meta_input], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Sample dummy labels: 1 when a movie's vote_average is within 0.5 of the mean vote_average, else 0
labels = np.where(abs(df['vote_average'].values - df['vote_average'].values.mean()) < 0.5, 1, 0)

# Train on the [text, metadata] inputs, holding out 20% of the samples for validation
model.fit(X_final, labels, epochs=50, batch_size=32, validation_split=0.2)

In [None]:
from keras.models import Model
from sklearn.metrics.pairwise import cosine_similarity

# Extract features from the penultimate layer: a second model that shares the
# trained model's inputs and outputs the activations of the layer at index -2.
feature_extractor = Model(inputs=model.input, outputs=model.get_layer(index=-2).output)

# feature embeddings for all movies
movie_embeddings = feature_extractor.predict(X_final, batch_size=32)

In [None]:
# compute pairwise cosine similarity between all movies
similarity_matrix = cosine_similarity(movie_embeddings)

In [None]:
def recommend_movies_lstm(title, top_n=5):
    """Print the ``top_n`` movies most similar to ``title``.

    Similarities are read from the notebook-level ``similarity_matrix``, whose
    rows follow the row order of ``df``.

    Args:
        title: Movie title to look up (case-insensitive exact match). When
            several rows match, the first one is used.
        top_n: Number of similar movies to print.

    Returns:
        None. The results are printed; "Movie not found." is printed when the
        title does not exist.
    """
    # Row position of the movie (positions, not index labels, address the matrix)
    matches = np.flatnonzero((df['title'].str.lower() == title.lower()).to_numpy())
    if matches.size == 0:
        print("Movie not found.")
        return
    query_pos = matches[0]

    # Similarity scores for the movie, highest first. The movie itself is
    # removed explicitly rather than assumed to be the first entry, since
    # other movies can tie with it.
    scores = np.asarray(similarity_matrix[query_pos])
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != query_pos][:max(top_n, 0)]

    print(f"\nTop {top_n} similar movies to: {df.iloc[query_pos]['title']}")
    for pos in ranked:
        print(f"{df.iloc[pos]['title']} — Similarity Score: {scores[pos]:.3f}")

In [None]:
recommend_movies_lstm("Transformers", top_n=5)

In [None]:
# Clustering
from sklearn.cluster import KMeans

# cluster into 5 groups
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(movie_embeddings)

In [None]:
# Up to 5 movies per cluster for inspection
for i in range(num_clusters):
    print(f"\nCluster {i}:")
    cluster_movies = df[df['cluster'] == i]
    # sample() raises when asked for more rows than exist, so cap at the cluster size
    sample_movies = cluster_movies.sample(min(5, len(cluster_movies)), random_state=42)
    for title in sample_movies['title']:
        print(f"  - {title}")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# For every cluster, print the 5 movies whose embeddings are most similar to the cluster centroid.
print("\nTop representative movies per LSTM-based cluster:")
for i in range(num_clusters):
    print(f"\nCluster {i}:")

    # Get indices of items in this cluster
    # (df index labels; they are also used as row positions into movie_embeddings below)
    cluster_indices = df[df['cluster'] == i].index

    # Get the centroid of the cluster
    centroid = kmeans.cluster_centers_[i].reshape(1, -1)

    # Compute cosine similarity to the centroid
    cluster_embeddings = movie_embeddings[cluster_indices]
    sims = cosine_similarity(cluster_embeddings, centroid).flatten()

    # Get top 5 most representative movies (highest similarity first)
    top_indices = cluster_indices[np.argsort(sims)[-5:][::-1]]
    for idx in top_indices:
        print(f"  {df.loc[idx, 'title']} - {df.loc[idx, 'genres']}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns

# Reduce to 2D using PCA
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(movie_embeddings)

# Add PCA components to the DataFrame (this overwrites any earlier pca1/pca2/cluster columns on df)
df['pca1'] = reduced_embeddings[:, 0]
df['pca2'] = reduced_embeddings[:, 1]
df['cluster'] = kmeans.labels_

# Plot the clusters, one palette colour per distinct cluster label
plt.figure(figsize=(12, 8))
palette = sns.color_palette("hsv", len(df['cluster'].unique()))
sns.scatterplot(data=df, x='pca1', y='pca2', hue='cluster', palette=palette, alpha=0.7)

# Optional: label one randomly sampled movie per cluster (fixed seed)
sample_titles = df.groupby('cluster').apply(lambda x: x.sample(1, random_state=42))
for _, row in sample_titles.iterrows():
    plt.text(row['pca1'], row['pca2'], row['title'], fontsize=8)

plt.title("LSTM-based Movie Clusters (PCA Projection)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend(title="Cluster")
plt.grid(True)
plt.tight_layout()
plt.show()

.