In [6]:
!pip install pandas numpy scikit-surprise ipywidgets transformers torch

# Import libraries
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import ipywidgets as widgets
from IPython.display import display
from transformers import AutoTokenizer, AutoModel
import torch


[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [25]:
%pip install transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
# Load movies and ratings dataset
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")

# Display column names to verify
print("Movies.csv columns:", movies_df.columns)
print("Ratings.csv columns:", ratings_df.columns)


Movies.csv columns: Index(['movieId', 'title', 'genres'], dtype='object')
Ratings.csv columns: Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')


In [27]:
# Define the reader for Surprise
reader = Reader(rating_scale=(ratings_df['rating'].min(), ratings_df['rating'].max()))

# Load dataset into Surprise
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Split into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Initialize SVD model
cf_model = SVD()

# Train the model
cf_model.fit(trainset)


KeyboardInterrupt: 

In [28]:
# Predict ratings for test set
predictions = cf_model.test(testset)

def precision_at_k(predictions, k=10, threshold=3.5):
    """Computes Precision@K: proportion of top-K recommendations that are relevant."""
    user_est_true = {}

    for uid, _, true_r, est, _ in predictions:
        if uid not in user_est_true:
            user_est_true[uid] = []
        user_est_true[uid].append((est, true_r))

    precisions = []

    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(reverse=True, key=lambda x: x[0])  # Sort by predicted rating
        top_k = user_ratings[:k]
        relevant = sum((true_r >= threshold) for _, true_r in top_k)
        precisions.append(relevant / k)

    return np.mean(precisions)

# Calculate precision
precision_k = precision_at_k(predictions, k=10, threshold=3.5)
print(f"Precision@10: {precision_k:.4f}")

Precision@10: 0.6669


In [29]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", cache_dir="./models")
bert_model = AutoModel.from_pretrained("distilbert-base-uncased", cache_dir="./models")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [30]:
print(bert_model)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [35]:
# Normalize movie titles before storing embeddings
movies_df['title'] = movies_df['title'].str.strip().str.lower()

movie_embeddings = {}

for title in movies_df['title']:
    movie_embeddings[title] = get_movie_embedding(title, tokenizer, bert_model)


KeyboardInterrupt: 

In [39]:
import re

def clean_title(title):
    """Cleans movie titles by removing extra spaces, special characters, and normalizing case."""
    title = title.strip().lower()  # Convert to lowercase and remove leading/trailing spaces
    title = re.sub(r'\s+', ' ', title)  # Replace multiple spaces with a single space
    title = re.sub(r'[^a-z0-9\s:!?.()-]', '', title)  # Remove unusual special characters
    return title

# Apply cleaning to movie titles
movies_df['title'] = movies_df['title'].apply(clean_title)

print("Sample cleaned titles:", movies_df['title'].head(10))  # Check first 10 cleaned titles


Sample cleaned titles: 0                      toy story (1995)
1                        jumanji (1995)
2               grumpier old men (1995)
3              waiting to exhale (1995)
4    father of the bride part ii (1995)
5                           heat (1995)
6                        sabrina (1995)
7                   tom and huck (1995)
8                   sudden death (1995)
9                      goldeneye (1995)
Name: title, dtype: object


In [40]:
print("Sample movies in embeddings:", list(movie_embeddings.keys())[:10])

Sample movies in embeddings: ['toy story (1995)', 'jumanji (1995)', 'grumpier old men (1995)', 'waiting to exhale (1995)', 'father of the bride part ii (1995)', 'heat (1995)', 'sabrina (1995)', 'tom and huck (1995)', 'sudden death (1995)', 'goldeneye (1995)']


In [42]:
import torch

def get_movie_embeddings(movie_titles, tokenizer, model, batch_size=16):
    """Generate BERT embeddings for a list of movie titles efficiently in batches."""
    embeddings = {}
    
    for i in range(0, len(movie_titles), batch_size):
        batch_titles = movie_titles[i:i+batch_size]
        
        inputs = tokenizer(batch_titles, return_tensors="pt", truncation=True, padding=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Store embeddings using mean pooling
        batch_embeddings = outputs.last_hidden_state.mean(dim=1)

        for j, title in enumerate(batch_titles):
            embeddings[title] = batch_embeddings[j].squeeze()
    
    return embeddings

# Normalize movie titles
movies_df['title'] = movies_df['title'].str.strip().str.lower()

# Generate embeddings for all movies
movie_embeddings = get_movie_embeddings(movies_df['title'].tolist(), tokenizer, bert_model)

print(f"Generated {len(movie_embeddings)} movie embeddings!")



Generated 62274 movie embeddings!


In [None]:
def find_similar_movies(movie_title, movie_embeddings, top_n=10):
    movie_title = clean_title(movie_title)  # Normalize input

    if movie_title not in movie_embeddings:
        print(f"Movie title '{movie_title}' not found! Check spelling.")
        return []
    
    target_embedding = movie_embeddings[movie_title].reshape(1, -1)
    
    similarities = {
        title: cosine_similarity(target_embedding, embedding.reshape(1, -1))[0][0] 
        for title, embedding in movie_embeddings.items() if title != movie_title
    }

    sorted_movies = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_n]
    
    return [movie for movie, score in sorted_movies]

# Test with cleaned input
print(find_similar_movies("interstellar (2014)", movie_embeddings))


In [46]:
def hybrid_recommendation(movie_title, cf_model, bert_embeddings, movies_df, ratings_df, user_id=1, top_n=10):
    movie_title = movie_title.strip().lower()
    movies_df['title'] = movies_df['title'].str.strip().str.lower()

    if movie_title not in movies_df['title'].values:
        print("Movie not found! Please check the spelling.")
        return

    movie_id = movies_df[movies_df['title'] == movie_title]['movieId'].values[0]

    # Collaborative Filtering Predictions
    all_movie_ids = movies_df['movieId'].unique()
    svd_predictions = {pred.iid: pred.est for pred in [cf_model.predict(user_id, mid) for mid in all_movie_ids if mid != movie_id]}

    # BERT Similarity Scores
    bert_scores = {}
    query_embedding = bert_embeddings[movie_id]  # Assuming embeddings are stored in a dictionary

    for mid in all_movie_ids:
        if mid != movie_id:
            bert_scores[mid] = np.dot(query_embedding, bert_embeddings[mid])  # Cosine similarity

    # Normalize Scores
    min_max_scaler = MinMaxScaler()
    svd_values = np.array(list(svd_predictions.values())).reshape(-1, 1)
    bert_values = np.array(list(bert_scores.values())).reshape(-1, 1)

    svd_scaled = min_max_scaler.fit_transform(svd_values)
    bert_scaled = min_max_scaler.fit_transform(bert_values)

    # Weighted Hybrid Score
    hybrid_scores = {mid: 0.6 * svd_scaled[i] + 0.4 * bert_scaled[i] for i, mid in enumerate(svd_predictions.keys())}

    # Sort and Recommend
    top_movies = sorted(hybrid_scores, key=hybrid_scores.get, reverse=True)[:top_n]
    print(f"Top {top_n} recommended movies for '{movie_title}':")
    for i, mid in enumerate(top_movies, 1):
        print(f"{i}. {movies_df[movies_df['movieId'] == mid]['title'].values[0]}")




In [47]:
# Create a text input and button for recommendations
movie_input = widgets.Text(placeholder="Enter a movie title")
button = widgets.Button(description="Get Recommendations")
output = widgets.Output()

# Function to run on button click
def on_button_click(b):
    with output:
        output.clear_output()
        hybrid_recommendation(movie_input.value, cf_model, movies_df, ratings_df)

# Attach function to button click
button.on_click(on_button_click)

# Display widgets
display(movie_input, button, output)


Text(value='', placeholder='Enter a movie title')

Button(description='Get Recommendations', style=ButtonStyle())

Output()

In [14]:
print(movies_df['title'].head(10))

0                      Toy Story (1995)
1                        Jumanji (1995)
2               Grumpier Old Men (1995)
3              Waiting to Exhale (1995)
4    Father of the Bride Part II (1995)
5                           Heat (1995)
6                        Sabrina (1995)
7                   Tom and Huck (1995)
8                   Sudden Death (1995)
9                      GoldenEye (1995)
Name: title, dtype: object


In [48]:
# Search for Interstellar in the dataset
search_movie = "Interstellar"

# Find the row in the dataset
interstellar_row = movies_df[movies_df['title'].str.contains(search_movie, case=False, na=False)]

if interstellar_row.empty:
    print(f"Movie '{search_movie}' not found in dataset.")
else:
    movie_description = interstellar_row.iloc[0]['genres']  # If you have descriptions, replace 'genres' with 'description'
    print(f"Found movie: {interstellar_row.iloc[0]['title']}")
    print(f"Genres: {movie_description}")

    # Encode the movie description using BERT
    interstellar_embedding = model(**tokenizer(movie_description, return_tensors="pt", truncation=True, padding=True))

    # Compare with all movie descriptions
    similarities = []
    for index, row in movies_df.iterrows():
        desc = row['genres']
        desc_embedding = model(**tokenizer(desc, return_tensors="pt", truncation=True, padding=True))
        similarity_score = torch.cosine_similarity(interstellar_embedding.last_hidden_state.mean(dim=1),
                                                   desc_embedding.last_hidden_state.mean(dim=1)).item()
        similarities.append((row['title'], similarity_score))

    # Sort movies by similarity score
    similarities.sort(key=lambda x: x[1], reverse=True)

    # Display the top 5 recommendations
    print("\nTop 5 similar movies to Interstellar:")
    for movie, score in similarities[:5]:
        print(f"{movie} (Similarity: {score:.4f})")


Found movie: interstellar (2014)
Genres: Sci-Fi|IMAX


TypeError: 'SVD' object is not callable

In [22]:
for pred in predictions[:10]:  # Checking first 10 predictions
    print(f"User: {pred.uid}, Movie: {pred.iid}, Estimated Rating: {pred.est}")

User: 127312, Movie: 56508, Estimated Rating: 3.2836101274094434
User: 28752, Movie: 106696, Estimated Rating: 3.9792754718733714
User: 3028, Movie: 3962, Estimated Rating: 2.957724904299895
User: 15852, Movie: 37857, Estimated Rating: 4.11855619326807
User: 136594, Movie: 86882, Estimated Rating: 4.017945502356065
User: 135238, Movie: 36519, Estimated Rating: 3.115317850032756
User: 57536, Movie: 72226, Estimated Rating: 3.929482614661796
User: 105549, Movie: 3429, Estimated Rating: 4.401164102704643
User: 159110, Movie: 2762, Estimated Rating: 3.9252881928577956
User: 12208, Movie: 6, Estimated Rating: 3.498705087063996
