In [2]:
# conda install -q -c conda-forge gensim faiss-gpu -y

In [4]:
%%capture _messages

import pandas as pd
import gensim.downloader as api
from gensim.models import Word2Vec
import re

# Load the pre-trained 'word2vec-google-news-300' vectors through gensim's downloader.
# No training happens in this notebook. Because of the %%capture magic at the top of
# this cell, the message printed below is stored in `_messages` rather than displayed.
print("Loading Word2Vec model...")
model = api.load('word2vec-google-news-300')

In [5]:
# Read the movie catalogue into a DataFrame (relative path under ./data).
# The cells below rely on it having a 'Movie_Name' column.
df = pd.read_csv('./data/Movies_dataset.csv')

def preprocess_text(text):
    """Normalise a title for vocabulary lookup.

    Lower-cases the input and removes every character that is not an ASCII
    letter, a digit, or whitespace.

    Args:
        text: The raw title. ``None`` and float NaN (how pandas represents a
            missing CSV cell) are treated as an empty title; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    # A missing cell arrives as None/NaN, which has no .lower(). NaN is the
    # only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())

# Add the normalised title as 'title_cleaned', then keep only the first row
# seen for each normalised title.
df = (
    df.assign(title_cleaned=df['Movie_Name'].apply(preprocess_text))
      .drop_duplicates(subset='title_cleaned', keep='first')
)

# Plain Python list of the surviving cleaned titles, in DataFrame order.
movies = df['title_cleaned'].tolist()

# Function to convert item names to vectors using the Word2Vec model.
def item_name_to_vector(item_name):
    """Look up ``item_name`` in the global Word2Vec ``model``.

    Returns the stored vector, or ``None`` when the lookup raises
    ``KeyError``.
    """
    try:
        vector = model[item_name]
    except KeyError:
        vector = None
    return vector

# Map each cleaned title to its vector, skipping titles for which the lookup
# returned None.
item_vectors = {
    title: vec
    for title, vec in zip(movies, map(item_name_to_vector, movies))
    if vec is not None
}

# Report how many titles could be embedded.
print(f"Number of items with valid vectors: {len(item_vectors)}")


Number of items with valid vectors: 540


In [9]:
# set up Faiss db
import faiss
import numpy as np

# Stack the stored vectors into a single float32 matrix, one row per item,
# in the insertion order of item_vectors.
item_vector_array = np.array(list(item_vectors.values()), dtype='float32')

# Flat L2 index sized to the model's vector width, filled with all item rows.
index = faiss.IndexFlatL2(model.vector_size)
index.add(item_vector_array)

# Persist the index so it can be reused later.
faiss.write_index(index, './data/movie_index.faiss')

# Show which titles ended up in the index.
print(item_vectors.keys())

dict_keys(['transfusion', 'junge', 'devotion', 'babylon', 'plane', 'avatar', 'troll', 'fall', 'hex', 'medieval', 'smile', 'abandoned', 'prey', 'disenchanted', 'monstrous', 'beast', 'ritual', 'polar', 'luck', 'legions', 'shrek', 'uncharted', 'desire', 'hellraiser', 'titanic', 'maneater', 'piggy', 'x', 'scream', 'interstellar', 'blowback', 'samaritan', 'emancipation', 'whisper', 'crawlspace', 'slumberland', 'overdose', 'coco', 'watcher', 'infinite', 'memory', 'chappie', 'lou', 'rrr', 'togo', 'venus', 'ratatouille', 'tangled', 'f9', 'after', 'dog', 'deadpool', 'dune', 'psychosexual', 'twilight', 'illusion', 'men', 'up', 'nope', 'barbarian', 'fury', 'blackout', 'frozen', 'legion', 'hercules', 'bloodshot', 'inception', 'old', 'spiderman', 'countdown', 'ambulance', 'cinderella', 'joker', 'elvis', 'megan', 'thor', 'aftersun', 'carter', 'superfast', 'godzilla', 'venom', 'blacklight', 'soul', 'inexorable', 'tr', 'brothers', 'split', 'homebound', 'cars', 'creed', 'heat', 'incantation', 'matriarc

In [55]:
# input user preferences

# Assuming you have loaded the Faiss index and item_vectors dictionary (as shown in the previous steps).

# Function to get user input and find similar items.
def find_similar_items(user_input, k=5):
    """Print the ``k`` indexed items closest to ``user_input``.

    The input is cleaned with ``preprocess_text``, stripped of surrounding
    whitespace, and looked up with ``item_name_to_vector``; the resulting
    vector is used to query the Faiss ``index``.

    Args:
        user_input: Free-text word typed by the user.
        k: Maximum number of recommendations to print.

    Returns:
        None. Results (or a "not found" message) are printed.
    """
    # Strip after cleaning: the cleaner keeps whitespace, so a stray leading
    # or trailing space typed at a prompt would otherwise make the vocabulary
    # lookup fail.
    preprocessed_input = preprocess_text(user_input).strip()

    input_vector = item_name_to_vector(preprocessed_input)
    if input_vector is None:
        print("Item not found or not valid. Please try again.")
        return

    # The index is queried with a 2D array holding one row per query.
    query = np.array([input_vector], dtype='float32')

    # Ask for one extra neighbour so that k results remain if the query word
    # is itself an indexed item and has to be dropped.
    _, indices = index.search(query, k + 1)

    # Row i of the index corresponds to the i-th key of item_vectors; build
    # the list once rather than once per neighbour.
    item_names = list(item_vectors.keys())

    # Faiss pads the result with -1 when fewer neighbours exist than were
    # requested; without the bounds check, -1 would silently select the last
    # item in the list.
    similar_items = [
        item_names[i]
        for i in indices[0]
        if 0 <= i < len(item_names) and item_names[i] != preprocessed_input
    ]

    print(f"Top-{k} recommendations for '{user_input}':")
    for rank, item in enumerate(similar_items[:k], 1):
        print(f"{rank}. {item}")


In [56]:
# Sample usage: prompt for a single word and print its nearest indexed titles.
# input() waits for the user to type a response, so this cell needs an
# interactive session to run.
user_input = input("Enter your preference or interest: ")
find_similar_items(user_input)


Enter your preference or interest:  action


300
Top-5 recommendations for 'action':
1. it
2. robocop
3. ouija
4. argu
5. samson


In [68]:
# we can add user profiles to the recommendation system to make it more personalized. 
# The user profiles allow users to build a history of liked items, and we can use this 
# information to improve the recommendations.

# Per-user history of liked items: maps user_id -> list of item names.
user_profiles = {}


def add_item_to_profile(user_id, item_name):
    """Append ``item_name`` to the liked-items list of ``user_id``.

    A profile (empty list) is created for the user on first use.
    """
    user_profiles.setdefault(user_id, []).append(item_name)

# Function to get user recommendations based on their profile.
def get_user_recommendations(user_id, k=5):
    """Print up to ``k`` recommendations based on a user's liked items.

    The vectors of the user's liked items are averaged into a single profile
    vector, which is used to query the Faiss ``index``. Items the user
    already likes are left out of the output.

    Args:
        user_id: Key into ``user_profiles``.
        k: Maximum number of recommendations to print.

    Returns:
        None. Recommendations (or an explanatory message) are printed.
    """
    if user_id not in user_profiles:
        print(f"User {user_id} not found. Please add liked items to the user profile first.")
        return

    liked_items = user_profiles[user_id]

    # Keep only the liked items for which a vector is available.
    liked_vectors = [
        vector
        for vector in map(item_name_to_vector, liked_items)
        if vector is not None
    ]
    if not liked_vectors:
        print(f"User {user_id} has no liked items.")
        return

    # Average the liked vectors, then wrap the result as a one-row 2D query.
    user_vector = np.mean(np.array(liked_vectors, dtype='float32'), axis=0)
    user_vector = np.array([user_vector], dtype='float32')

    # Over-fetch by the number of distinct liked items: they are removed
    # below, and fetching only k would leave fewer than k recommendations
    # whenever a liked item is among the nearest neighbours.
    already_liked = set(liked_items)
    _, indices = index.search(user_vector, k + len(already_liked))

    # Row i of the index corresponds to the i-th key of item_vectors; build
    # the list once rather than once per neighbour.
    item_names = list(item_vectors.keys())

    # Faiss pads with -1 when fewer neighbours exist than were requested;
    # the bounds check stops -1 from silently selecting the last item.
    recommended_items = [
        item_names[i]
        for i in indices[0]
        if 0 <= i < len(item_names) and item_names[i] not in already_liked
    ][:k]

    print(f"Top-{k} recommendations for User {user_id}:")
    for rank, item in enumerate(recommended_items, 1):
        print(f"{rank}. {item}")


In [68]:
# we can add user profiles to the recommendation system to make it more personalized. 
# The user profiles allow users to build a history of liked items, and we can use this 
# information to improve the recommendations.

# NOTE(review): `user_profiles` and `add_item_to_profile` are also defined in
# an earlier cell of this notebook; running this cell resets the profiles to
# an empty dict.
# Per-user history of liked items: maps user_id -> list of item names.
user_profiles = {}


def add_item_to_profile(user_id, item_name):
    """Append ``item_name`` to the liked-items list of ``user_id``.

    A profile (empty list) is created for the user on first use.
    """
    user_profiles.setdefault(user_id, []).append(item_name)

# Function to get user recommendations based on their profile.
def get_user_recommendations(user_id, k=5):
    """Print up to ``k`` recommendations based on a user's liked items.

    NOTE(review): a function with this name is also defined in an earlier
    cell of this notebook; whichever cell runs last is the one in effect.

    The vectors of the user's liked items are averaged into a single profile
    vector, which is used to query the Faiss ``index``. Items the user
    already likes are left out of the output.

    Args:
        user_id: Key into ``user_profiles``.
        k: Maximum number of recommendations to print.

    Returns:
        None. Recommendations (or an explanatory message) are printed.
    """
    if user_id not in user_profiles:
        print(f"User {user_id} not found. Please add liked items to the user profile first.")
        return

    liked_items = user_profiles[user_id]

    # Keep only the liked items for which a vector is available.
    liked_vectors = [
        vector
        for vector in map(item_name_to_vector, liked_items)
        if vector is not None
    ]
    if not liked_vectors:
        print(f"User {user_id} has no liked items.")
        return

    # Average the liked vectors, then wrap the result as a one-row 2D query.
    user_vector = np.mean(np.array(liked_vectors, dtype='float32'), axis=0)
    user_vector = np.array([user_vector], dtype='float32')

    # Over-fetch by the number of distinct liked items: they are removed
    # below, and fetching only k would leave fewer than k recommendations
    # whenever a liked item is among the nearest neighbours.
    already_liked = set(liked_items)
    _, indices = index.search(user_vector, k + len(already_liked))

    # Row i of the index corresponds to the i-th key of item_vectors; build
    # the list once rather than once per neighbour.
    item_names = list(item_vectors.keys())

    # Faiss pads with -1 when fewer neighbours exist than were requested;
    # the bounds check stops -1 from silently selecting the last item.
    recommended_items = [
        item_names[i]
        for i in indices[0]
        if 0 <= i < len(item_names) and item_names[i] not in already_liked
    ][:k]

    print(f"Top-{k} recommendations for User {user_id}:")
    for rank, item in enumerate(recommended_items, 1):
        print(f"{rank}. {item}")


In [74]:
# Movie Recommendations by UserID Demo

# Ask who the user is and which titles they liked (whitespace-separated).
# The id is kept as the raw string returned by input().
user_id = input("UserID? <enter number>")
items = input("Enter item or items that you liked: ")

# Clean each token the same way the indexed titles were cleaned, so that
# e.g. "Avatar" matches the key 'avatar' instead of being dropped.
liked_items = [preprocess_text(token) for token in items.split()]

# Only indexed titles can contribute to the profile; report the rest rather
# than discarding them silently.
unknown_items = [token for token in liked_items if token not in item_vectors]
if unknown_items:
    print(f"Ignored (not in the index): {', '.join(unknown_items)}")

for item in liked_items:
    if item in item_vectors:
        add_item_to_profile(user_id, item)

# List recommendations for the user.
get_user_recommendations(user_id)

UserID? <enter number> 3
Enter item or items that you liked:  bumblebee anna underdogs


Top-5 recommendations for User 3:
1. scarface
2. samson
3. bambi
4. hancock
5. fiona
