In [1]:
# Install into the running kernel's environment (%pip) rather than whichever
# pip happens to be first on the shell PATH (!pip). -q keeps the log short.
%pip install -q pandas sentence_transformers redis
%pip install -q --upgrade redis





Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting redis
  Downloading redis-4.6.0-py3-none-any.whl (241 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m241.1/241.1 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m136.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h

In [2]:
# -o overwrites without prompting so the cell can be re-run non-interactively;
# -d extracts to the directory the loader reads the CSV from.
!unzip -o /data/IMDb_Data_final.zip -d /content

Archive:  /data/IMDb_Data_final.zip
  inflating: IMDb_Data_final.csv     


In [5]:
import os

import numpy as np
import pandas as pd
import redis
from redis.commands.search import Search
from redis.commands.search.query import Query
from sentence_transformers import SentenceTransformer

# Redis connection settings. Each value can be overridden through an
# environment variable so real credentials never have to be committed with
# the notebook; the defaults reproduce the original hard-coded values.
redis_host = os.environ.get('REDIS_HOST', 'redis-18547.c305.ap-south-1-1.ec2.cloud.redislabs.com')
redis_port = int(os.environ.get('REDIS_PORT', '18547'))
redis_password = os.environ.get('REDIS_PASSWORD', '')

class MovieRecommender:
    """Semantic movie search backed by a Redis HNSW vector index.

    Typical flow: ``load_data()`` builds one descriptive sentence per movie and
    embeds it, ``load_and_index_data()`` creates the index and uploads the
    rows, and ``search()`` embeds a free-text query and runs a KNN lookup.
    """

    def __init__(self, redis_host, redis_port, redis_password, csv_file_path, model_name):
        """
        Initialize the MovieRecommender class with the necessary parameters.

        Loads the SentenceTransformer model and creates the Redis client.

        Args:
        redis_host (str): The host address of the Redis server.
        redis_port (int): The port number of the Redis server.
        redis_password (str): The password to connect to the Redis server.
        csv_file_path (str): The path to the CSV file containing movie data.
        model_name (str): The name of the SentenceTransformer model to use for encoding.
        """
        self.redis_host = redis_host
        self.redis_port = redis_port
        self.redis_password = redis_password
        self.csv_file_path = csv_file_path
        self.model_name = model_name
        self.model = SentenceTransformer(self.model_name)
        self.redis_client = redis.Redis(
            host=self.redis_host,
            port=self.redis_port,
            password=self.redis_password
        )

    def load_data(self):
        """
        Load the movie data from the CSV file and preprocess it.

        Rows missing 'Category', 'Duration' or 'Censor-board-rating' are
        dropped and the index is renumbered from 0. A lower-cased descriptive
        sentence is stored in 'formatted_text' and its embedding in
        'row_vector'. The result is kept on ``self.movie_df``.
        """
        movie_df = pd.read_csv(self.csv_file_path)
        movie_df = movie_df.dropna(subset=['Category', 'Duration', 'Censor-board-rating'])
        # drop=True renumbers the rows without materialising the old index as a column.
        movie_df = movie_df.reset_index(drop=True)
        movie_df['formatted_text'] = (
            "The movie " + movie_df['Title'] + " directed by " + movie_df['Director'] +
            " starring " + movie_df['Stars'] + " has an IMDb rating of " + movie_df['IMDb-Rating'].astype(str) +
            " under the category " + movie_df['Category'] + " with a duration of " + movie_df['Duration'] +
            ". It has a censor board rating of " + movie_df['Censor-board-rating'] +
            " and was released in the year " + movie_df['ReleaseYear'].astype(str) + "."
        ).str.lower()
        # Encode every sentence in a single batched call instead of one model
        # invocation per row; the result is split back into one vector per row.
        embeddings = self.model.encode(movie_df['formatted_text'].tolist())
        movie_df['row_vector'] = list(embeddings)
        self.movie_df = movie_df

    def load_vectors(self, _df, vector_data, vector_field_name):
        """
        Load the vectors into the Redis server.

        Each row is written as a hash under the key ``title_id:<index label>``.
        Vectors are matched to rows by position, so ``vector_data[i]`` belongs
        to the i-th row of ``_df`` regardless of the DataFrame's index labels.

        Args:
        _df (DataFrame): The DataFrame containing the movie data.
        vector_data (list): The list of vector data, one entry per row of _df.
        vector_field_name (str): The name of the field to store the vector data.

        Raises:
        IndexError: If vector_data has fewer entries than _df has rows.
        """
        pipe = self.redis_client.pipeline(transaction=False)
        for position, (index, row) in enumerate(_df.iterrows()):
            key = 'title_id:' + str(index)
            _data_mapping = {
                "Title": row['Title'],
                "Director": row['Director'],
                "Stars": row['Stars'],
                "Rating": row['IMDb-Rating'],
                "Category": row['Category'],
                "Duration": row['Duration'],
                "Censor_Rating": row['Censor-board-rating'],
                "ReleaseYear": row['ReleaseYear'],
                # The index is declared FLOAT32, so store raw float32 bytes.
                vector_field_name: np.asarray(vector_data[position], dtype=np.float32).tobytes()
            }
            pipe.hset(key, mapping=_data_mapping)
        # All HSETs are sent in one round trip.
        pipe.execute()

    def create_hnsw_index(self, vector_field_name, number_of_vectors, vector_dimensions, distance_metric='COSINE', M=40, EF=200, EF_RUNTIME=100, EPSILON=0.2):
        """
        Create the HNSW index in the Redis server.

        The index is named "idx" and covers the movie metadata fields plus one
        FLOAT32 vector field.

        Args:
        vector_field_name (str): The name of the field to store the vector data.
        number_of_vectors (int): The number of vectors to store in the index.
        vector_dimensions (int): The number of dimensions of the vectors.
        distance_metric (str): The distance metric to use for the index.
        M (int): The M parameter for the HNSW index.
        EF (int): The EF parameter for the HNSW index.
        EF_RUNTIME (int): The EF_RUNTIME parameter for the HNSW index.
        EPSILON (float): The EPSILON parameter for the HNSW index.
        """
        vector_attributes = [
            "DIM", str(vector_dimensions),
            "TYPE", "FLOAT32",
            "DISTANCE_METRIC", str(distance_metric),
            "INITIAL_CAP", str(number_of_vectors),
            "M", M,
            "EF_CONSTRUCTION", EF,
            "EF_RUNTIME", EF_RUNTIME,
            "EPSILON", EPSILON,
        ]
        create_command = [
            "FT.CREATE", "idx", "SCHEMA",
            "Title", "TAG",
            "Director", "TEXT",
            "Stars", "TEXT",
            "Rating", "NUMERIC",
            "Category", "TEXT",
            "Duration", "TAG",
            "Censor_Rating", "TAG",
            "ReleaseYear", "NUMERIC",
            # The token after HNSW is the number of attribute tokens that
            # follow; derive it so it cannot drift from the list above.
            vector_field_name, "VECTOR", "HNSW", str(len(vector_attributes)),
            *vector_attributes,
        ]
        self.redis_client.execute_command(*create_command)

    def load_and_index_data(self, num_movies=800, vector_field_name='movie_vector', distance_metric='COSINE', dimensions=384):
        """
        Load the data and index it in the Redis server.

        WARNING: this first calls ``flushall()``, which removes every key in
        every database of the connected Redis server. ``load_data()`` must
        have been called beforehand so that ``self.movie_df`` exists.

        Args:
        num_movies (int): The number of movies to load and index.
        vector_field_name (str): The name of the field to store the vector data.
        distance_metric (str): The distance metric to use for the index.
        dimensions (int): The number of dimensions of the vectors.
        """
        self.redis_client.flushall()
        self.create_hnsw_index(vector_field_name, num_movies, dimensions, distance_metric)
        movie_vec = self.movie_df['row_vector'].tolist()
        self.load_vectors(self.movie_df.head(num_movies), movie_vec[0:num_movies], vector_field_name)

    def search(self, user_query, top_k=4, dist_field_name="__movie_vector_score", vector_field_name="movie_vector"):
        """
        Search the index for the most similar movies to the user's query.

        Args:
        user_query (str): The user's query.
        top_k (int): The number of results to return.
        dist_field_name (str): The name of the field to store the distance scores.
        vector_field_name (str): The vector field to search; must match the
            name used when the index was created.

        Returns:
        docs: The search result object; the matching movies are in its
            ``docs`` attribute, ordered by ascending distance.
        """
        query_vector = self.model.encode(user_query.lower())
        q = (
            Query(f'*=>[KNN $K @{vector_field_name} $BLOB AS {dist_field_name}]')
            .return_fields('Title','Director','Stars','Rating', 'Category','Duration','Censor_Rating','ReleaseYear')
            .paging(0, top_k)
            .sort_by(dist_field_name)
            .dialect(2)
        )
        # KNN is asked for one candidate more than is returned; paging above
        # trims the response to top_k.
        params_dict = {"K": top_k + 1, "BLOB": query_vector.astype(np.float32).tobytes()}
        # ft() with no argument addresses the index named "idx".
        docs = self.redis_client.ft().search(q, params_dict)
        return docs

# Data Loading and Indexing
def load_and_index():
    """Embed the movie CSV and (re)build the Redis index from it.

    Connection details come from the module-level ``redis_host``,
    ``redis_port`` and ``redis_password``; edit those, the CSV path or the
    model name below to point at a different server, dataset or encoder.
    """
    movie_recommender = MovieRecommender(
        redis_host,
        redis_port,
        redis_password,
        csv_file_path='/content/IMDb_Data_final.csv',
        model_name='sentence-transformers/all-MiniLM-L12-v1',
    )
    # Embedding must happen first: indexing reads the vectors it produces.
    movie_recommender.load_data()
    movie_recommender.load_and_index_data()

# Search
def search(user_query, top_k=4):
    """Print and return the movies most similar to a free-text query.

    Connection details come from the module-level ``redis_host``,
    ``redis_port`` and ``redis_password``. The index must already have been
    populated (see ``load_and_index``).

    Args:
        user_query (str): Natural-language description of what to watch.
        top_k (int): Number of matches to show.

    Returns:
        The search result object from ``MovieRecommender.search``; the
        matching documents are in its ``docs`` attribute. Previously nothing
        was returned, so callers that stored the result only ever got ``None``.
    """
    csv_file_path = '/content/IMDb_Data_final.csv'
    model_name = 'sentence-transformers/all-MiniLM-L12-v1'

    # NOTE(review): a new recommender (and therefore a fresh model load) is
    # created on every call; consider reusing one instance if this gets slow.
    recommender = MovieRecommender(redis_host, redis_port, redis_password, csv_file_path, model_name)
    docs = recommender.search(user_query, top_k)

    # print results
    for doc in docs.docs:
        print(doc)

    return docs

# Embed the dataset and rebuild the Redis index before querying.
load_and_index()

# Free-text preference to match against the indexed movies.
user_query = "i want to watch movies under comedy category with imdb rating of 7 and above"

# Show the four nearest matches. search() prints each hit itself; the trailing
# semicolon only keeps the cell from echoing a return value.
search(user_query, top_k=4);


Document {'id': 'title_id:528', 'payload': None, 'Title': 'The King of Comedy', 'Director': 'MartinScorsese', 'Stars': 'RobertDeNiro, JerryLewis, DiahnneAbbott, SandraBernhard', 'Rating': '7.8', 'Category': 'Comedy,Crime,Drama', 'Duration': '109min', 'Censor_Rating': 'U', 'ReleaseYear': '1982'}
Document {'id': 'title_id:15', 'payload': None, 'Title': 'The Goonies', 'Director': 'RichardDonner', 'Stars': 'SeanAstin, JoshBrolin, JeffCohen, CoreyFeldman', 'Rating': '7.7', 'Category': 'Adventure,Comedy,Family', 'Duration': '114min', 'Censor_Rating': 'U', 'ReleaseYear': '1985'}
Document {'id': 'title_id:79', 'payload': None, 'Title': 'The Breakfast Club', 'Director': 'JohnHughes', 'Stars': 'EmilioEstevez, JuddNelson, MollyRingwald, AllySheedy', 'Rating': '7.8', 'Category': 'Comedy,Drama', 'Duration': '97min', 'Censor_Rating': 'UA', 'ReleaseYear': '1985'}
Document {'id': 'title_id:468', 'payload': None, 'Title': 'Being John Malkovich', 'Director': 'SpikeJonze', 'Stars': 'JohnCusack, CameronDi

In [11]:
# Same sample query again, this time against the index that already exists.
user_query = "i want to watch movies under comedy category with imdb rating of 7 and above"

# search() prints each hit itself; the trailing semicolon only keeps the cell
# from echoing a return value.
search(user_query, top_k=4);

Document {'id': 'title_id:528', 'payload': None, 'Title': 'The King of Comedy', 'Director': 'MartinScorsese', 'Stars': 'RobertDeNiro, JerryLewis, DiahnneAbbott, SandraBernhard', 'Rating': '7.8', 'Category': 'Comedy,Crime,Drama', 'Duration': '109min', 'Censor_Rating': 'U', 'ReleaseYear': '1982'}
Document {'id': 'title_id:15', 'payload': None, 'Title': 'The Goonies', 'Director': 'RichardDonner', 'Stars': 'SeanAstin, JoshBrolin, JeffCohen, CoreyFeldman', 'Rating': '7.7', 'Category': 'Adventure,Comedy,Family', 'Duration': '114min', 'Censor_Rating': 'U', 'ReleaseYear': '1985'}
Document {'id': 'title_id:79', 'payload': None, 'Title': 'The Breakfast Club', 'Director': 'JohnHughes', 'Stars': 'EmilioEstevez, JuddNelson, MollyRingwald, AllySheedy', 'Rating': '7.8', 'Category': 'Comedy,Drama', 'Duration': '97min', 'Censor_Rating': 'UA', 'ReleaseYear': '1985'}
Document {'id': 'title_id:468', 'payload': None, 'Title': 'Being John Malkovich', 'Director': 'SpikeJonze', 'Stars': 'JohnCusack, CameronDi

In [8]:
# %pip installs into the running kernel's environment; !pip may hit another one.
%pip install -q ipywidgets
!jupyter nbextension enable --py widgetsnbextension


Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi
Successfully installed jedi-0.18.2
Enabling notebook extension jupyter-js-widgets/extension...
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json
Paths used for configuration of notebook: 
    	
      - Validating: [32mOK[0m
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json


In [10]:
from ipywidgets import interact_manual

# Initialize the recommender system
# load_and_index()

def interactive_search():
    """Render a query text box, a ``top_k`` slider and a run button.

    Pressing the button runs ``search`` with the current widget values.
    """
    @interact_manual
    def get_input(user_query='Enter your preference here...', top_k=4):
        # The slider generated from the integer default can be dragged below
        # 1, which is not a meaningful number of results to request.
        if top_k < 1:
            print("top_k must be at least 1.")
            return
        # search() already prints every hit, so nothing more is printed here.
        search(user_query, top_k)

interactive_search()


interactive(children=(Text(value='Enter your preference here...', description='user_query'), IntSlider(value=4…