In [1]:
from datasets import load_dataset
# Load the "Pablinho/movies-dataset" dataset and convert its 'train' split
# to a pandas DataFrame; later cells read it through the name `data`.
dataset = load_dataset("Pablinho/movies-dataset")
data = dataset['train'].to_pandas()

In [3]:
# List every distinct value found in the Original_Language column.
data["Original_Language"].unique()

array(['en', 'ja', 'fr', 'hi', 'es', 'ru', 'de', 'th', 'ko', 'tr', 'cn',
       'zh', 'it', 'pt', 'ml', 'pl', 'fi', 'no', 'da', 'id', 'sv', None,
       'https://image.tmdb.org/t/p/original/6iXYe7AkQ1QIfMFuvXsSCT2zF7s.jpg',
       'nl', 'te', 'sr', 'is', 'ro', 'tl', 'fa', 'uk', 'nb', 'eu', 'lv',
       'ar', 'el', 'cs', 'ms', 'bn', 'ca', 'la', 'ta', 'hu', 'he', 'et'],
      dtype=object)

In [3]:
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

# Single source of truth for where the fitted vectorizer is persisted, so the
# save and load steps below cannot drift apart.
VECTORIZER_PATH = 'tfidf_vectorizer.pkl'

model = TfidfVectorizer(max_features=5000, stop_words="english")

# Missing overviews are replaced with empty strings before vectorizing.
tfidf_matrix = model.fit_transform(data['Overview'].fillna(''))

# Save vectorizer
with open(VECTORIZER_PATH, 'wb') as f:
    pickle.dump(model, f)

# Load vectorizer
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# file that this notebook (or another trusted source) wrote.
with open(VECTORIZER_PATH, 'rb') as f:
    model = pickle.load(f)

In [None]:
# Echo whatever `model` is currently bound to as the cell output.
# NOTE(review): `model` is assigned in more than one cell of this notebook,
# so what is shown here depends on which cell ran last.
model

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from models.tfidf_vectorizer import TfidfVectorizerModel
from models.embedding_model import EmbeddingModel
from utils.text_preprocessing import preprocess_text

class SearchEngine:
    """Rank rows of a movie table against a free-text query.

    At construction time every overview is preprocessed once, and both a
    TF-IDF matrix and an embedding matrix are built from the processed text.
    Queries are scored against one of those matrices with cosine similarity.
    """

    # Columns handed back to the caller for each hit.
    _RESULT_COLUMNS = ['Title', 'Overview', 'Popularity', 'Vote_Average']
    # Accepted values for the ``method`` argument of ``search``.
    _METHODS = ('tfidf', 'embedding')

    def __init__(self, data):
        """
        Builds the search indexes from a movie table.

        Args:
            data (pandas.DataFrame): Table with an 'Overview' column. ``search``
                additionally reads 'Title', 'Popularity' and 'Vote_Average'.
                The frame passed in is left unmodified.
        """
        self.tfidf_model = TfidfVectorizerModel()
        self.embedding_model = EmbeddingModel()

        # Prepare data. ``assign`` returns a new frame, so the helper column is
        # added to the engine's private copy instead of the caller's DataFrame.
        processed_overview = data['Overview'].fillna('').apply(preprocess_text)
        self.data = data.assign(processed_overview=processed_overview)

        self.tfidf_matrix = self.tfidf_model.fit_transform(self.data['processed_overview'])
        self.embeddings = self.embedding_model.encode(self.data['processed_overview'].tolist())

    def search(self, query, method='tfidf', top_n=10):
        """
        Returns the rows most similar to ``query``.

        Args:
            query (str): Raw query text; it is run through ``preprocess_text``.
            method (str): 'tfidf' or 'embedding'.
            top_n (int): Maximum number of rows to return; must be >= 0.

        Returns:
            tuple: (DataFrame restricted to Title/Overview/Popularity/Vote_Average,
            array of the matching similarity scores), both ordered from the
            highest score to the lowest.

        Raises:
            ValueError: If ``method`` is not supported or ``top_n`` is negative.
        """
        # Validate arguments before doing any work.
        if method not in self._METHODS:
            raise ValueError("Invalid method. Choose 'tfidf' or 'embedding'.")
        if top_n < 0:
            # A negative value would make the slice below drop rows from the
            # end instead of limiting the result size.
            raise ValueError("top_n must be a non-negative integer.")

        query = preprocess_text(query)

        if method == 'tfidf':
            query_vector = self.tfidf_model.transform([query])
            scores = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        else:
            query_vector = self.embedding_model.encode([query])
            scores = cosine_similarity(query_vector, self.embeddings).flatten()

        # argsort is ascending, so reverse it to get best-first positions.
        top_indices = np.argsort(scores)[::-1][:top_n]
        # Positional lookup: matrix row i corresponds to the i-th row of self.data.
        results = self.data.iloc[top_indices]

        return results[self._RESULT_COLUMNS], scores[top_indices]

ModuleNotFoundError: No module named 'sklearn'

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

class TfidfVectorizerModel:
    """Small wrapper that owns a ``TfidfVectorizer`` and forwards calls to it."""

    def __init__(self, max_features=5000, stop_words='english'):
        """
        Creates the underlying TF-IDF vectorizer.

        Args:
            max_features (int): Upper bound on the number of TF-IDF features.
            stop_words (str): Stop-word setting passed to the vectorizer.
        """
        vectorizer_options = {
            'max_features': max_features,
            'stop_words': stop_words,
        }
        self.vectorizer = TfidfVectorizer(**vectorizer_options)

    def fit_transform(self, corpus):
        """
        Learns the vocabulary from ``corpus`` and vectorizes it in one step.

        Args:
            corpus (list[str]): Preprocessed text documents.

        Returns:
            sparse matrix: TF-IDF representation of ``corpus``.
        """
        corpus_matrix = self.vectorizer.fit_transform(corpus)
        return corpus_matrix

    def transform(self, query):
        """
        Vectorizes ``query`` with the already-fitted vectorizer.

        Args:
            query (list[str]): List holding the query text.

        Returns:
            sparse matrix: TF-IDF representation of the query.
        """
        query_matrix = self.vectorizer.transform(query)
        return query_matrix

# Three short documents used to sanity-check the wrapper.
corpus = [
    "Space adventure with aliens",
    "A romantic story in Paris",
    "A thriller with a detective in London",
]

model = TfidfVectorizerModel()

# Fit on the toy corpus and inspect the resulting matrix.
tfidf_matrix = model.fit_transform(corpus)
print("TF-IDF matrix shape:", tfidf_matrix.shape)
print(tfidf_matrix)

# Vectorize a query with the fitted vocabulary and inspect it.
query_vector = model.transform(["alien adventure"])
print("Query vector shape:", query_vector.shape)
print(query_vector)

TF-IDF matrix shape: (3, 9)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9 stored elements and shape (3, 9)>
  Coords	Values
  (0, 6)	0.5773502691896257
  (0, 0)	0.5773502691896257
  (0, 1)	0.5773502691896257
  (1, 5)	0.5773502691896257
  (1, 7)	0.5773502691896257
  (1, 4)	0.5773502691896257
  (2, 8)	0.5773502691896257
  (2, 2)	0.5773502691896257
  (2, 3)	0.5773502691896257
Query vector shape: (1, 9)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1 stored elements and shape (1, 9)>
  Coords	Values
  (0, 0)	1.0


In [12]:
from sentence_transformers import SentenceTransformer

class EmbeddingModel:
    """Small wrapper that owns a ``SentenceTransformer`` and encodes text with it."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Loads the sentence-embedding model.

        Args:
            model_name (str): Name of the pre-trained model to load.
        """
        sentence_model = SentenceTransformer(model_name)
        self.model = sentence_model

    def encode(self, texts):
        """
        Turns a list of texts into embedding vectors.

        Args:
            texts (list[str]): Preprocessed text documents.

        Returns:
            np.ndarray: Embeddings for the input texts.
        """
        # Ask the underlying model for a NumPy result.
        vectors = self.model.encode(texts, convert_to_numpy=True)
        return vectors


# Instantiate the wrapper first; it does not depend on the sample texts.
embedding_model = EmbeddingModel()

# Three short documents used to sanity-check the encoder.
texts = [
    "Space adventure with aliens",
    "A romantic story in Paris",
    "A thriller with a detective in London",
]

# Encode the samples and report the shape of the resulting array.
embeddings = embedding_model.encode(texts)
print("Embeddings shape:", embeddings.shape)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


Embeddings shape: (3, 384)


In [13]:
import psycopg2

In [10]:
# Show the first row of `tfidf_matrix` as the cell output.
# NOTE(review): `tfidf_matrix` is assigned both from the full dataset and from
# the three-document demo corpus, so which matrix this indexes depends on the
# order the cells were executed in.
tfidf_matrix[0]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3 stored elements and shape (1, 9)>