In [None]:
import numpy as np
from abc import ABC, abstractmethod
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Class Frame (abstract interface only; never instantiated directly)

In [None]:
# --- 1. Abstract Base Class for Interchangeability ---

class TextEmbedder(ABC):
    """
    Common interface shared by the text embedders in this file.

    Concrete subclasses must supply `fit` and `transform`. `fit_transform`
    is implemented here on top of those two, so any embedder can be swapped
    for another without changing the calling code.
    """

    @abstractmethod
    def fit(self, documents):
        """
        Learn the vocabulary or model parameters from a training corpus.

        Args:
            documents (list of str): Text documents to learn from.
        """

    @abstractmethod
    def transform(self, documents):
        """
        Turn a collection of documents into numerical embeddings.

        Args:
            documents (list of str): Text documents to embed.

        Returns:
            np.ndarray: A dense NumPy array of the document embeddings.
        """

    def fit_transform(self, documents):
        """
        Fit on `documents`, then embed those same documents.

        Args:
            documents (list of str): Text documents.

        Returns:
            np.ndarray: A dense NumPy array of the document embeddings.
        """
        # The return value of fit() is deliberately ignored, so subclasses
        # are free to return either None or self from it.
        self.fit(documents)
        embeddings = self.transform(documents)
        return embeddings

# Embedder Implementations

## N-grams & TF-IDF

In [None]:
# --- 2. Finalized Embedder Implementations ---

class TfidfEmbedder(TextEmbedder):
    """
    Embeds text as TF-IDF (Term Frequency-Inverse Document Frequency) scores
    over N-grams. Terms that are frequent inside a document but rare across
    the corpus receive the highest weights, which suits keyword and topic
    identification.
    """

    def __init__(self, max_features=5000, ngram_range=(1, 2)):
        """
        Args:
            max_features (int): The maximum number of top N-grams to keep.
            ngram_range (tuple): The range of N-grams to consider (e.g., (1, 2)
                                 for unigrams and bigrams).
        """
        # English stop words are always removed; only the two arguments
        # above are configurable.
        vectorizer_options = {
            "max_features": max_features,
            "ngram_range": ngram_range,
            "stop_words": "english",
        }
        self.vectorizer = TfidfVectorizer(**vectorizer_options)
        print(f"Initialized TfidfEmbedder with ngram_range={ngram_range}, max_features={max_features}")

    def fit(self, documents):
        """
        Fit the underlying TF-IDF vectorizer on `documents`.

        Args:
            documents (list of str): Text documents to learn from.

        Returns:
            TfidfEmbedder: This instance, to allow call chaining.
        """
        print("Fitting TF-IDF vocabulary...")
        self.vectorizer.fit(documents)
        return self

    def transform(self, documents):
        """
        Embed `documents` as dense TF-IDF vectors.

        Args:
            documents (list of str): Text documents to embed.

        Returns:
            np.ndarray: The vectorizer output converted to a dense array.
        """
        print(f"Transforming {len(documents)} documents into TF-IDF vectors...")
        sparse_scores = self.vectorizer.transform(documents)
        # The vectorizer yields a sparse matrix; callers expect a dense array.
        return sparse_scores.toarray()



## Averaged Word Vector

In [None]:
class AveragedWordVectorEmbedder(TextEmbedder):
    """
    Simulates pre-trained word embeddings (like GloVe or Word2Vec). Each document
    is represented by the average of its word vectors, where a word that occurs
    several times in the document contributes once per occurrence. This method
    captures the semantic "gist" of the document but discards word order.
    """

    def __init__(self, embedding_dim=100, max_features=5000, random_state=None):
        """
        Args:
            embedding_dim (int): The desired dimensionality of the output vectors.
            max_features (int): The vocabulary size to consider.
            random_state (int or None): Seed for the simulated embedding matrix.
                With the default of None the matrix is drawn from NumPy's
                global random state (the historical behaviour); pass an int
                to get the same matrix on every fit.
        """
        self.embedding_dim = embedding_dim
        self.max_features = max_features
        self.random_state = random_state
        self.vectorizer = CountVectorizer(max_features=self.max_features, stop_words='english')
        # Set by fit(); None marks the embedder as not yet fitted.
        self.embedding_matrix_ = None
        print(f"Initialized AveragedWordVectorEmbedder with embedding_dim={embedding_dim}")

    def fit(self, documents):
        """
        Learn the vocabulary and create one simulated vector per vocabulary term.

        Args:
            documents (list of str): Text documents to learn the vocabulary from.

        Returns:
            AveragedWordVectorEmbedder: This instance, to allow call chaining.
        """
        print("Fitting vocabulary and creating simulated embedding matrix...")
        self.vectorizer.fit(documents)
        # Only the vocabulary size is needed, so there is no reason to
        # materialise the full array of feature names.
        vocab_size = len(self.vectorizer.vocabulary_)
        # In a real application, you would load a pre-trained matrix.
        # Here, we simulate it with random vectors for demonstration.
        if self.random_state is None:
            rng = np.random  # global state: unchanged default behaviour
        else:
            rng = np.random.RandomState(self.random_state)
        self.embedding_matrix_ = rng.randn(vocab_size, self.embedding_dim)
        return self

    def transform(self, documents):
        """
        Embed each document as the count-weighted mean of its word vectors.

        Args:
            documents (list of str): Text documents to embed.

        Returns:
            np.ndarray: Array of shape (number of documents, embedding_dim).
            Documents containing no in-vocabulary word are all zeros.

        Raises:
            RuntimeError: If `fit` has not been called yet.
        """
        if self.embedding_matrix_ is None:
            raise RuntimeError("Embedder has not been fitted yet.")

        print(f"Transforming {len(documents)} documents into averaged word vectors...")
        word_counts = self.vectorizer.transform(documents)

        # Number of in-vocabulary tokens per document. sum() on a sparse
        # matrix may return a 2-D np.matrix, hence the asarray/ravel.
        token_totals = np.asarray(word_counts.sum(axis=1), dtype=float).ravel()
        # One sparse-dense product gives, per document, the sum of the
        # vectors of all its tokens (each word weighted by its count).
        vector_sums = np.asarray(word_counts @ self.embedding_matrix_)

        doc_embeddings = np.zeros((word_counts.shape[0], self.embedding_dim))
        has_tokens = token_totals > 0
        # Divide only where there is something to average; rows without any
        # known word stay at zero instead of becoming NaN.
        doc_embeddings[has_tokens] = vector_sums[has_tokens] / token_totals[has_tokens, np.newaxis]
        return doc_embeddings



## Hashing + SVD (dimensionality reduction)

In [None]:
class HashingSvdEmbedder(TextEmbedder):
    """
    Two-stage embedder: a hashing vectorizer maps N-grams into a fixed-size
    feature space (the "hashing trick"), then Truncated SVD reduces those
    features to dense, lower-dimensional "topic" vectors.
    """

    def __init__(self, n_components=100, n_features=2**12, ngram_range=(1, 2)):
        """
        Args:
            n_components (int): The final embedding dimension after SVD reduction.
            n_features (int): The size of the hashing space.
            ngram_range (tuple): The range of N-grams to consider.
        """
        hasher = HashingVectorizer(
            n_features=n_features,
            ngram_range=ngram_range,
            stop_words='english',
        )
        # The SVD seed is fixed at 42 and is not configurable from here.
        reducer = TruncatedSVD(n_components=n_components, random_state=42)
        self.pipeline = make_pipeline(hasher, reducer)
        print(f"Initialized HashingSvdEmbedder with n_components={n_components}")

    def fit(self, documents):
        """
        Fit the hashing + SVD pipeline on `documents`.

        Args:
            documents (list of str): Text documents to learn from.

        Returns:
            HashingSvdEmbedder: This instance, to allow call chaining.
        """
        print("Fitting the Hashing+SVD pipeline...")
        self.pipeline.fit(documents)
        return self

    def transform(self, documents):
        """
        Embed `documents` by running them through the fitted pipeline.

        Args:
            documents (list of str): Text documents to embed.

        Returns:
            The output of the pipeline's `transform` for `documents`.
        """
        print(f"Transforming {len(documents)} documents with Hashing+SVD...")
        reduced = self.pipeline.transform(documents)
        return reduced


# DEMO

In [None]:
# --- 3. Demonstration of Interchangeability ---

if __name__ == "__main__":
    # Small corpus of game and movie blurbs used to exercise each embedder.
    game_and_movie_docs = [
        "A tactical first-person shooter with a focus on teamwork.",
        "Epic fantasy adventure with dragons and mighty magic.",
        "A sci-fi thriller set in a grim, dystopian future.",
        "Players explore a vast open world in this fantasy RPG.",
        "This movie is a thriller about a deep-sea mystery.",
    ]

    # Display name -> embedder instance. The instances are created in this
    # order, so their "Initialized ..." messages appear in this order too.
    embedders_to_test = {}
    embedders_to_test["TF-IDF"] = TfidfEmbedder(max_features=50, ngram_range=(1, 1))
    embedders_to_test["Averaged Word Vectors"] = AveragedWordVectorEmbedder(embedding_dim=64)
    embedders_to_test["Hashing + SVD"] = HashingSvdEmbedder(n_components=32, n_features=1024)

    separator = "=" * 50
    print(f"\n{separator}")
    print("DEMONSTRATING INTERCHANGEABLE EMBEDDERS")
    print(f"{separator}\n")

    for name, embedder in embedders_to_test.items():
        print(f"--- Processing with: {name} ---")

        # Every embedder is driven through the exact same call.
        embeddings = embedder.fit_transform(game_and_movie_docs)

        print(f"Output embedding shape: {embeddings.shape}")
        # Sanity checks: a 2-D NumPy array with one row per input document.
        assert isinstance(embeddings, np.ndarray)
        assert embeddings.ndim == 2
        assert embeddings.shape[0] == len(game_and_movie_docs)
        print("Successfully produced a dense NumPy array.\n")