# In [1]:
import numpy as np
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
import warnings

class BaseWordEmbedding:
    """
    Project word vectors into a low-dimensional, L2-normalised space.

    Two strategies are supported, selected by ``projection_type``:

    * ``'random'``     - multiply by a Gaussian random matrix drawn in ``fit``.
    * ``'base_words'`` - take dot products with the L2-normalised DTM rows of
      a chosen set of base words, so each output dimension corresponds to
      one base word.
    """

    # Closed set of accepted ``projection_type`` values.
    _PROJECTION_TYPES = ('random', 'base_words')

    def __init__(self, n_base_words=200, projection_type='random',
                 random_state=None):
        """
        Initialize embedding system using either random projection
        or explicit base word dimensions.

        Parameters:
        -----------
        n_base_words : int
            Number of dimensions/base words to use (must be >= 1)
        projection_type : str
            'random' for standard random projection
            'base_words' to use specific words as dimensions
        random_state : None, int, or numpy.random.Generator, optional
            Seed (or generator) for the random projection matrix. When
            None (default) the global ``np.random`` state is used, exactly
            as before this parameter existed, so ``np.random.seed`` still
            controls the result.

        Raises:
        -------
        ValueError
            If ``projection_type`` is not recognised or ``n_base_words``
            is smaller than 1.
        """
        # Fail fast: an unknown type would otherwise make fit() a silent
        # no-op and only surface later as an obscure error in transform().
        if projection_type not in self._PROJECTION_TYPES:
            raise ValueError(
                f"projection_type must be one of {self._PROJECTION_TYPES}, "
                f"got {projection_type!r}"
            )
        if n_base_words < 1:
            raise ValueError("n_base_words must be at least 1")

        self.n_dimensions = n_base_words
        self.projection_type = projection_type
        self.random_state = random_state
        self.projection_matrix = None
        self.base_words = None
        self.base_word_vectors = None

    def fit(self, dtm, words, base_words=None):
        """
        Fit the projection system.

        Parameters:
        -----------
        dtm : scipy.sparse.csr_matrix
            The input DTM with trigram-based word representations.
            Only ``dtm.shape[1]`` is used for 'random'; for 'base_words'
            the rows of the base words are extracted (dense arrays are
            accepted as well).
        words : list
            List of words corresponding to DTM rows
        base_words : list, optional
            List of words to use as base dimensions. Required if
            projection_type is 'base_words'

        Returns:
        --------
        BaseWordEmbedding
            ``self``, to allow ``embedder.fit(...).transform(...)``.

        Raises:
        -------
        ValueError
            If ``base_words`` is missing or its length differs from
            ``n_base_words`` (base-word mode only).
        KeyError
            If any base word does not occur in ``words``.
        """
        if self.projection_type == 'random':
            input_dims = dtm.shape[1]
            if self.random_state is None:
                # Legacy path: draw from the global RNG so existing code
                # relying on np.random.seed() keeps its results.
                draw_normal = np.random.normal
            else:
                draw_normal = np.random.default_rng(self.random_state).normal
            self.projection_matrix = draw_normal(
                0, 1 / np.sqrt(self.n_dimensions),
                (input_dims, self.n_dimensions)
            )

        elif self.projection_type == 'base_words':
            if base_words is None or len(base_words) != self.n_dimensions:
                raise ValueError("Must provide exactly n_base_words base words")

            word_to_idx = {word: idx for idx, word in enumerate(words)}
            missing = [word for word in base_words if word not in word_to_idx]
            if missing:
                # Same exception type as the plain dict lookup would raise,
                # but naming every offending word at once.
                raise KeyError(f"Base words not found in `words`: {missing}")
            base_indices = [word_to_idx[word] for word in base_words]

            base_rows = dtm[base_indices]
            if hasattr(base_rows, 'toarray'):
                # Only the few selected rows are densified, never the full DTM.
                base_rows = base_rows.toarray()

            # State is assigned only after all validation has passed.
            self.base_words = base_words
            self.base_word_vectors = normalize(base_rows, norm='l2', axis=1)

        else:
            # Reachable only if projection_type was reassigned after __init__.
            raise ValueError(
                f"Unknown projection_type {self.projection_type!r}"
            )

        return self

    def _projection_basis(self):
        """Return the fitted (n_features, n_dimensions) matrix or raise RuntimeError."""
        if self.projection_type == 'random':
            basis = self.projection_matrix
        else:
            basis = self.base_word_vectors
        if basis is None:
            raise RuntimeError(
                "BaseWordEmbedding is not fitted yet; call fit() first"
            )
        # Base word vectors are stored one per row, so transpose them.
        return basis if self.projection_type == 'random' else basis.T

    def transform(self, vectors):
        """
        Project vectors into the reduced space.

        Sparse input is multiplied as-is (sparse @ dense) rather than being
        converted to a dense array first, so any scipy sparse format works
        and the full input is never densified.

        Parameters:
        -----------
        vectors : scipy.sparse matrix or numpy.ndarray
            Vectors to project, one per row

        Returns:
        --------
        numpy.ndarray
            Projected vectors, each row L2-normalised

        Raises:
        -------
        RuntimeError
            If called before ``fit``.
        """
        projected = vectors @ self._projection_basis()
        return normalize(np.asarray(projected), norm='l2', axis=1)

    def get_dimension_interpretation(self, n_contributing_words=5):
        """
        For base word projection, returns the base word for each dimension
        (None if ``fit`` has not been called).
        For random projection, returns, per dimension, the input feature
        columns with the largest absolute projection weights, labelled
        ``"Feature_<column index>"`` and ordered from smallest to largest
        absolute weight.

        Parameters:
        -----------
        n_contributing_words : int
            Number of input features to report per dimension (random
            projection only). Values <= 0 give empty lists.

        Returns:
        --------
        list of str or list of list of (str, float)
            Interpretation of each dimension

        Raises:
        -------
        RuntimeError
            If random projection is used and ``fit`` has not been called.
        """
        if self.projection_type == 'base_words':
            return self.base_words

        if self.projection_matrix is None:
            raise RuntimeError(
                "BaseWordEmbedding is not fitted yet; call fit() first"
            )

        n_keep = max(int(n_contributing_words), 0)
        interpretations = []
        for dim_idx in range(self.projection_matrix.shape[1]):
            dim_weights = self.projection_matrix[:, dim_idx]
            order = np.argsort(np.abs(dim_weights))
            # order[-0:] would select everything, so zero is handled apart.
            top_indices = order[-n_keep:] if n_keep > 0 else order[:0]
            interpretations.append([
                (f"Feature_{i}", float(w))
                for i, w in zip(top_indices, dim_weights[top_indices])
            ])
        return interpretations

    def interpret_embedding(self, embedding, n_top=5):
        """
        Interpret what a given embedding represents in terms of
        base dimensions or random projection components.

        Parameters:
        -----------
        embedding : numpy.ndarray
            The 1-D embedding vector to interpret
        n_top : int
            Number of top contributing dimensions to return.
            Values <= 0 give an empty list.

        Returns:
        --------
        list of tuple
            (dimension_name, contribution) pairs, ordered by decreasing
            absolute contribution. Names are base words in base-word mode
            and ``"Dimension_<index>"`` otherwise; contributions keep
            their sign.

        Raises:
        -------
        RuntimeError
            If base-word mode is used and ``fit`` has not been called.
        """
        if n_top <= 0:
            # A plain [-n_top:] slice would return every dimension for 0.
            return []

        embedding = np.asarray(embedding)
        # Rank by magnitude, largest first.
        top_indices = np.argsort(np.abs(embedding))[-n_top:][::-1]

        if self.projection_type == 'base_words':
            if self.base_words is None:
                raise RuntimeError(
                    "BaseWordEmbedding is not fitted yet; call fit() first"
                )
            return [(self.base_words[idx], float(embedding[idx]))
                    for idx in top_indices]
        return [(f"Dimension_{idx}", float(embedding[idx]))
                for idx in top_indices]

# Example usage
def demonstrate_usage():
    """
    Run both projection modes on a small random DTM.

    Builds a 5-word vocabulary with 1000 random feature columns, embeds it
    into 3 dimensions once by random projection and once using the first
    three vocabulary words as base words.

    Returns:
    --------
    tuple of numpy.ndarray
        (random_embeddings, base_embeddings)
    """
    animal_words = ["cat", "dog", "fish", "bird", "hamster"]
    # One row per word, 1000 trigram feature columns.
    trigram_dtm = csr_matrix(np.random.rand(len(animal_words), 1000))

    # Mode 1: Gaussian random projection.
    by_random = BaseWordEmbedding(projection_type='random', n_base_words=3)
    by_random.fit(trigram_dtm, animal_words)

    # Mode 2: dimensions anchored on 'cat', 'dog' and 'fish'.
    by_anchor = BaseWordEmbedding(projection_type='base_words', n_base_words=3)
    by_anchor.fit(trigram_dtm, animal_words, base_words=animal_words[:3])

    return by_random.transform(trigram_dtm), by_anchor.transform(trigram_dtm)