In [3]:
import numpy as np
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import normalize
from sklearn.random_projection import GaussianRandomProjection
import umap

def reduce_dimensions(dtm, n_components=200, method='svd', normalize_output=True):
    """
    Reduce dimensionality of a DTM using various methods.
    
    Parameters:
    -----------
    dtm : scipy.sparse.csr_matrix or numpy.ndarray
        The input DTM with trigram-based word representations
    n_components : int
        Number of dimensions in the reduced space
    method : str
        Reduction method: 'svd', 'nmf', 'random_projection', or 'umap'
    normalize_output : bool
        Whether to L2-normalize the output vectors
        
    Returns:
    --------
    reduced_dtm : numpy.ndarray
        The reduced dimensionality representation
    """
    
    # Initialize the appropriate reduction method
    if method == 'svd':
        # SVD is often the best choice for text data
        reducer = TruncatedSVD(n_components=n_components, random_state=42)
        
    elif method == 'nmf':
        # NMF for non-negative data, preserves interpretability
        reducer = NMF(n_components=n_components, random_state=42)
        
    elif method == 'random_projection':
        # Fast and memory-efficient, but less accurate
        reducer = GaussianRandomProjection(n_components=n_components, random_state=42)
        
    elif method == 'umap':
        # Better preserves local structure, good for visualization
        reducer = umap.UMAP(n_components=n_components, random_state=42)
        
    else:
        raise ValueError("Unsupported method. Choose 'svd', 'nmf', 'random_projection', or 'umap'")
    
    # Perform the reduction
    reduced_dtm = reducer.fit_transform(dtm)
    
    # Normalize if requested
    if normalize_output:
        reduced_dtm = normalize(reduced_dtm, norm='l2', axis=1)
    
    return reduced_dtm

def evaluate_reduction(original_dtm, reduced_dtm, n_neighbors=5):
    """
    Evaluate the quality of dimensionality reduction by comparing
    nearest neighbors before and after reduction.
    
    Returns the average neighbor overlap percentage.
    """
    from sklearn.neighbors import NearestNeighbors
    
    # Fit nearest neighbors on original data
    nn_orig = NearestNeighbors(n_neighbors=n_neighbors)
    nn_orig.fit(original_dtm)
    neighbors_orig = nn_orig.kneighbors(original_dtm, return_distance=False)
    
    # Fit nearest neighbors on reduced data
    nn_reduced = NearestNeighbors(n_neighbors=n_neighbors)
    nn_reduced.fit(reduced_dtm)
    neighbors_reduced = nn_reduced.kneighbors(reduced_dtm, return_distance=False)
    
    # Calculate overlap
    overlap = np.mean([
        len(set(orig) & set(red)) / n_neighbors
        for orig, red in zip(neighbors_orig, neighbors_reduced)
    ])
    
    return overlap

In [4]:
# Example usage
dtm=""
reduced_vectors = reduce_dimensions(
    dtm,
    n_components=200,
    method='svd',
    normalize_output=True
)

# Evaluate the quality of the reduction
quality_score = evaluate_reduction(your_dtm, reduced_vectors)
print(f"Neighbor preservation score: {quality_score:.2f}")

ValueError: Expected 2D array, got scalar array instead:
array=.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.