In [18]:
import pickle
from bertopic import BERTopic
import numpy as np
from Mapping.map_topic_embedding import map_embeddings
from Metrics.mms import average_mapped_improvement

In [2]:
# Per-corpus topic counts, kept in one place so every cell reads the same values.
OPTIMAL_LDA_TOPICS = {
    "wsj": 50,
    "wiki": 80,
    "20ng": 70,
}

# Per-corpus topic counts that sampling_pipeline looks up when topic_model is
# not 'lda'. Originally marked as a placeholder for Bertopic-specific values.
OPTIMAL_BERTOPIC_TOPICS = {
    "wsj": 50,
    "wiki": 70,
    "20ng": 40,
}

In [19]:
def sample_columns(arr: np.ndarray, n: int, random_state=None) -> np.ndarray:
    """
    Draw ``n`` distinct columns of a 2D array at random.

    Parameters
    ----------
    arr : np.ndarray
        Two-dimensional input of shape (m, p).
    n : int
        How many columns to keep; must satisfy 1 <= n <= p.
    random_state : int or np.random.Generator, optional
        Passed straight to ``np.random.default_rng``; supply a seed or an
        existing Generator to make the draw reproducible.

    Returns
    -------
    np.ndarray
        Array of shape (m, n) whose columns are the selected columns of
        ``arr``, in the order the indices were drawn.

    Raises
    ------
    ValueError
        If ``n`` is not in the range 1..p.
    """
    # Unpacking (rather than indexing) the shape also rejects non-2D input.
    _, total_cols = arr.shape

    n_is_valid = 0 < n <= total_cols
    if not n_is_valid:
        raise ValueError(f"n must be between 1 and {total_cols}, got {n}")

    generator = np.random.default_rng(random_state)

    # replace=False guarantees no column index is picked twice.
    picked = generator.choice(total_cols, size=n, replace=False)
    return arr[:, picked]


In [14]:
def sampling_pipeline(corpus, lang_model, topic_model, feature_range, n_bootstraps,
                      random_state=None):
    """
    Bootstrap the mapped-improvement score over random subsets of embedding columns.

    For each feature count n = 10, 20, ... strictly below
    ``min(feature_range, embedding.shape[1])``, draws ``n_bootstraps`` random
    column subsets of the corpus embedding, runs ``map_embeddings`` on each
    subset, and scores the result with ``average_mapped_improvement``.

    Parameters
    ----------
    corpus : str
        Corpus key; must be present in OPTIMAL_LDA_TOPICS /
        OPTIMAL_BERTOPIC_TOPICS (e.g. 'wsj', 'wiki', '20ng').
    lang_model : str
        Language-model name used in the embedding file name.
    topic_model : str
        'lda' selects OPTIMAL_LDA_TOPICS; any other value selects
        OPTIMAL_BERTOPIC_TOPICS.
    feature_range : int
        Exclusive upper bound on the number of sampled columns (capped at the
        embedding width).
    n_bootstraps : int
        Number of random column subsets drawn per feature count.
    random_state : int or np.random.Generator, optional
        Seed or Generator for reproducible sampling. Defaults to fresh
        OS entropy.

    Returns
    -------
    dict
        Maps each feature count n to the list of ``n_bootstraps`` scores
        returned by ``average_mapped_improvement``.

    Raises
    ------
    KeyError
        If ``corpus`` is missing from the selected topic-count table.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # Directory prefix is 'Processed<CORPUS>' (the original 'Processe' was a typo;
    # compare 'ProcessedWIKI/...' used elsewhere in this notebook).
    embedding_path = f'Processed{corpus.upper()}/{corpus}_{lang_model}_corpus_embeddings.pkl'
    with open(embedding_path, 'rb') as fh:
        embedding = pickle.load(fh)

    if topic_model == 'lda':
        dim = OPTIMAL_LDA_TOPICS[corpus]
    else:
        dim = OPTIMAL_BERTOPIC_TOPICS[corpus]

    # NOTE(review): this always reads from Results/LDA, even when topic_model
    # is not 'lda' and `dim` came from OPTIMAL_BERTOPIC_TOPICS -- confirm the
    # intended location of non-LDA topic matrices.
    topics_path = f'Results/LDA/{corpus}_topic_doc_matrix_{dim}.pkl'
    with open(topics_path, 'rb') as fh:
        topics = pickle.load(fh)

    # Feature counts run from 10 in steps of 10, never exceeding the embedding width.
    start_feat = 10
    end_feat = min(feature_range, embedding.shape[1])

    # One Generator shared by every draw: sample_columns hands it to
    # np.random.default_rng, which returns an existing Generator unchanged, so
    # successive bootstraps differ while the whole run stays reproducible.
    # (The original passed random.random(): `random` was never imported and a
    # float is not an accepted seed.)
    rng = np.random.default_rng(random_state)

    mms_dict = {}
    for n in range(start_feat, end_feat, 10):
        feature_result_list = []

        for _ in range(n_bootstraps):
            sample = sample_columns(embedding, n, rng)
            mapping = map_embeddings(
                dataset=corpus,
                lang_model=lang_model,
                topic_model=topic_model,
                dim=dim,
                features=sample,
                topics=topics,        # original note: may also be None
                output_dir='Results', # where to save if needed
                save=False            # compute only, no file write
            )

            avg_imp = average_mapped_improvement(
                topic_model=topic_model,
                dataset=corpus,           # e.g. 'wiki', '20ng', 'wsj'
                n_topics=dim,             # the number of topics you used
                mapping=mapping,          # your mapping object
                threshold_mode='gmm',     # optional, defaults to 'gmm'
                specificity_mode='diff'   # optional, defaults to 'diff'
            )
            print(f"Average mapped specificity improvement: {avg_imp:.4f}")
            feature_result_list.append(avg_imp)
        mms_dict[n] = feature_result_list

    return mms_dict
            
    

In [15]:
# Load the pickled wiki/repllama corpus embeddings; the context manager closes
# the file handle once loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('ProcessedWIKI/wiki_repllama_corpus_embeddings.pkl', 'rb') as fh:
    embedding = pickle.load(fh)

In [32]:
# Draw 4096 embedding columns with a fixed seed so that Restart & Run All
# reproduces the same selection (and therefore the same downstream score).
features = sample_columns(embedding, 4096, random_state=0)

# Load the 80-topic LDA matrix for wiki; `with` closes the file handle.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
topics_path = 'Results/LDA/wiki_topic_doc_matrix_80.pkl'
with open(topics_path, 'rb') as fh:
    topics = pickle.load(fh)

In [33]:
# Run map_embeddings on the sampled columns and the loaded topic matrix.
# Original notes: `topics` may also be None; `output_dir` is where results
# would be saved if needed; save=False means compute only, no file write.
mapping = map_embeddings(
    dataset='wiki',
    lang_model='repllama',
    topic_model='lda',
    dim=80,
    features=features,
    topics=topics,
    output_dir='Results',
    save=False,
)

In [34]:
# Score the `mapping` object with average_mapped_improvement.
# threshold_mode and specificity_mode are optional; they are spelled out here
# with the values noted as their defaults ('gmm' and 'diff').
avg_imp = average_mapped_improvement(
    topic_model='lda',
    dataset='wiki',          # e.g. 'wiki', '20ng', 'wsj'
    n_topics=80,             # the number of topics you used
    mapping=mapping,
    threshold_mode='gmm',
    specificity_mode='diff',
)

print(f"Average mapped specificity improvement: {avg_imp:.4f}")

Average mapped specificity improvement: 0.0714


In [35]:
# Load the pickled raw wiki data; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('ProcessedWIKI/wiki_raw.pkl', 'rb') as fh:
    wiki = pickle.load(fh)

In [37]:
# Display the raw entry stored at index 100 of the loaded wiki data.
wiki[100]

'The  circuit of culture  is a theory or framework used in the area of  cultural studies .\nThe theory was devised in 1997 by a group of theorists when studying the  Walkman  cassette player. The theory suggests that in studying a cultural text or artifact you must look at five aspects: its representation,  identity , production, consumption and regulation. Du Gay et al. suggest that "taken together (these 5 points) complete a sort of circuit...through which any analysis of a cultural text...must pass if it is to be adequately studied."  &#91;1&#93;\nGerard Goggin  openly uses this framework in his book  Cell Phone Culture: Mobile technology in everyday life  in order to fully understand the cell phone as a  cultural artifact . His book is split into four parts: production, consumption, regulation, and representation and identity (through looking at mobile convergences).  &#91;2&#93;\n'