In [18]:
from datetime import datetime
from itertools import product
from pathlib import Path

import wandb
from gensim import corpora
from gensim.models import LdaModel, ldamulticore
from gensim.models.callbacks import PerplexityMetric, ConvergenceMetric, CoherenceMetric
from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Name of the prepared dataset; it is used to build the dictionary, corpus
# and model locations below.
DATASET = "sample50k_topic4"

DICTIONARY_PATH = Path("../results/dictionary")
CORPUS_PATH = Path("../results/corpus")
MODEL_PATH = Path("../results/models")/DATASET

# parents=True: create ../results/models as well when it does not exist yet
# (otherwise mkdir raises FileNotFoundError on a fresh checkout).
# exist_ok=True: keeps the cell safe to re-run.
MODEL_PATH.mkdir(parents=True, exist_ok=True)

In [4]:
# Log in to the Weights & Biases (W&B) account used by the wandb.init() calls
# in the training cells of this notebook.
wandb.login()

True

In [11]:
# Restore the dictionary and the serialized (.mm) corpus that were saved for DATASET
dictionary = corpora.Dictionary.load(str(DICTIONARY_PATH / DATASET))
corpus = corpora.MmCorpus(str(CORPUS_PATH / DATASET) + ".mm")

In [12]:
# Report the vocabulary size and the document count of the loaded data
print(f"Number of unique tokens: {len(dictionary)}")
print(f"Number of documents: {len(corpus)}")

Number of unique tokens: 105408
Number of documents: 50000


### Train and save experiments to Weights & Biases

Single core version.

In [13]:
# Metric callbacks handed to LdaModel during training; all of them use the 'shell' logger
perplexity_logger = PerplexityMetric(logger="shell", corpus=corpus)
convergence_logger = ConvergenceMetric(normed=True, logger="shell")
coherence_umass_logger = CoherenceMetric(
    coherence="u_mass",
    logger="shell",
    corpus=corpus,
)

In [20]:
# Hyper-parameter grid: every (passes, iterations, topics) combination is
# trained once, logged as its own W&B run and saved to MODEL_PATH.
num_passes = [5]
num_iterations = [10, 20, 50]
num_topics = [4, 8, 12, 24, 48]
all_combinations = list(product(num_passes, num_iterations, num_topics))

# One seed shared by the model and the logged config so the two cannot drift apart.
random_state = 100

for passes, iterations, topics in all_combinations:

    start_time = datetime.now()

    config = {
        "topics": topics,
        "passes": passes,
        "iterations": iterations,
        "coherence_metric": 'u_mass',
        "model": "gensim.models.LdaModel",
        "random_state": random_state
    }

    # The run is used as a context manager so that it is always closed, even
    # when training raises or the cell is interrupted (e.g. KeyboardInterrupt);
    # a trailing wandb.finish() would be skipped in those cases.
    with wandb.init(
        project="lda-labelled-subset",
        entity="angelika",
        name=f"i{iterations}p{passes}t{topics}",
        tags=[DATASET],  # keep the tag in sync with the dataset that is loaded
        config=config,
    ) as run:

        # Create model - note callbacks argument uses list of created callback loggers
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=topics,
            passes=passes,
            iterations=iterations,
            random_state=random_state,
            callbacks=[convergence_logger, perplexity_logger, coherence_umass_logger],
        )

        # Log metrics: one W&B step per entry collected in model.metrics
        for con, coh, per in zip(
            model.metrics["Convergence"],
            model.metrics["Coherence"],
            model.metrics["Perplexity"],
        ):
            run.log({"Convergence": con,
                     "Coherence": coh,
                     "Perplexity": per})

        # Wall-clock time for this combination (includes W&B run start-up)
        time_elapsed = datetime.now() - start_time
        run.log({"time_elapsed": str(time_elapsed)})

        # save model
        model_fn = f"{MODEL_PATH}/lda_{iterations}i{passes}p{topics}t.model"
        model.save(model_fn)
        run.log({"model_name": model_fn})

0,1
Coherence,█▁▇▆▆
Convergence,█▁▆▂▄
Perplexity,█▃▂▁▁

0,1
Coherence,-0.94492
Convergence,3.0
Perplexity,369.19189
time_elapsed,0:12:30.407404


0,1
Coherence,█▁▁▂▂
Convergence,█▃▁▂▂
Perplexity,█▄▂▁▁

0,1
Coherence,-1.25565
Convergence,3.7188
Perplexity,390.64424
time_elapsed,0:14:38.359323


0,1
Coherence,█▇▄▁▆
Convergence,█▂▁▂▂
Perplexity,█▄▂▁▁

0,1
Coherence,-1.15181
Convergence,4.79762
Perplexity,472.92609
time_elapsed,0:14:04.721287


0,1
Coherence,█▄▃▂▁
Convergence,█▄▃▂▁
Perplexity,█▃▂▁▁

0,1
Coherence,-1.31416
Convergence,8.02264
Perplexity,658.14204
time_elapsed,0:11:50.470730


0,1
Coherence,▁▂█▆▇
Convergence,█▃▃▂▁
Perplexity,█▄▂▁▁

0,1
Coherence,-1.59232
Convergence,16.38891
Perplexity,1427.47403
time_elapsed,0:12:21.906589


0,1
Coherence,█▁▃▄▄
Convergence,█▅▃▅▁
Perplexity,█▃▂▁▁

0,1
Coherence,-0.94974
Convergence,1.50495
Perplexity,368.72348
time_elapsed,0:09:33.121717


0,1
Coherence,█▆▇▂▁
Convergence,█▃▂▁▁
Perplexity,█▄▂▁▁

0,1
Coherence,-1.27791
Convergence,2.73556
Perplexity,389.83745
time_elapsed,0:10:56.137461


0,1
Coherence,█▁▁▁▁
Convergence,█▃▂▂▁
Perplexity,█▄▂▁▁

0,1
Coherence,-1.19542
Convergence,5.82591
Perplexity,469.18376
time_elapsed,0:11:15.964848


0,1
Coherence,█▄▃▄▁
Convergence,█▂▂▂▁
Perplexity,█▄▂▂▁

0,1
Coherence,-1.33916
Convergence,6.30824
Perplexity,651.51159
time_elapsed,0:11:49.216288


0,1
Coherence,▁▁▄▇█
Convergence,█▂▁▁▁
Perplexity,█▅▃▂▁

0,1
Coherence,-1.74287
Convergence,17.95407
Perplexity,1381.0519
time_elapsed,0:15:22.459977


0,1
Coherence,█▁▁▂▃
Convergence,█▆▃▁▂
Perplexity,█▃▂▁▁

0,1
Coherence,-0.94974
Convergence,2.0099
Perplexity,368.66973
time_elapsed,0:12:46.593027


0,1
Coherence,█▂▁▂▂
Convergence,█▃▁▁▁
Perplexity,█▄▂▁▁

0,1
Coherence,-1.17062
Convergence,2.19032
Perplexity,392.49166
time_elapsed,0:13:35.767729


KeyboardInterrupt: 

Multicore version

num_passes = [5] # default 1
num_iterations = [10, 20, 50] # default 50
num_topics = [4, 8, 12, 24, 48]

workers = 5  # number of cores minus 1
random_state = 100

# Create experiments with different number of passes and iterations.
# Each (passes, iterations) pair is one W&B run; inside the run one model is
# trained per entry of num_topics and logged as a separate step.
run_combinations = list(product(num_passes, num_iterations))

for passes, iterations in run_combinations:

    # Start new W&B run
    config = {
        "topics": num_topics,
        "passes": passes,
        "iterations": iterations,  # same key as the single-core runs so both can be compared
        "coherence_metric": 'u_mass',
        "model": "models.ldamulticore.LdaMulticore",
        "workers": workers,
        "random_state": random_state,
    }

    # Context manager: the run is closed even if training raises or is interrupted.
    with wandb.init(
        project="lda-labelled-subset",
        entity="angelika",
        name=f"i{iterations}p{passes}t{num_topics}",
        tags=[DATASET],
        config=config,
    ) as run:

        for topic in num_topics:

            start_time = datetime.now()

            # Multicore model does not support callbacks
            model = ldamulticore.LdaMulticore(
                corpus=corpus,
                id2word=dictionary,
                num_topics=topic,
                passes=passes,
                iterations=iterations,
                random_state=random_state,
                workers=workers,
            )

            # Calculate metrics
            cm = CoherenceModel(model=model, corpus=corpus, coherence='u_mass', topn=20)
            coherence = cm.get_coherence()

            # TODO: the original cell contained a truncated second metric
            # (`cm2 = Co`); its intent cannot be recovered from the notebook,
            # so it was dropped. Add it back here if a second measure is wanted.

            time_elapsed = datetime.now() - start_time

            run.log({"Coherence": coherence,
                     "time_elapsed": round(time_elapsed.total_seconds()/60, 2),  # minutes
                     "num_topic": topic})

            # save model (file name carries the current topic count, not the whole list)
            # NOTE(review): same file-name pattern as the single-core cell, so models
            # with matching parameters overwrite each other - confirm this is intended.
            model_fn = f"{MODEL_PATH}/lda_{iterations}i{passes}p{topic}t.model"
            model.save(model_fn)
            run.log({"model_name": model_fn})