# LDA Model training notebook

We use an interactive notebook to train our LDA model.

The notebook uses custom modules defined in other files. We run training from a notebook so that the data, once loaded, stays in memory and does not have to be re-loaded for every training run.

### Setup logging

In [None]:
# Configure logging for the whole notebook from the config file in the
# working directory. Note that fileConfig() defaults to
# disable_existing_loggers=True, so loggers created before this call are
# disabled unless the config file names them.
import logging
import logging.config as config

config.fileConfig(fname='./logging.conf')

### Resource paths

In [None]:
# Pre-computed artefacts that are loaded (not rebuilt) by this notebook.
dictionary_path, tf_idf_path = './dictionary', './tf_idf'

# Wildcard patterns for the chunked dataset files
# (presumably expanded by the article generator -- confirm in `generators`).
data_path = './dataset/chunked/train_*.bin'
test_path = './dataset/chunked/test_*.bin'

# Destination of the selected model; 'grid-xxx' looks like a placeholder
# to be replaced with a run identifier before saving.
model_path = './model/grid-xxx'

### Load pre-computed resources

The dictionary and the TF-IDF model have already been computed and saved to files, so we only load them here.

In [None]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

# Restore the token <-> id mapping saved by an earlier run.
dictionary = Dictionary.load(dictionary_path)

# Restore the fitted TF-IDF weighting model.
# NOTE(review): `tf_idf` is not referenced by any later cell shown in this
# notebook -- presumably `generators` loads its own copy; confirm.
tf_idf = TfidfModel.load(tf_idf_path)

### Load and pre-process the corpus (CNN/DailyMail)

Consuming the TF-IDF generator automatically runs the entire chain of generators: Pre-processor -> BOW -> TF-IDF

In [None]:
from generators import get_cnn_dm_article_generator
from generators import get_tf_idf_generator

# Article source for the training files matched by `data_path`.
cnn_dm_gen = get_cnn_dm_article_generator(data_path)

# Wrap the article source in the TF-IDF generator.
gen = get_tf_idf_generator(cnn_dm_gen)

# Exhaust the generator once and keep the result in memory, so the corpus
# can be reused by later cells without re-reading the dataset.
corpus = list(gen)

### Run grid search to find best LDA model

NOTE: We actually generate and store all the models in memory, so that it is easier to examine them afterwards.

In [None]:
logging.info('Grid searching for best model')

from lda_trainer import train_cnn_dm_models

# Candidate values per hyper-parameter; each key maps to the list of values
# to try. (Presumably the trainer builds one model per combination --
# confirm in `lda_trainer`.)
params_grid = dict(
    num_topics=[33, 66, 100],
    decay=[0.85, 1],
    passes=[2],
    alpha=[0.05, 0.07, 0.1],
    eta=[0.05, 0.1],
)

# The returned trainers are kept in memory so they can be inspected and
# evaluated in the cells below.
trainers = train_cnn_dm_models(corpus, dictionary, params_grid)

### Load test data

In [None]:
from generators import get_bow_generator
from generators import get_cnn_dm_article_generator

# Article source for the held-out files matched by `test_path`.
test_cnn_dm_gen = get_cnn_dm_article_generator(test_path)

# NOTE(review): the training corpus above is built with the TF-IDF
# generator, while the test corpus is plain bag-of-words -- confirm that
# evaluating on a differently weighted corpus is intended.
test_gen = get_bow_generator(test_cnn_dm_gen, dictionary)

# Materialise the generator so every model can be scored on the same data.
test_corpus = list(test_gen)

### Evaluate models against the test data

In [None]:
# Score every trained model on the held-out corpus; `perplex[i]` belongs to
# `trainers[i]`. (Assuming `trainer.model` is a gensim LdaModel,
# log_perplexity() returns a per-word likelihood bound, where a higher value
# means a better fit -- confirm.)
perplex = [
    candidate.model.log_perplexity(test_corpus)
    for candidate in trainers
]
logging.info(perplex)

### Save the best model to a file

In [None]:
import os

# NOTE(review): the winner is picked by a hard-coded index based on a manual
# inspection of an earlier run. Re-check this against the logged `perplex`
# values whenever `params_grid` changes.
best_trainer = trainers[-1] # The last model was the best one
logging.info(f'Best model [{best_trainer.params}]')

# Make sure the target directory exists before saving, so that a missing
# './model' folder cannot discard the result of the whole grid search.
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

logging.info(f'Saving model [{model_path}]')
best_trainer.model.save(model_path)