# Chapter 1

In this chapter we investigate a number of basic questions

1. The ability of LDA to generalise to increasingly large numbers of topics, versus a mixture of multinomials, by comparing MoM/VB and MoM/Gibbs with LDA/VB and LDA/Gibbs
2. The ability of the CVB and CVB0 algorithms to outperform LDA/VB and be competitive with the LDA/Gibbs algorithm on the same three datasets as (1)
3. The comparison of three concentration parameters with HDP on one dataset, versus the LDA/VB spread on all
4. The effectiveness of the different evaluation techniques on 4-news using LDA/VB and MoM/VB
5. Perhaps a comparison of the impact of different ways of estimating perplexity

### Outstanding issues


1. [x] Locate files
1. [x] Find dictionary for NIPS
1. [x] Does every NIPS feature matrix have an intercept. If so, where is it?
1. [ ] What is the "4" variant of NIPS
1. [x] Regen Reuters corpora and verify (with tags and doc-ids)
1. [x] Regen 20-news corpus and verify
1. [x] Regen NIPS corpus excluding references
1. [ ] Regen ACL
1. [ ] Find dictionary for Arxiv (or regen it)
1. [x] Find right version of 20News 
1. [x] See if the IDs are really necessary to align the various matrices
1. [x] Figure out ACL vs Arxiv
 * The trick is using symbolic links to make ACL corpus pipeline run on the extracted Arxiv data. It's quite rough.
 * No word on NIPS, however
 * Also ACL feature extraction was terrible
 


# Prelude

## Logging

In [18]:
import logging

# Configure root logging for the notebook: INFO level, with the timestamp,
# level, module and function name on every record.
# NOTE: per the original author's observation, Jupyter needs at least one
# throwaway getLogger() call before basicConfig() for the settings to stick;
# whether the same is needed when run from the command line is unconfirmed.
_ = logging.getLogger()
logging.basicConfig(
    format='%(asctime)s  %(levelname)-7s %(module)s::%(funcName)s() - %(message)s',
    level=logging.INFO
)

## Imports

In [14]:
import sys
import pathlib

In [2]:
sys.path.append(str(pathlib.Path.cwd().parent))

In [3]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse as ssp
import matplotlib.pyplot as plt
import seaborn as sns
from typing import NamedTuple, Any, List, Dict, Set, Union
import pickle as pkl
# import _pickle as cpkl
import six; 
from six.moves import cPickle as cpkl
import gzip
import pandas as pd
import logging
from IPython.display import display

%matplotlib inline

## Configuration

In [4]:
# Root directory under which the dataset directories used below are located.
DATASET_DIR = pathlib.Path('/', 'Volumes', 'DatasetSSD')

In [6]:
class LabelledMatrix:
    """A matrix paired with optional labels for its columns."""

    # The matrix itself; __init__ also tolerates None here.
    values: Union[ssp.csr_matrix, np.ndarray]
    # Optional sequence with one entry per column of `values`.
    labels: Any = None

    def __init__(self, values, labels=None):
        """Store the matrix and its labels, checking that they agree.

        Args:
            values: The matrix (or ``None``).
            labels: Optional column labels.

        Raises:
            AssertionError: If both arguments are given and the number of
                labels differs from the number of columns of ``values``.
        """
        self.values = values
        self.labels = labels

        if self.values is not None and self.labels is not None:
            assert(self.values.shape[1] == len(self.labels)), \
                f"Matrix has shape {self.values.shape} but labels has length {len(self.labels)}"

    def __str__(self):
        """Return ``[<shape>]``, followed by ``(labelled)`` when labels are set."""
        suffix = "" if self.labels is None else "(labelled)"
        # __init__ accepts values=None, so a .shape attribute cannot be assumed.
        shape = None if self.values is None else self.values.shape
        return f"[{shape}]{suffix}"
    
class RawData(NamedTuple):
    """The matrices that make up one dataset.

    Only ``words`` is required; every other field defaults to ``None``.
    """

    # Word matrix, labelled with the dictionary.
    words: LabelledMatrix
    # Optional side-information (feature) matrix.
    feats: Union[LabelledMatrix, None] = None
    # Optional citation / reference matrix.
    cites: Union[LabelledMatrix, None] = None
    # Optional author matrix.
    authors: Union[LabelledMatrix, None] = None
    # Optional category matrix.
    categories: Union[LabelledMatrix, None] = None
    # Optional labels for the rows (presumably one per document - confirm).
    row_labels: Union[List[str], None] = None

    def __str__(self):
        """Summarise each field via ``str()`` plus the number of row labels."""
        n_row_labels = '' if self.row_labels is None else len(self.row_labels)
        # The closing parenthesis was missing from the original format string.
        return (
            f"RawData(words{self.words}, feats{self.feats}, cites{self.cites}, authors{self.authors}, "
            f"categories{self.categories}, row_labels[{n_row_labels}])"
        )

# Data Cleaning

## Ensure we have Category Information for all Datasets

In [11]:
def flatten(l: List[List[Any]]) -> List[Any]:
    """Concatenate the items of every sub-sequence of ``l`` into one flat list."""
    flat: List[Any] = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [26]:
def read_metadata(nltk_metadata_file: pathlib.Path, fileIds: List[str]) -> Union[List[str], None]:
    """Read per-file metadata values and return them in ``fileIds`` order.

    Every non-blank line of the file is read as
    ``<file-id> <value> [<value> ...]``, separated by whitespace.

    Args:
        nltk_metadata_file: Path of the metadata file.
        fileIds: The file IDs whose values are wanted, in output order.

    Returns:
        One flat list with the values of every requested ID, concatenated in
        the order of ``fileIds``, or ``None`` if the file does not exist.

    Raises:
        KeyError: If an entry of ``fileIds`` has no line in the file.
    """
    if not nltk_metadata_file.exists():
        logging.info(f"No such metadata file {nltk_metadata_file}")
        return None

    logging.info(f"Reading metadata from {nltk_metadata_file}")
    row_cats = {}
    with open(nltk_metadata_file, 'r') as f:
        for line in f:
            # split() with no argument also discards the trailing newline,
            # which split(' ') left attached to the last value of every line,
            # and does not produce empty tokens for repeated spaces.
            vals = line.split()
            if vals:
                row_cats[vals[0]] = vals[1:]

    # NOTE(review): the values are flattened across files, so the result only
    # lines up one-to-one with fileIds when each file has exactly one value.
    return [value for file_id in fileIds for value in row_cats[file_id]]

In [29]:
def create_pickled_id_cat_word_metadata(
    nltk_input_dir: pathlib.Path,
    processed_input_dir: pathlib.Path,
    output_input_dir: pathlib.Path = None) -> None:
    """Convert a corpus's Python-source IDs/dictionary and metadata to pickles.

    Executes ``ids-words.py`` and ``words.py`` from ``processed_input_dir``
    and pickles the ``fileIds`` and ``words_dict`` names they define to
    ``fileIds.pkl`` and ``words_dict.pkl``. If ``cats.txt`` / ``authors.txt``
    exist in ``nltk_input_dir`` and yield any values, those are pickled to
    ``cats.pkl`` / ``authors.pkl``.

    Args:
        nltk_input_dir: Directory holding ``cats.txt`` and ``authors.txt``.
        processed_input_dir: Directory holding ``ids-words.py`` and ``words.py``.
        output_input_dir: Directory the pickles are written to; defaults to
            ``processed_input_dir``.
    """
    if output_input_dir is None:
        output_input_dir = processed_input_dir

    logging.info("Reading in IDs and word dictionary")
    # SECURITY: exec() runs whatever code these files contain; only point this
    # at directories whose contents are trusted.
    # The scripts' assignments are collected in an explicit namespace rather
    # than read back through locals(): since Python 3.13 (PEP 667) each
    # locals() call in a function returns an independent snapshot, so names
    # created by exec() would no longer be visible through it.
    namespace = {}
    for script_name in ("ids-words.py", "words.py"):
        source = (processed_input_dir / script_name).read_text()
        exec(source, globals(), namespace)
    file_ids = namespace.get('fileIds')
    words_dict = namespace.get('words_dict')

    logging.info("Writing out row IDs and dictionary entries")
    # Outputs go to output_input_dir; the original computed that directory but
    # then always wrote to processed_input_dir.
    with open(output_input_dir / 'fileIds.pkl', 'wb') as f:
        pkl.dump(file_ids, f)
    with open(output_input_dir / 'words_dict.pkl', 'wb') as f:
        pkl.dump(words_dict, f)

    for metadata_name, pickle_name in (('cats.txt', 'cats.pkl'), ('authors.txt', 'authors.pkl')):
        row_values = read_metadata(nltk_input_dir / metadata_name, file_ids)
        # Nothing is written when the metadata file is absent or yields no values.
        if row_values:
            with open(output_input_dir / pickle_name, 'wb') as f:
                pkl.dump(row_values, f)
 

### Twenty News

In [30]:
# Pickle the IDs, dictionary and any category/author metadata for the
# 20-news corpus ('TwentyNewsClean' raw data, '20news4' processed data).
create_pickled_id_cat_word_metadata(
    nltk_input_dir=pathlib.Path.home() / 'Downloads' / 'words-only' / 'Raw Data' / 'TwentyNewsClean',
    processed_input_dir=DATASET_DIR / 'words-only' / '20news4'
)

In [31]:
# Same conversion for the Reuters corpus.
create_pickled_id_cat_word_metadata(
    nltk_input_dir=pathlib.Path.home() / 'Downloads' / 'words-only' / 'Raw Data' / 'reuters',
    processed_input_dir=DATASET_DIR / 'words-only' / 'reuters'
)

In [32]:
# Same conversion for the NIPS corpus.
create_pickled_id_cat_word_metadata(
    nltk_input_dir=pathlib.Path.home() / 'Downloads' / 'words-only' / 'Raw Data' / 'nips',
    processed_input_dir=DATASET_DIR / 'words-only' / 'nips'
)

In [None]:
# Locations of the 20-news word-matrix pickle and its dictionary pickle.
TWENTY_NEWS_DIR = DATASET_DIR / '20news4'
TWENTY_NEWS_WORDS_FILE = TWENTY_NEWS_DIR / 'words.pkl'
TWENTY_NEWS_DICT_FILE = TWENTY_NEWS_DIR / 'dict.pkl'

# TODO: creating categories.pkl when it does not exist (the original note
# here) is not implemented; this cell only checks that the inputs exist.

# Fail fast, naming the offending path, if an expected input is missing.
for p in [TWENTY_NEWS_WORDS_FILE, TWENTY_NEWS_DICT_FILE]:
    assert p.exists(), p
    


In [None]:
def twenty_news() -> RawData:
    """Load the 20-news word matrix and its dictionary.

    The words file is read as a gzip-compressed pickle and the dictionary file
    as a plain pickle.

    Returns:
        A ``RawData`` whose only populated field is ``words``, labelled with
        the dictionary.
    """
    # Context manager so the gzip stream is closed once loading finishes; the
    # original left the GzipFile open.
    with gzip.open(TWENTY_NEWS_WORDS_FILE, 'rb') as f:
        words = cpkl.load(f)
    with open(TWENTY_NEWS_DICT_FILE, 'rb') as f:
        dic = pkl.load(f)
    return RawData(words=LabelledMatrix(values=words, labels=dic))

_t = twenty_news()
str(_t)

In [None]:
# Locations of the pickled ACL inputs opened by acl().
ACL_DIR = DATASET_DIR / 'ACL' / 'ACL.100.clean'
ACL_WORDS_FILE = ACL_DIR / 'words-freq.pkl'
ACL_DICT_FILE = ACL_DIR / 'words-freq-dict.pkl'
ACL_REF_FILE = ACL_DIR / 'ref.pkl'
ACL_FEATS_FILE = ACL_DIR / 'feats.pkl'
ACL_FEATS_DICT_FILE = ACL_DIR / 'feats_dict.pkl'
ACL_DOC_IDS_FILE = ACL_DIR / 'doc_ids.pkl'

# Fail fast, naming the offending path, if an input is missing. The doc-ID
# file is checked too because acl() opens it along with the others.
for p in [ACL_WORDS_FILE, ACL_DICT_FILE, ACL_REF_FILE, ACL_FEATS_FILE, ACL_FEATS_DICT_FILE,
          ACL_DOC_IDS_FILE]:
    assert p.exists(), p

In [None]:
def acl() -> RawData:
    """Load the ACL words, references, features and document IDs.

    Each input is unpickled in turn, and the words, references and features
    are asserted to have as many rows as there are document IDs.

    Returns:
        A ``RawData`` with ``words`` and ``feats`` labelled by their
        dictionaries, ``cites`` labelled by the document IDs, and the document
        IDs as ``row_labels``.
    """
    sources = (
        ACL_WORDS_FILE, ACL_DICT_FILE,
        ACL_REF_FILE,
        ACL_FEATS_FILE, ACL_FEATS_DICT_FILE,
        ACL_DOC_IDS_FILE,
    )
    loaded = []
    for source in sources:
        with open(source, 'rb') as handle:
            loaded.append(pkl.load(handle))
    words, words_dict, refs, feats, feats_dict, doc_ids = loaded

    assert words.shape[0] == refs.shape[0] == feats.shape[0] == len(doc_ids)

    word_matrix = LabelledMatrix(values=words, labels=words_dict)
    feat_matrix = LabelledMatrix(values=feats, labels=feats_dict)
    # Labelling the citations with the document IDs also makes LabelledMatrix
    # check that there is one citation column per document ID.
    cite_matrix = LabelledMatrix(values=refs, labels=doc_ids)
    return RawData(words=word_matrix, feats=feat_matrix, cites=cite_matrix, row_labels=doc_ids)

_a = acl()
str(_a)

In [None]:
# Locations of the pickled Arxiv inputs opened by arxiv().
ARXIV_DIR = DATASET_DIR / 'Arxiv'
ARXIV_CITES_FILE = ARXIV_DIR / 'cites.pkl'
ARXIV_CITES_DICT_FILE = ARXIV_DIR / 'cites.py.pkl'
ARXIV_WORDS_FILE = ARXIV_DIR / 'words.pkl'
ARXIV_FEATS_FILE = ARXIV_DIR / 'feats.pkl'
ARXIV_FEATS_DICT_FILE = ARXIV_DIR / 'feats.py.pkl'
ARXIV_DOC_IDS_FILE = ARXIV_DIR / 'doc_ids.pkl'
# No word dictionary has been located for Arxiv yet (see "Outstanding issues").
ARXIV_DICT_FILE = None

# Fail fast, naming the offending path, if an input is missing. Unset (None)
# entries are skipped: the original list contained ARXIV_DICT_FILE, so the
# loop always died with AttributeError on None.exists(). The cites/feats
# dictionaries and the doc-ID file are checked as well since arxiv() opens them.
for p in [ARXIV_CITES_FILE, ARXIV_CITES_DICT_FILE, ARXIV_WORDS_FILE,
          ARXIV_FEATS_FILE, ARXIV_FEATS_DICT_FILE, ARXIV_DOC_IDS_FILE,
          ARXIV_DICT_FILE]:
    if p is None:
        continue
    assert p.exists(), p

In [None]:
def arxiv() -> RawData:
    """Load the Arxiv words, citations, features and document IDs.

    Returns:
        A ``RawData`` with unlabelled ``words`` (no word dictionary is
        loaded), ``feats`` and ``cites`` labelled by their dictionaries, and
        the document IDs as ``row_labels``.

    Raises:
        AssertionError: If the words, citations and features do not all have
            as many rows as there are document IDs.
    """
    with open(ARXIV_WORDS_FILE, 'rb') as f:
        words = pkl.load(f)
    # No dictionary is loaded for the words, so the word matrix stays unlabelled.
    words_dict = None

    with open(ARXIV_CITES_FILE, 'rb') as f:
        refs = pkl.load(f)
    with open(ARXIV_CITES_DICT_FILE, 'rb') as f:
        refs_dict = pkl.load(f)

    with open(ARXIV_FEATS_FILE, 'rb') as f:
        feats = pkl.load(f)
    with open(ARXIV_FEATS_DICT_FILE, 'rb') as f:
        feats_dict = pkl.load(f)

    with open(ARXIV_DOC_IDS_FILE, 'rb') as f:
        doc_ids = pkl.load(f)

    # The two message fragments previously ran together with no separator
    # between feats.shape and len(doc_ids).
    assert words.shape[0] == refs.shape[0] == feats.shape[0] == len(doc_ids), \
        f"words.shape = {words.shape}, refs.shape={refs.shape}, feats.shape={feats.shape}, " \
        f"len(doc_ids) = {len(doc_ids)}"

    return RawData(
        words=LabelledMatrix(values=words, labels=words_dict),
        feats=LabelledMatrix(values=feats, labels=feats_dict),
        cites=LabelledMatrix(values=refs, labels=refs_dict),
        row_labels=doc_ids
    )

_x = arxiv()
str(_x)

In [None]:
# Locations of the pickled NIPS matrices.
NIPS_DIR = DATASET_DIR / 'NIPS'
NIPS_WORDS_FILE = NIPS_DIR / 'words.pkl'
NIPS_DICT_FILE = NIPS_DIR / 'dict.pkl'
NIPS_AUTHORS_FILE = NIPS_DIR / 'authors.pkl'
NIPS_AUTHORS_FILE_4 = NIPS_DIR / 'authors4.pkl'
NIPS_REFS_FILE = NIPS_DIR / 'refs.pkl'
NIPS_REFS_FILE_4 = NIPS_DIR / 'refs4.pkl'
NIPS_CATS_FILE = NIPS_DIR / 'cats.pkl'

# The author, category and reference dictionaries are read from their
# ".patched" variants (the original note attributes them to the "dicts.py" file).
NIPS_AUTHORS_DICT_FILE = NIPS_DIR / 'authors.dict.patched'
NIPS_CATS_DICT_FILE = NIPS_DIR / 'cats.dict.patched'
NIPS_REFS_DICT_FILE = NIPS_DIR / 'refs.dict.patched'

# Fail fast, naming the offending path, if an expected input is missing.
for p in (
    NIPS_WORDS_FILE, NIPS_DICT_FILE,
    NIPS_AUTHORS_FILE, NIPS_AUTHORS_DICT_FILE, NIPS_AUTHORS_FILE_4,
    NIPS_REFS_FILE, NIPS_REFS_DICT_FILE, NIPS_REFS_FILE_4,
    NIPS_CATS_FILE, NIPS_CATS_DICT_FILE,
):
    assert p.exists(), p

In [None]:
def nips() -> RawData:
    """Load the NIPS words plus author, category and reference features.

    The author, category and reference matrices are stacked horizontally, in
    that order, into a single feature matrix. Their dictionary entries are
    prefixed with ``auth-``, ``cat-`` and ``ref-`` respectively and
    concatenated in the same order to label its columns.

    Returns:
        A ``RawData`` with ``words`` labelled by the word dictionary and
        ``feats`` holding the stacked features; no other field is set.
    """
    with open(NIPS_WORDS_FILE, 'rb') as f:
        words = pkl.load(f)
    with open(NIPS_DICT_FILE, 'rb') as f:
        words_dict = pkl.load(f)
    # The log messages below previously ended with an unbalanced ")".
    logging.info(f"words.shape = {words.shape}, len(words_dict) = {len(words_dict)}")

    with open(NIPS_AUTHORS_FILE, 'rb') as f:
        authors = pkl.load(f)
    with open(NIPS_AUTHORS_DICT_FILE, 'rb') as f:
        authors_dict = pkl.load(f)
    authors_dict = [f'auth-{a}' for a in authors_dict]
    logging.info(f"authors.shape = {authors.shape}, len(authors_dict) = {len(authors_dict)}")

    with open(NIPS_CATS_FILE, 'rb') as f:
        cats = pkl.load(f)
    with open(NIPS_CATS_DICT_FILE, 'rb') as f:
        cats_dict = pkl.load(f)
    cats_dict = [f'cat-{c}' for c in cats_dict]
    logging.info(f"cats.shape = {cats.shape}, len(cats_dict) = {len(cats_dict)}")

    # NOTE(review): references come from the "4" variant (NIPS_REFS_FILE_4)
    # while authors come from NIPS_AUTHORS_FILE, not NIPS_AUTHORS_FILE_4 -
    # confirm this mix is intended.
    with open(NIPS_REFS_FILE_4, 'rb') as f:
        refs = pkl.load(f)
    with open(NIPS_REFS_DICT_FILE, 'rb') as f:
        refs_dict = pkl.load(f)
    refs_dict = [f'ref-{r}' for r in refs_dict]
    logging.info(f"refs.shape = {refs.shape}, len(refs_dict) = {len(refs_dict)}")

    # Column order of the stacked matrix must match the label order below;
    # LabelledMatrix then checks that the total label count equals the
    # number of stacked columns.
    feats = ssp.hstack((authors, cats, refs))
    feats_dict = authors_dict + cats_dict + refs_dict

    return RawData(
        words=LabelledMatrix(values=words, labels=words_dict),
        feats=LabelledMatrix(values=feats, labels=feats_dict)
    )

_n = nips()
str(_n)

In [None]:
# Locations of the Reuters word-matrix pickle and its dictionary pickle.
REUTERS_DIR = DATASET_DIR / 'reuters'
REUTERS_WORDS_FILE = REUTERS_DIR / 'W.pkl'
REUTERS_DICT_FILE = REUTERS_DIR / 'dict.pkl'

# Fail fast, naming the offending path, if the directory or an input is missing.
for p in [REUTERS_DIR, REUTERS_WORDS_FILE, REUTERS_DICT_FILE]:
    assert p.exists(), p

In [None]:
def reuters() -> RawData:
    """Load the Reuters word matrix and its dictionary from their pickles.

    Returns:
        A ``RawData`` whose only populated field is ``words``, labelled with
        the dictionary.
    """
    contents = []
    for path in (REUTERS_WORDS_FILE, REUTERS_DICT_FILE):
        with open(path, 'rb') as stream:
            contents.append(pkl.load(stream))
    matrix, vocabulary = contents

    labelled_words = LabelledMatrix(values=matrix, labels=vocabulary)
    return RawData(words=labelled_words)

_r = reuters()
str(_r)

In [None]:
# Inspect the `.data` attribute of row 95 of the Reuters word matrix
# (presumably the stored non-zero entries, if the matrix is sparse - confirm).
_r.words.values[95,:].data

In [None]:
# Display the loaded Reuters RawData tuple.
_r

In [None]:
# For each of the first 100 rows, print the zero-padded row index followed by
# percentiles of that row's `.data` values, cast to int32.
# `i/0.1 for i in range(11)` produces the percentile points 0, 10, ..., 100.
for r in range(100):
    print(f"{r:03d}  {np.percentile(a=_r.words.values[r,:].data, q=[i/0.1 for i in range(11)]).astype(np.int32)}")

In [None]:
# Mean along axis 0 of the Reuters word matrix, i.e. one mean per column.
_r.words.values.mean(axis=0)