In [22]:
import os
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.corpora import Dictionary


In [23]:
def preprocess(docs, no_below=20, no_above=0.7):
    """Tokenize, clean and lemmatize raw documents, then build a filtered bag-of-words corpus.

    The caller's ``docs`` list is left untouched, so the cell that calls this
    function can be re-run without first reloading the raw text.

    Args:
        docs: iterable of documents, each a single string.
        no_below: forwarded to ``Dictionary.filter_extremes``; tokens that occur
            in fewer than this many documents are dropped.
        no_above: forwarded to ``Dictionary.filter_extremes``; tokens that occur
            in more than this fraction of the documents are dropped.

    Returns:
        A ``(corpus, dictionary)`` tuple. ``corpus`` holds one
        ``dictionary.doc2bow`` result per input document, in input order;
        ``dictionary`` is the gensim ``Dictionary`` after frequency filtering.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()

    # TODO: should words broken across line breaks be rejoined into a single
    # word before tokenizing? Search for an existing solution first.
    tokenized_docs = []
    for doc in docs:
        # Lowercase, then split into runs of word characters.
        tokens = tokenizer.tokenize(doc.lower())
        tokenized_docs.append([
            lemmatizer.lemmatize(token)
            for token in tokens
            # Drop pure numbers (words that merely contain digits are kept)
            # and single-character tokens.
            if not token.isnumeric() and len(token) > 1
        ])

    # Build the vocabulary over the whole corpus, then prune it by document
    # frequency.
    dictionary = Dictionary(tokenized_docs)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    # Re-encode every document against the filtered vocabulary.
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

    return corpus, dictionary


In [51]:
# The integers 22..39 (inclusive) are substituted into the volume_<n>
# directory names below, giving one input directory per entry.
years = [*range(22, 40)]
dirs = list(map('text_data/volume_{}/'.format, years))

In [52]:
# ndocs[i] is the number of documents taken from dirs[i]; docs is the flat
# list of raw document strings, in directory order.
ndocs = []
docs = []
import codecs  # not used in this cell; kept in case another cell relies on it


for d in dirs:
    # os.listdir returns names in arbitrary order, so sort first: otherwise
    # the 1-in-10 sample below can differ between runs and machines.
    fnames = sorted(os.listdir(d))
    # start with 1/10 of the data
    fnames = fnames[::10]
    for fn in fnames:
        with open(os.path.join(d, fn), 'r', encoding='utf-8') as f:
            docs.append(f.read())
    ndocs.append(len(fnames))

In [53]:
# Build the bag-of-words corpus and vocabulary, passing document-frequency
# thresholds of 3 documents (lower) and 40% of documents (upper).
corpus, dictionary = preprocess(
    docs,
    no_below=3,
    no_above=0.4,
)

In [54]:

import logging
from gensim.models import ldaseqmodel
from gensim.models.wrappers.dtmmodel import DtmModel

from gensim.corpora import Dictionary, bleicorpus, textcorpus
import numpy as np
from gensim.matutils import hellinger
import time


In [58]:
# Fit a 5-topic dynamic topic model through gensim's DTM wrapper and record
# how long the fit takes (wall-clock seconds, stored in `dt`).
t0 = time.time()
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Location of the DTM binary. Set the DTM_PATH environment variable to point
# at a different binary; the original hard-coded path remains the default.
dtm_path = os.environ.get("DTM_PATH", "/Users/alex/dtm/bin/dtm-darwin64")
# time_slices=ndocs gives the number of documents per directory, in the same
# order in which the documents were appended to the corpus.
model = DtmModel(
    dtm_path,
    corpus=corpus,
    time_slices=ndocs,
    num_topics=5,
    id2word=dictionary,
    initialize_lda=True,
)

dt = time.time() - t0


In [61]:
# One model.dtm_vis(...) result per time slice, in slice order.
yeardata = [
    model.dtm_vis(corpus, time=slice_idx)
    for slice_idx in range(len(ndocs))
]

In [72]:
import pickle

# Persist the per-slice visualisation data. The context manager guarantees
# the file is flushed and closed even if pickling fails.
with open("output/onetenthfivetopics.p", "wb") as out_file:
    pickle.dump(yeardata, out_file)