In [3]:
import os

import nltk

# List of all the testimony files.
testimonies = [''.join(['./interviews/',f]) for f in os.listdir('./interviews/') if f[-4:] == '.txt']

nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [18]:
# TFIDF tutorial from: http://www.bogotobogo.com//python/NLTK/tf_idf_with_scikit-learn_NLTK.php
# To be somewhat more "Pythonic", let us use generator expressions so as not to eat *all* the RAM.

import string

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem.porter import PorterStemmer

def get_tokens(txt):
    stemmer = PorterStemmer()
    return (t for t in (stemmer.stem(token) for token in nltk.word_tokenize(txt)) if len(t)>2)
    
tfidf = TfidfVectorizer(tokenizer=get_tokens, stop_words='english')

all_the_docs = lambda: (open(f,'rb').read().lower().translate(None, string.punctuation) for f in testimonies)

%time tfs = tfidf.fit_transform(all_the_docs())
 

CPU times: user 2min 54s, sys: 928 ms, total: 2min 55s
Wall time: 2min 59s


In [19]:
import random

random_testimony = lambda: open(random.choice(testimonies),'rb').read().lower().translate(None, string.punctuation)

txt = random_testimony()
response = tfidf.transform([txt])

feature_names = tfidf.get_feature_names()
terms = [ (feature_names[col],response[0, col]) for col in response.nonzero()[1]]
top_terms = sorted(terms,key= lambda t: -t[1])[0:20]

for t in top_terms:
    print t[0] + ' - ' + t[1].__str__()


thi - 0.347922024504
becaus - 0.256511146634
rg500300180 - 0.255905444596
didnt - 0.248303708941
told - 0.178530036962
went - 0.158607257005
dont - 0.143836894456
came - 0.131310917203
kill - 0.127708390908
want - 0.123942762512
andand - 0.122919300209
brother - 0.119415638251
babi - 0.118350679152
know - 0.115880422066
say - 0.105607037982
time - 0.103645170859
tyczyn - 0.102362177839
sister - 0.0947885644338
goy - 0.0944377896587
pollack - 0.0909497218681


In [21]:
# Grab the proper nouns from a testimony:
txt2 = random_testimony()
tokens2 = nltk.word_tokenize(txt2)
tags2 = nltk.pos_tag(tokens2)
entities2 = nltk.chunk.ne_chunk(tags2)

[e[0] for e in entities2 if e[1] == 'NNP']
    

['mengele']

In [22]:
print txt2


united states holocaust memorial museum

interview with erich kulka
june 8 1990
rg500300119



preface
the following oral history testimony is the result of a videotaped interview with erich
kulka conducted by linda kuzmack on june 8 1990 on behalf of the united states holocaust
memorial museum the interview took place in washington dc and is part of the united states
holocaust memorial museum’s collection of oral testimonies rights to the interview are held by the
united states holocaust memorial museum
the reader should bear in mind that this is a verbatim transcript of spoken rather than
written prose this transcript has been neither checked for spelling nor verified for accuracy and
therefore it is possible that there are errors as a result nothing should be quoted or used from this
transcript without first checking it against the taped interview



erich kulka
june 8 1990

q

mr kulka could you identify yourself give me your name and where and when you were
born please

a

i was 

In [1]:
# Gensim tutorial: http://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html
import itertools

from gensim.utils import smart_open, simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def gtokenize(text):
    return [token for token in simple_preprocess(text) if token not in STOPWORDS and len(token) > 2]

def iter_testimonies():
    for t in testimonies:
        title = t.split('/')[-1][:-4]
        #txt = open(t,'rb').read().lower().translate(None, string.punctuation)
        for lin in open(t,'rb').readlines():
            # Ignore lines shorter than 20 chars.
            if len(lin) > 15:
                txt = lin.lower().translate(None, string.punctuation)
                tokens = gtokenize(txt)
                yield title, tokens
            else:
                continue
        
stream = iter_testimonies()

doc_stream = lambda: (tokens for _, tokens in iter_testimonies())


In [5]:
import string

import gensim
%time id2word_testimonies = gensim.corpora.Dictionary(doc_stream())
print(id2word_testimonies)

CPU times: user 11min 16s, sys: 8.3 s, total: 11min 24s
Wall time: 12min
Dictionary(160281 unique tokens: [u'miniszt\xe9rium', u'unpopulated', u'nunnery', u'woodi', u'daytraveling']...)


In [7]:
id2word_testimonies.save('./testimony_dictionary')

In [8]:
# ignore words that appear in less than 20 documents or more than 10% documents
id2word_testimonies.filter_extremes(no_below=20, no_above=0.1)
print(id2word_testimonies)

Dictionary(15749 unique tokens: [u'raining', u'writings', u'deferment', u'progressively', u'sonja']...)


In [9]:
# Bag of words...
#bow = id2word_testimonies.doc2bow(gtokenize(random_testimony()))
#print(bow)[:10]

class TestimonyCorpus(object):
    def __init__(self, dictionary, clip_docs=None):
        self.dictionary = dictionary
        self.clip_docs = clip_docs

    def __iter__(self):
        self.titles = []
        for title, tokens in itertools.islice(iter_testimonies(), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)
            
    def __len__(self):
        return self.clip_docs            
        

In [10]:
testimony_corpus = TestimonyCorpus(id2word_testimonies)
vector = next(iter(testimony_corpus))
# what is the most common word in the first testimony?
most_index, most_count = max(vector, key=lambda (word_index, count): count)
print(id2word_testimonies[most_index], most_count)

(u'states', 1)


In [11]:
%time gensim.corpora.MmCorpus.serialize('./testimonies_bow.mm', testimony_corpus)

CPU times: user 14min 19s, sys: 7.7 s, total: 14min 27s
Wall time: 14min 47s


In [None]:
# ...let's train an LDA transformation model...

mm_corpus = gensim.corpora.MmCorpus('./testimonies_bow.mm')
print(mm_corpus)

%time lda_model = gensim.models.LdaModel(mm_corpus, num_topics=10, id2word=id2word_testimonies, passes=4)

In [67]:
lda_model.print_topics(-1)  # print a few most important words for each LDA topic

[(0,
  u'0.078*jews + 0.065*lot + 0.040*somebody + 0.039*kind + 0.035*working + 0.030*people + 0.028*job + 0.026*city + 0.016*soldiers + 0.016*weeks'),
 (1,
  u'0.058*camp + 0.037*different + 0.030*time + 0.027*people + 0.027*group + 0.022*country + 0.022*sort + 0.021*world + 0.019*number + 0.018*close'),
 (2,
  u'0.102*didnt + 0.067*years + 0.042*know + 0.035*old + 0.030*worked + 0.027*stay + 0.023*stayed + 0.022*friend + 0.021*wife + 0.021*bad'),
 (3,
  u'0.100*got + 0.078*yes + 0.065*took + 0.043*happened + 0.042*thing + 0.042*away + 0.041*maybe + 0.039*later + 0.037*know + 0.031*food'),
 (4,
  u'0.086*things + 0.076*interview + 0.056*man + 0.044*says + 0.034*husband + 0.033*killed + 0.032*met + 0.022*village + 0.015*thank + 0.014*early'),
 (5,
  u'0.099*jewish + 0.047*people + 0.039*friends + 0.037*polish + 0.035*big + 0.034*states + 0.031*united + 0.030*army + 0.026*coming + 0.025*person'),
 (6,
  u'0.055*germans + 0.043*germany + 0.035*days + 0.032*night + 0.029*heard + 0.022*bet

In [68]:
#Train a TFIDF model, and then train Latent Semantic Analysis on top of TFIDF...
%time tfidf_model = gensim.models.TfidfModel(mm_corpus, id2word=id2word_testimonies)
%time lsi_model = gensim.models.LsiModel(tfidf_model[mm_corpus], id2word=id2word_testimonies, num_topics=20)


CPU times: user 22.3 s, sys: 52 ms, total: 22.3 s
Wall time: 22.8 s
CPU times: user 3min 59s, sys: 2.63 s, total: 4min 2s
Wall time: 4min 7s


In [69]:
# cache the transformed corpora to disk, for use in later notebooks
%time gensim.corpora.MmCorpus.serialize('./testimony_tfidf.mm', tfidf_model[mm_corpus])
%time gensim.corpora.MmCorpus.serialize('./testimony_lsa.mm', lsi_model[tfidf_model[mm_corpus]])

CPU times: user 1min 13s, sys: 920 ms, total: 1min 14s
Wall time: 1min 16s
CPU times: user 3min 44s, sys: 3.58 s, total: 3min 48s
Wall time: 3min 54s


In [70]:
tfidf_corpus = gensim.corpora.MmCorpus('./testimony_tfidf.mm')
# `tfidf_corpus` is now exactly the same as `tfidf_model[wiki_corpus]`
print(tfidf_corpus)

lsi_corpus = gensim.corpora.MmCorpus('./testimony_lsa.mm')
# and `lsi_corpus` now equals `lsi_model[tfidf_model[wiki_corpus]]` = `lsi_model[tfidf_corpus]`
print(lsi_corpus)

MmCorpus(1696851 documents, 15749 features, 7404942 non-zero entries)
MmCorpus(1696851 documents, 20 features, 33492161 non-zero entries)


In [71]:
# select top 50 words for each of the 20 LDA topics
top_words = [[word for word, _ in lda_model.show_topic(topicno, topn=50)] for topicno in range(lda_model.num_topics)]
print(top_words)

[[u'jews', u'lot', u'somebody', u'kind', u'working', u'people', u'job', u'city', u'soldiers', u'weeks', u'matter', u'door', u'beautiful', u'england', u'immediately', u'organization', u'car', u'problem', u'music', u'march', u'human', u'cross', u'played', u'time', u'came', u'air', u'recall', u'red', u'think', u'occupied', u'uniform', u'czechoslovakia', u'checking', u'naturally', u'january', u'rich', u'major', u'good', u'course', u'marched', u'blood', u'childhood', u'gentile', u'accept', u'mountains', u'center', u'mean', u'terms', u'latvian', u'bar'], [u'camp', u'different', u'time', u'people', u'group', u'country', u'sort', u'world', u'number', u'close', u'hitler', u'looking', u'came', u'walked', u'helped', u'needed', u'anymore', u'barracks', u'sitting', u'change', u'hours', u'arrived', u'october', u'masters', u'case', u'period', u'moment', u'short', u'rabbi', u'evening', u'large', u'usually', u'white', u'prison', u'example', u'normal', u'longer', u'think', u'april', u'whats', u'ended', 

In [None]:
import pyLDAvis.gensim

vis_data = pyLDAvis.gensim.prepare(lda_model, mm_corpus, id2word_testimonies)

In [58]:
pyLDAvis.display(vis_data)