In [1]:
import shorttext
import numpy as np
import pandas as pd

import re

In [2]:
inaugural = pd.read_csv('inaugural.csv')

In [3]:
stem = lambda word: shorttext.utils.textpreprocessing.stemword(word)
pipeline = [lambda s: re.sub('[^\w\s]', '', s),
            lambda s: re.sub('[\d]', '', s),
            lambda s: s.lower(),
            lambda s: ' '.join(map(stem, shorttext.utils.tokenize(s)))
           ]
txtpreproceesor = shorttext.utils.text_preprocessor(pipeline)

In [4]:
docids = list(inaugural['yrprez'])
corpus = [txtpreproceesor(speech).split(' ') for speech in inaugural['speech']]

The basic difference between the term-document matrix and document term matrix is that the weighting of the term-document matrix is based on the term frequency (TF) and in the document term matrix the weighting is based on term frequency-inverse document frequency(TF-IDF).

In [5]:
dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=False)

In [6]:
dtm.get_doc_tokens('2009-Obama')

{'': 44.0,
 'a': 58.0,
 'act': 1.0,
 'advanc': 2.0,
 'again': 2.0,
 'against': 1.0,
 'all': 9.0,
 'american': 5.0,
 'an': 3.0,
 'and': 111.0,
 'ar': 22.0,
 'assur': 1.0,
 'at': 8.0,
 'be': 12.0,
 'been': 8.0,
 'befor': 4.0,
 'between': 3.0,
 'birth': 2.0,
 'bless': 2.0,
 'bring': 1.0,
 'but': 20.0,
 'by': 8.0,
 'call': 2.0,
 'can': 13.0,
 'care': 3.0,
 'charact': 1.0,
 'charter': 2.0,
 'chosen': 1.0,
 'citizen': 1.0,
 'civil': 1.0,
 'collect': 1.0,
 'commun': 1.0,
 'confid': 2.0,
 'conflict': 2.0,
 'consequ': 1.0,
 'consid': 1.0,
 'continu': 1.0,
 'could': 1.0,
 'countri': 2.0,
 'crisi': 4.0,
 'dai': 5.0,
 'decid': 1.0,
 'decis': 1.0,
 'declin': 1.0,
 'defin': 2.0,
 'depend': 2.0,
 'destini': 1.0,
 'dure': 1.0,
 'duti': 2.0,
 'economi': 3.0,
 'effect': 1.0,
 'equal': 1.0,
 'everi': 8.0,
 'expedi': 1.0,
 'ey': 2.0,
 'faith': 3.0,
 'far': 4.0,
 'favor': 1.0,
 'fellow': 1.0,
 'fill': 1.0,
 'final': 1.0,
 'for': 23.0,
 'forc': 3.0,
 'former': 1.0,
 'foundat': 1.0,
 'free': 2.0,
 'from': 5.

In [7]:
dtm.get_token_occurences(stem('change'))

{'1801-Jefferson': 1.0,
 '1805-Jefferson': 2.0,
 '1809-Madison': 1.0,
 '1821-Monroe': 3.0,
 '1837-VanBuren': 1.0,
 '1841-Harrison': 4.0,
 '1853-Pierce': 1.0,
 '1861-Lincoln': 4.0,
 '1873-Grant': 2.0,
 '1877-Hayes': 3.0,
 '1881-Garfield': 3.0,
 '1889-Harrison': 1.0,
 '1897-McKinley': 3.0,
 '1905-Roosevelt': 1.0,
 '1909-Taft': 10.0,
 '1913-Wilson': 4.0,
 '1917-Wilson': 1.0,
 '1921-Harding': 2.0,
 '1925-Coolidge': 1.0,
 '1929-Hoover': 1.0,
 '1933-Roosevelt': 2.0,
 '1937-Roosevelt': 5.0,
 '1949-Truman': 1.0,
 '1953-Eisenhower': 2.0,
 '1957-Eisenhower': 4.0,
 '1961-Kennedy': 1.0,
 '1965-Johnson': 12.0,
 '1969-Nixon': 1.0,
 '1977-Carter': 1.0,
 '1985-Reagan': 4.0,
 '1989-Bush': 2.0,
 '1993-Clinton': 11.0,
 '1997-Clinton': 2.0,
 '2001-Bush': 1.0,
 '2009-Obama': 2.0}

In [8]:
dtm.get_doc_frequency(stem('change'))

35