In [1]:
import pymongo
from redis import Redis

# Shared client handles used by every cell below.
# NOTE(review): 'this_redis' / 'this_mongo' look like container hostnames — confirm.
REDIS = Redis(host='this_redis')
mongo_client = pymongo.MongoClient(host='this_mongo')
corpus_db = mongo_client['corpus']

In [2]:
cd ..

/home/jovyan


In [3]:
import pandas as pd

In [4]:
# Load the pickled corpus; the next cell converts it to records with .to_dict.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
corpus = pd.read_pickle('data/corpus.p')

In [5]:
# One dict per row, which is the shape insert_many() receives in the next cell.
corpus_df_records = corpus.to_dict(orient='records')

In [6]:
# Start from an empty collection so re-running this cell does not duplicate documents.
# NOTE(review): Redis is not cleared here, while later cells append to Redis lists —
# confirm whether a full re-run should also reset the Redis keys.
corpus_db.documents.drop()
corpus_db.documents.insert_many(corpus_df_records)

<pymongo.results.InsertManyResult at 0x7ff8e00de750>

In [7]:
# Sanity check: number of documents now stored in the collection.
# count_documents({}) replaces Collection.count(), which is deprecated and
# removed in PyMongo 4.
corpus_db.documents.count_documents({})

170

In [8]:
# Peek at one document that has no 'tokens' field yet.
corpus_db.documents.find_one({'tokens' : {'$exists': False}})

{'_id': ObjectId('5a91c454ce14f80061525963'),
 'sentence': 'In the great green room There was a telephone And a red balloon',
 'title': 'Goodnight, Moon'}

# `MAPPER`

In [9]:
def tokenize(document):
    """Split ``document`` into whitespace-separated tokens.

    Commas and periods are deleted before splitting; case and every other
    character are left untouched.
    """
    without_punctuation = document.translate(str.maketrans('', '', ',.'))
    return without_punctuation.split()

def MAPPER(document):
    """Lazily emit a ``(word, 1)`` pair for each token of ``document``."""
    yield from ((token, 1) for token in tokenize(document))

In [10]:
# Try MAPPER on the sentence of one document that has no 'tokens' field yet.
doc = corpus_db.documents.find_one({'tokens' : {'$exists': False}})
list(MAPPER(doc['sentence']))

[('In', 1),
 ('the', 1),
 ('great', 1),
 ('green', 1),
 ('room', 1),
 ('There', 1),
 ('was', 1),
 ('a', 1),
 ('telephone', 1),
 ('And', 1),
 ('a', 1),
 ('red', 1),
 ('balloon', 1)]

In [11]:
# Tokenize every document that has not been processed yet, one at a time.
# Each pass stores the MAPPER output and marks the document 'tokenized', so
# re-running this cell skips documents that were already handled.
unprocessed_filter = {'processed' : {'$exists': False}}
doc = corpus_db.documents.find_one(unprocessed_filter)
while doc:
    id_filter = { '_id' : doc['_id'] }
    tokens = list(MAPPER(doc['sentence']))
    update = { '$set' : {'tokens' : tokens, 'processed' : 'tokenized'} }
    corpus_db.documents.update_one(id_filter, update)
    # Continue with the same filter that started the loop, so selection and
    # termination are governed by a single condition.
    doc = corpus_db.documents.find_one(unprocessed_filter)

In [12]:
# Inspect a stored document to confirm 'tokens' and 'processed' were written.
# The (word, 1) tuples come back from Mongo as two-element lists.
corpus_db.documents.find_one()

{'_id': ObjectId('5a91c454ce14f80061525963'),
 'processed': 'tokenized',
 'sentence': 'In the great green room There was a telephone And a red balloon',
 'title': 'Goodnight, Moon',
 'tokens': [['In', 1],
  ['the', 1],
  ['great', 1],
  ['green', 1],
  ['room', 1],
  ['There', 1],
  ['was', 1],
  ['a', 1],
  ['telephone', 1],
  ['And', 1],
  ['a', 1],
  ['red', 1],
  ['balloon', 1]]}

# `COLLECTOR`

In [13]:
def COLLECTOR(document, vocabulary):
    """Push every token of ``document`` into Redis.

    For each ``[word, count]`` pair in ``document['tokens']`` the word is added
    to the Redis set named ``vocabulary`` and the count is appended to the
    Redis list keyed by the word itself.
    """
    # Read from the ``document`` argument rather than a notebook-level ``doc``
    # global, so the function processes what the caller actually passed in.
    for word, count in document['tokens']:
        REDIS.sadd(vocabulary, word)
        REDIS.rpush(word, count)

In [14]:
# Documents ready for the collection step are those marked 'tokenized'.
tokenized_filter = {'processed' : 'tokenized'}
# Fetch one such document and display it; the loop in the following cell
# processes documents matching this filter.
doc = corpus_db.documents.find_one(tokenized_filter)
doc

{'_id': ObjectId('5a91c454ce14f80061525963'),
 'processed': 'tokenized',
 'sentence': 'In the great green room There was a telephone And a red balloon',
 'title': 'Goodnight, Moon',
 'tokens': [['In', 1],
  ['the', 1],
  ['great', 1],
  ['green', 1],
  ['room', 1],
  ['There', 1],
  ['was', 1],
  ['a', 1],
  ['telephone', 1],
  ['And', 1],
  ['a', 1],
  ['red', 1],
  ['balloon', 1]]}

In [15]:
# Feed every 'tokenized' document to COLLECTOR, then mark it 'counted' so it
# is not picked up again.
# NOTE(review): the Redis writes happen before the Mongo status update, so a
# failure between the two would re-count that document on the next run.
update = { '$set' : {'processed' : 'counted'} }  # same for every document
# Fetch the starting document here instead of relying on a `doc` left over
# from another cell.
doc = corpus_db.documents.find_one(tokenized_filter)
while doc:
    id_filter = { '_id' : doc['_id'] }
    COLLECTOR(doc, 'corpus_vocab')
    corpus_db.documents.update_one(id_filter, update)
    doc = corpus_db.documents.find_one(tokenized_filter)

In [16]:
# Read back every distinct word collected into the 'corpus_vocab' Redis set.
vocabulary = REDIS.smembers('corpus_vocab')
# Members come back as bytes; Redis sets are unordered, so this sample is arbitrary.
list(vocabulary)[:5]

[b'year', b'you', b"I've", b'out', b'apples']

In [17]:
# Show the raw list stored under each of five sample words: one entry per
# occurrence pushed by COLLECTOR.
for word in list(vocabulary)[:5]:
    print(REDIS.lrange(word, 0, -1))

[b'1', b'1']
[b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'1']
[b'1', b'1', b'1', b'1', b'1']
[b'1', b'1']
[b'1', b'1', b'1', b'1', b'1', b'1']


# `REDUCER`

In [18]:
def REDUCER(word):
    """Return the total of all counts stored in the Redis list keyed by ``word``."""
    stored_counts = REDIS.lrange(word, 0, -1)  # 0..-1 selects the whole list
    return sum(map(int, stored_counts))

In [19]:
# Pair each vocabulary word (decoded from bytes to str) with its reduced total.
word_counts = [(word.decode(), REDUCER(word)) for word in vocabulary]

In [20]:
# Order in place by count (element 1 of each pair), highest first.
word_counts.sort(reverse=True, key=lambda pair: pair[1])

In [21]:
# Ten most frequent tokens. Counts are case-sensitive because tokenize() does
# not lowercase, so e.g. 'and' and 'And' are tallied separately.
word_counts[:10]

[('and', 98),
 ('the', 97),
 ('a', 56),
 ('And', 41),
 ('said', 31),
 ('to', 30),
 ('he', 30),
 ('was', 24),
 ('tree', 21),
 ('of', 20)]