In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
import numpy as np
import spacy
from tqdm import tqdm
from collections import Counter

import sys
sys.path.append('..')
from utils import get_windows

In [22]:
MIN_COUNTS = 20
MAX_COUNTS = 100000000# 100 million, basicall keep all high freq words
# words with count < MIN_COUNTS
# and count > MAX_COUNTS
# will be removed

MIN_LENGTH = 15
# minimum document length 
# (number of words)
# after preprocessing

# half the size of the context around a word
HALF_WINDOW_SIZE = 5
# it must be that 2*HALF_WINDOW_SIZE < MIN_LENGTH

In [23]:
nlp = spacy.load('en')

In [24]:
file = '/data/corpus/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00001-of-00100'

with open(file, 'r') as f:
    docs = f.readlines()
   

In [25]:
len(docs)

306068

In [26]:
# store an index with a document
docs = [(i, doc) for i, doc in enumerate(docs)]

In [64]:
def preprocess_1b(docs, nlp, encoder, decoder, min_length, min_counts, max_counts):
    """Tokenize, clean, and encode documents.

    Arguments:
        docs: A list of tuples (index, string), each string is a document.
        nlp: A spaCy object, like nlp = spacy.load('en').
        min_length: An integer, minimum document length.
        min_counts: An integer, minimum count of a word.
        max_counts: An integer, maximum count of a word.

    Returns:
        encoded_docs: A list of tuples (index, list), each list is a document
            with words encoded by integer values.
        word_counts: A list of integers, counts of words that are in decoder.
            word_counts[i] is the number of occurrences of word decoder[i]
            in all documents in docs.
    """

    def clean_and_tokenize(doc):
        text = ' '.join(doc.split())  # remove excessive spaces
        text = nlp(text, tag=True, parse=False, entity=False)
        return [t.lemma_ for t in text
                if t.is_alpha and len(t) > 2 and not t.is_stop]

    tokenized_docs = [(i, clean_and_tokenize(doc)) for i, doc in tqdm(docs)]

    # remove short documents
    n_short_docs = sum(1 for i, doc in tokenized_docs if len(doc) < min_length)
    tokenized_docs = [(i, doc) for i, doc in tokenized_docs if len(doc) >= min_length]
    print('number of removed short documents:', n_short_docs)

    # remove some tokens
    counts = _count_unique_tokens(tokenized_docs)
    tokenized_docs = _remove_tokens(tokenized_docs, counts, min_counts, max_counts)
    n_short_docs = sum(1 for i, doc in tokenized_docs if len(doc) < min_length)
    tokenized_docs = [(i, doc) for i, doc in tokenized_docs if len(doc) >= min_length]
    print('number of additionally removed short documents:', n_short_docs)

    counts = _count_unique_tokens(tokenized_docs)
    word_counts = _create_word_counts(counts, encoder, decoder)

    print('\nminimum word count number:', word_counts[-1])
    print('this number can be less than MIN_COUNTS because of document removal')

    encoded_docs = _encode(tokenized_docs, encoder)
    return encoded_docs, word_counts

def _count_unique_tokens(tokenized_docs):
    tokens = []
    for i, doc in tokenized_docs:
        tokens += doc                       # doc is a list of tokens
    return Counter(tokens)


def _encode(tokenized_docs, encoder):
    return [(i, [x for x in [encoder.get(t, None) for t in doc] if x]) for i, doc in tokenized_docs]     # if encoder does not have t, skip


def _remove_tokens(tokenized_docs, counts, min_counts, max_counts):
    """
    Words with count < min_counts or count > max_counts
    will be removed.
    """
    total_tokens_count = sum(
        count for token, count in counts.most_common()
    )
    print('total number of tokens:', total_tokens_count)

    unknown_tokens_count = sum(
        count for token, count in counts.most_common()
        if count < min_counts or count > max_counts
    )
    print('number of tokens to be removed:', unknown_tokens_count)

    keep = {}
    for token, count in counts.most_common():
        keep[token] = count >= min_counts and count <= max_counts

    return [(i, [t for t in doc if keep[t]]) for i, doc in tokenized_docs]


def _create_word_counts(counts, encoder, decoder):

    total_tokens_count = sum(
        count for token, count in counts.most_common()
    )
    print('total number of tokens:', total_tokens_count)

    length = len(encoder)
    print(length)
    word_counts = [0] * length
 
    i = 0
    for token, count in counts.most_common():
        # counts.most_common() is in decreasing count order
        index = encoder[token]
        print(index, ":", token, ":", count)
        word_counts[index] = count
        
        i += 1
        if i > 5:
            break

    return word_counts


In [38]:
import os
os.getcwd()

import os.path
os.path.isfile('decoder.npy')

True

In [39]:
decoder = np.load('decoder.npy')[()]

In [42]:
encoder = {v: k for k, v in decoder.items()}


In [46]:
len(encoder)

3000000

In [84]:
for i, doc in tqdm(docs):
    if len(doc) < MIN_LENGTH:
        print(doc)

 20%|█▉        | 60343/306068 [00:00<00:00, 299636.77it/s]

Don 't !

Fox , 8 p.m.

Massive !

And going .

437 .

Ducks : Oct .

Kg7 ( 33 ...

TL960 .

( Yeah ! )

Horror show .

Sauna ?

7 at 7 p.m.

GOING UP ?

Grand idea !

Auntie Anne .

Deliveries ?

Both are .

We all did .

7 % .

Now Somalia .

" No ... "

Nor will we .

The media .

( 23.46 ) .

( ECI ) .

And barked .

New or Old ?

The Vandals ?

333 million .

The normal .

Oregon , OK .

12 , 1958 .

Gain of 2 .

2-11 , 2009 .

Lol . "

Eli won .

Jim Lynam .

LOSERS !

Neatly .

I am afraid .

She 's 5 .

Support .

Get in .

Easy folks .

Dusting .

Sun said .

WC .

Déjà what ?

Ankle .

Spare me .

What the ?

The detail .

21,15-17 ) .

Yeah sure !

25 ...

Go on !

193 .

Much cooler .

I wondered .

Up yours !

Some hopes .

In the Nov .

23 , 1986 .

6 , 1983 .

Sauce ?

Dont Pay It !

Vexatious ?

Bag sex .

He is vital .

Right ... ..

My point .

CB radio ?

So ?

" Dammit ! "

Funny ?

Be social .

29 and Feb .

" We had 50 .

ERP ?

Who Cares .

Very nice . "

27 danc

 30%|███       | 92052/306068 [00:00<00:00, 228723.84it/s]

Who cares ? ?

Around 1 p.m.

He cried .

Great fight .

He added .

Let GM die !

Three stars .

Two .

or English .

So listen .

Whew ! 10 .

2 , it said .

The Marlins !

12 : 47 a.m.

30 to Dec .

In Canada .

Isaac asks .

Mont .

TROY , Ala .

It 's cold .

Barney ?

Its a start .

4 : 15 a.m.

My hero !

Thanks Lev !

Rant over .

The best ...

events .

He slowed .

Hi , Gary !

Bad move .

A rumor .

Nazi gold .

Riiight .

Fervor ?

23-24 .

Gentlemen ?

Jeanine !

It was mine .

Enormous .

EDEN , N.Y.

Mike Hart !

Let it die .

Begads !

21 , 1993 .

Prayer list ?

Stay loose .

11 : 21 p.m.

2010 .

19 passes .

" Har-oh ?

I am veiled .

Big salary .

TAX POT ! ! !

But look .

CST ( 2 p.m.

314 .

And He has .

MERION , Pa .

Doubt it .

He 's ready .

It was Kim .

million .

Silences .

Use it . "

Duffy .

Rates vary .

Tries ...

So elegant !

12 : 49 a.m.

Oh , please .

Bin it .

18 and 1 a.m.

Do or die .

Can he dunk ?

11 : 05 GMT .

" Imshi ! "

Ma : No .

No

 49%|████▉     | 149787/306068 [00:00<00:00, 208886.69it/s]

Timeless .

Large House .

At 9p .

We signed .

Mississippi ?

Stardom ?

My apology .

The rabbi ! "

2 , 1928 .

1 , 2008 .

Great upset ?

Nahhh .

Treatment ! !

Forget red .

After Oct .

Opium .

Or less ?

AVE : 4.86 .

32545735 .

It 's early .

The Catch .

A speakeasy ?

Ashley Cole ?

A mall ?

9 : 33 p.m.

A tornado .

The Lurker .

Traitorz .

Etc .

He passed .

All at once .

What ? 2 .

" It 's big .

Kerry .

Java Man .

KATRINA .

And did he ?

Woolworth ?

" Favorites ?

BE A PARENT !

Paz ordered .

Perhaps so .

Not us .

3 min : ...

Paul Oct .

" Obama !

Shhhhhhhhh .

My hair .

Ka-CHING !

Hurricanes .

Take Hatch .

Fleeter .

So I tithed .

" Go China !

498 pages .

Muscle back .

" I love it .

" Not one .

Ray ?

19-27 .

Desperate .

Baloney !

Hail CISA .

How sincere !

" As what ? "

CeBIT .

Sound wacky ?

So vivid .

" In a Sept .

Hang on .

Crash ! " ?

Dale !

And in time .

Hell no ! !

England ! "

Post Co .

TOKYO , Nov .

30- Dec .

Wooh !

C

 70%|███████   | 214419/306068 [00:00<00:00, 227277.17it/s]


8 : 29 p.m.

Now ..

Origami ?

Rubble .

RG What ?

A mess .

Comfy hold .

A sad story .

Kudos ABC !

Marc Rich ?

Metroliner .

14 at 10 p.m.

( 38 % ) .

Troubled ?

" No !

11 : 09 a.m.

Kromer said .

It 's now .

A cousin .

Very wrong .

I love film .

2 ' 27 .

" Fire . "

I am angry .

8 and Sept .

25 , 1912 .

Snack cups .

R1 Y16 .

Spare parts ?

Cody : Yes .

11 trials .

Stewart ?

Dracula .

Chief .

Height 2ft .

Nice suit .

It 's cute .

Clinton .

Vroom vroom !

DAVIES : No .

Thanks Adam !

HONK ! ! !

Hehhhhh ..

Search me .

The victim ?

Big Daddy ?

( 7 : 45 p.m.

The ACLU ! !

3 , he said .

Nielan said .

Not forever .

" How come ?

YIKES !

No air-con .

You ski ?

Lala .

3-5 , 2007 .

For sale .

Umm good .

Burned !

ET on CNN .

There are ...

" At all .

To persuade ?

All of Them .

( 8 : 40 p.m.

" 3 .

None opened .

Good read .

' Zat so ?

I giggle .

Break over .

27 and Oct .

Am gutted .

Runaway cow ?

Awesome ! ! !

Grit blew .

Murky .

B

100%|██████████| 306068/306068 [00:01<00:00, 251950.31it/s]

He was busy .

9 at 8 : 30 .

Very jolly .

Silly me !

Who has won ?

We aren 't .

PDT Monday .

He 's human .

Sunoco Inc .

8-10 , Nov .

1 , 1926 .

He was .

Revenge .

And it did .

He scoffs .

But I could .

" Hey Ya " ?

Indulged .

Auto-squee .

Venice 3 .

RYAN : I do .

Join up .

April 25 .

Ho-hum .

2-8 , 2009 .

" Omigosh .

T-Pain ?

So on Sept .

9 report .

Collarbones ?

DETROIT .

I ...

He should ...

Need e-mail ?

Mortifying .

He improved .

Huge relief .

Gain of 6 .

Are there ?

Any clues ?

Why is this ?

Himself ! 2 .

" Bless you .

I was four .

the venture .

13.53 .

Merit pay .

They called .

Kathy asked .

But hold .

ABC Nov .

For free ?

Come on .

Entry £ 7 .

Or slower .

Savvy ?

9 Jun 2009 .

You lose .

" Hungry !

Bullseye !

Back to You .

4 , 1991 .

Road trip !

" What for ?

Or say so .

Glam rock ...

Ohho !

EDT / 6 p.m.

Um , no .

18 rankings .

Sirens .

Buzz off .

A bit odd .

Jenkins Row .

Be safe .

Ooops !

1 -- Dec .

I loo




In [65]:
encoded_docs, word_counts = preprocess_1b(
    docs, nlp, encoder, decoder, MIN_LENGTH, MIN_COUNTS, MAX_COUNTS
)

100%|██████████| 306068/306068 [00:59<00:00, 5182.17it/s]


number of removed short documents: 222583
total number of tokens: 1643118
number of tokens to be removed: 188080
number of additionally removed short documents: 22530
total number of tokens: 1165768
3000000
159 : say : 15106
36 : year : 7661
65 : new : 6967
389 : include : 3780
121 : government : 3756
100 : company : 3691

minimum word count number: 0
this number can be less than MIN_COUNTS because of document removal


In [53]:
encoded_docs

[(0,
  [2055,
   1935,
   509,
   5075,
   2613,
   8031,
   195,
   273,
   398,
   6409,
   3625,
   4861,
   159,
   4004,
   3849,
   1129,
   195,
   113,
   643,
   456,
   820,
   341,
   3468,
   288,
   35509,
   5037]),
 (1,
  ['',
   6684,
   306,
   3435,
   995,
   12565,
   9053,
   18901,
   1014,
   202,
   '',
   2612,
   1262,
   389,
   1027627,
   1287,
   550,
   758,
   4306,
   842,
   470365]),
 (6,
  [62605,
   6147,
   '',
   263323,
   584,
   348,
   898984,
   83813,
   149564,
   152192,
   6086,
   2842,
   214760,
   111,
   465,
   625,
   1221,
   2260,
   13099,
   581,
   159,
   101930]),
 (7,
  [173,
   1147,
   3027,
   12141,
   874,
   872,
   1342,
   1628,
   8701,
   635,
   141,
   338,
   139693,
   635,
   522]),
 (14,
  [36,
   268714,
   5593,
   216,
   706,
   417,
   1374,
   7298,
   424,
   747,
   3251,
   547,
   3097,
   758,
   423875,
   1696]),
 (18,
  [1343,
   598,
   142,
   1189,
   629,
   795,
   3770,
   12458,
   914,


In [51]:
word_counts = np.array(word_counts)
unigram_distribution = word_counts/sum(word_counts)

In [None]:
np.save('unigram_distribution.npy', unigram_distribution)

In [66]:
data = []
# new ids are created here
for index, (_, doc) in tqdm(enumerate(encoded_docs)):
    windows = get_windows(doc, HALF_WINDOW_SIZE)
    # index represents id of a document, 
    # windows is a list of (word, window around this word),
    # where word is in the document
    data += [[index, w[0]] + w[1] for w in windows]

data = np.array(data, dtype='int64')

60955it [00:03, 15703.56it/s]


In [67]:
np.save('data.npy', data)

finally, the random initial topic matrix

In [68]:
len(encoded_docs)

60955

In [70]:
n_topics = 300

In [76]:
def softmax(x):
    # x has shape [batch_size, n_classes]
    e = np.exp(x)
#     print(e)
    n = np.sum(e, 0, keepdims=True)
    return e/n

In [75]:
random_factors = np.random.randn(4)
print(random_factors)
normalized_factors = softmax(random_factors)
print(normalized_factors)

[ 1.82768432  1.12026938 -0.90940189 -0.54326877]
[ 6.21946768  3.06567992  0.40276505  0.58084649]
[ 0.60566886  0.29854434  0.03922237  0.05656443]


In [78]:
doc_weights_init = np.zeros((len(encoded_docs), n_topics))
for i in tqdm(range(len(encoded_docs))):
    random_factors = np.random.randn(n_topics)
    normalized_factors = softmax(random_factors)
    for j in range(n_topics):
        doc_weights_init[i, j] = normalized_factors[j]
        

100%|██████████| 60955/60955 [00:09<00:00, 6660.91it/s]


In [80]:
doc_weights_init[0:]

array([[ 0.00633234,  0.00461978,  0.00091612, ...,  0.00215298,
         0.02045495,  0.00393548],
       [ 0.00030155,  0.00448764,  0.00238312, ...,  0.009518  ,
         0.00171193,  0.00221205],
       [ 0.00284973,  0.00334791,  0.00311595, ...,  0.00481268,
         0.00032616,  0.00070143],
       ..., 
       [ 0.00655458,  0.00060633,  0.00074119, ...,  0.01404162,
         0.0058208 ,  0.00193271],
       [ 0.00556007,  0.00389174,  0.00042184, ...,  0.00259732,
         0.00431261,  0.00208039],
       [ 0.01044219,  0.0013384 ,  0.01074111, ...,  0.0018288 ,
         0.00433661,  0.00247377]])

In [81]:
np.save('doc_weights_init.npy', doc_weights_init)