In [1]:
### Utils
import pandas as pd
import time

### Tensorflow related
import tensorflow as tf
from tensorflow.contrib import rnn  #cell that we would use
import tensorlayer as tl

In [2]:
### Read the corpus CSV and pull both text columns out as plain Python lists
df = pd.read_csv('output/corpus/processed_2.csv')
contents, summaries = df['content'].tolist(), df['summary'].tolist()

In [3]:
# Report how many entries each of the two lists holds.
print('count lines of contents: {}'.format(len(contents)))
print('count lines of summaries: {}'.format(len(summaries)))

count lines of contents: 134982
count lines of summaries: 134982


In [4]:
# Pool both lists into one new list: every content first, then every summary.
corpus = list(contents)
corpus.extend(summaries)

In [5]:
import re
import collections
import pickle
import numpy as np

In [6]:
# Display the first entry of the combined corpus as a quick visual check.
corpus[0]

'alone take one either sit brc wife took instructor get take msf pivotal beneficial brc experience wife instructors couple good ol boys earning little extra cash weekends believe many new riders put much stock msf course instructor say though end motorcycling technique curriculum essentially defensive driving techniques taught high school drivers ed magic bean keeps dying street magic common sense pill either class geared toward cruiser riders well myths front brake vs rear brake actually taught class husband difficult refute misinformation given motorcycling god placed fate edit would like add 6 miles 2nd ride went wide right hander froze controls crashed grass behind watch hear chatterboxes whole thing uninjured baby ninja rideable week passed brc instructors said top class'

In [7]:
# Run every corpus line through TensorLayer's sentence processor, passing
# <GO> / <EOS> as the start and end words.
processed_corpus = [
    tl.nlp.process_sentence(raw_line, start_word="<GO>", end_word="<EOS>")
    for raw_line in corpus
]

print('processed corpus: line 0 ---',processed_corpus[0])

processed corpus: line 0 --- ['<GO>', 'alone', 'take', 'one', 'either', 'sit', 'brc', 'wife', 'took', 'instructor', 'get', 'take', 'msf', 'pivotal', 'beneficial', 'brc', 'experience', 'wife', 'instructors', 'couple', 'good', 'ol', 'boys', 'earning', 'little', 'extra', 'cash', 'weekends', 'believe', 'many', 'new', 'riders', 'put', 'much', 'stock', 'msf', 'course', 'instructor', 'say', 'though', 'end', 'motorcycling', 'technique', 'curriculum', 'essentially', 'defensive', 'driving', 'techniques', 'taught', 'high', 'school', 'drivers', 'ed', 'magic', 'bean', 'keeps', 'dying', 'street', 'magic', 'common', 'sense', 'pill', 'either', 'class', 'geared', 'toward', 'cruiser', 'riders', 'well', 'myths', 'front', 'brake', 'vs', 'rear', 'brake', 'actually', 'taught', 'class', 'husband', 'difficult', 'refute', 'misinformation', 'given', 'motorcycling', 'god', 'placed', 'fate', 'edit', 'would', 'like', 'add', '6', 'miles', '2nd', 'ride', 'went', 'wide', 'right', 'hander', 'froze', 'controls', 'crash

In [None]:
#################### load numberbatch word-embedding
# embeddings_index maps each word in the text file to its float32 vector.
embeddings_index = {}
with open('downloads/embedding/numberbatch-en.txt', encoding='utf-8') as f:
    for line_no, line in enumerate(f):
        # Strip the newline so it is not carried along on the last number.
        values = line.rstrip('\n').split(' ')
        # A blank line carries no word; indexing it would store '' as a key.
        if values == ['']:
            continue
        # The Numberbatch text release opens with a word2vec-style header
        # ("<rows> <dims>"). It is not a word, so keep it out of the index.
        # The check is shape-based, so a header-less file loads unchanged.
        if line_no == 0 and len(values) == 2 and all(v.isdigit() for v in values):
            continue
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings:', len(embeddings_index))

In [None]:
################## Count words (Just for test)
def count_words(count_dict, text):
    '''Tally word occurrences across `text` into `count_dict`, in place.

    Each item of `text` is split on whitespace, and every resulting token
    adds one to its entry in `count_dict` (starting at 1 when first seen).
    Nothing is returned; the caller reads the totals from `count_dict`.
    '''
    for sentence in text:
        for token in sentence.split():
            count_dict[token] = count_dict.get(token, 0) + 1

In [None]:
# Accumulate counts from both lists into one shared dictionary
# (summaries first, then contents).
word_counts = {}
for texts in (summaries, contents):
    count_words(word_counts, texts)

print("Size of Vocabulary:", len(word_counts))

In [None]:
# Find the words that are missing from CN (the embeddings held in
# embeddings_index) and are used more than our threshold.
threshold = 20

missing_word_list = [
    word for word, count in word_counts.items()
    if count > threshold and word not in embeddings_index
]
missing_words = len(missing_word_list)

# Scale to a percentage *before* rounding: round(x, 4) * 100 brings binary
# floating-point noise back into the printed value
# (e.g. 0.07 * 100 == 7.000000000000001).
# An empty word_counts would divide by zero, so report 0.0 in that case.
if word_counts:
    missing_ratio = round(100 * missing_words / len(word_counts), 2)
else:
    missing_ratio = 0.0

print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

In [14]:
# Build the vocabulary from the processed corpus and write its word counts to
# disk. NOTE(review): min_word_count=4 presumably drops words seen fewer than
# four times -- confirm against the TensorLayer documentation.
tl.nlp.create_vocab(
    processed_corpus,
    word_counts_output_file='output/corpus/vocab.txt',
    min_word_count=4,
)

[TL] Creating vocabulary.
[TL]     Total words: 259872
[TL]     Words in vocabulary: 87760
[TL]     Wrote vocabulary file: output/corpus/vocab.txt


<tensorlayer.nlp.SimpleVocabulary at 0x19ee65d5f28>

In [9]:
# Load the vocabulary file back, naming the start, end and unknown-word tokens.
vocab = tl.nlp.Vocabulary(
    'output/corpus/vocab.txt',
    start_word="<GO>",
    end_word="<EOS>",
    unk_word="<UNK>",
)

[TL] Initializing vocabulary from file: output/corpus/vocab.txt
[TL] Vocabulary from output/corpus/vocab.txt : <GO> <EOS> <UNK>
[TL]     vocabulary with 87761 words (includes start_word, end_word, unk_word)
[TL]       start_id: 1
[TL]       end_id  : 2
[TL]       unk_id  : 87760
[TL]       pad_id  : 0


In [13]:
# Show the words behind ids 0 through 8, space-separated on one line.
print(*[vocab.id_to_word(word_id) for word_id in range(9)])

<PAD> <GO> <EOS> . , the would to like


In [16]:
# Look up the id the vocabulary assigns to the unknown-word token.
vocab.word_to_id('<UNK>')

87760

In [20]:
# Reverse lookup: 87760 is the unk_id reported when the vocabulary was loaded.
vocab.id_to_word(87760)

'<UNK>'

In [None]:
def convert_to_ints(text, word_count, unk_count, eos=False, lookup=None, eos_token="<S/>"):
    '''Convert each sentence in `text` to a list of integer word ids.

    Args:
        text: iterable of strings; each one is split on whitespace.
        word_count: running total of words; one is added per word seen.
        unk_count: running total of unknown words; one is added for every
            word that is not a key of the lookup table.
        eos: when true, the id of `eos_token` is appended to every sentence.
        lookup: word -> id mapping to use. When omitted, the notebook-level
            `vocab_to_int` is used, exactly as before this parameter existed.
        eos_token: key whose id is appended when `eos` is true. The default
            keeps the original "<S/>".
            NOTE(review): the vocabulary in this notebook is created with
            end_word="<EOS>" -- confirm which key the mapping really holds.

    Returns:
        A tuple `(ints, word_count, unk_count)`: one id list per sentence,
        followed by the two updated totals.

    Raises:
        KeyError: if an unknown word is met and "<UNK>" is not in the
            mapping, or if `eos` is true and `eos_token` is not in it.
    '''
    # Resolve the table once; the global is only touched when no table is given.
    mapping = vocab_to_int if lookup is None else lookup

    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in mapping:
                sentence_ints.append(mapping[word])
            else:
                # Out-of-vocabulary word: substitute the <UNK> id and count it.
                sentence_ints.append(mapping["<UNK>"])
                unk_count += 1
        if eos:
            sentence_ints.append(mapping[eos_token])
        ints.append(sentence_ints)
    return ints, word_count, unk_count