In [None]:
# so we can use packages from parent directory
import sys
sys.path.append("..")

In [None]:
# The goal of this notebook is to write the code that creates a vocabulary for the entire monroe 2017 corups.

In [3]:
from collections import Counter
from monroe_data import MonroeData
import nltk

In [2]:
monroe_data = MonroeData("filteredCorpus.csv") # use default arguments to get the dataframe in the same form Monroe used it

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [27]:
class Vocab:
    
    def __init__(self):
        self.UNK = '<unk>'
        self.EOS = '<eos>'
        self.SOS = '<sos>'
        
        
        self.word2idx = {self.SOS: 0}
        self.idx2word = {0: self.SOS}
        self.word_count = Counter()
        self.size = 0
        
        
    def add_sentence(self, sentence):
        for word in sentence:
            word = word.lower()
            if not word in self.word2idx:
                self.word2idx[word] = self.size
                self.idx2word[self.size] = word
                self.size += 1
            self.word_count[word] += 1
        
    def get_word_from_idx(self, idx):
        return self.idx2word[idx]
    
    def get_idx_from_word(self, word):
        return self.word2idx.get(word, self.word2idx[self.UNK])
    
    def replace_unks(self, sentence, word_counts, word_lim=1, speaker=False):
        replace_sentence = []
        
        # the speaker needs to start with an SOS token to seperate colors
        # from words, but the listener doesn't have to
        if speaker:
            replace_sentence.append(self.SOS)
            
        for word in sentence:
            word = word.lower()
            if word_counts[word] <= word_lim:
                replace_sentence.append(self.UNK)
            else:
                replace_sentence.append(word)
                
        replace_sentence.append(self.EOS)
        return replace_sentence
    
    def __len__(self):
        return self.size

In [4]:
monroe_data.data.contents[:10]

0                The darker blue one
1                             purple
2    Medium pink the medium dark one
3                               lime
4                        Mint green.
5                          Mud brown
6                          Mud brown
7                         Camo green
8                        Darkish red
9                               Grey
Name: contents, dtype: object

In [7]:
# iterate over sentences
# tokenize them
# create a vocab
# filter the vocab
word_counter = Counter()
for sent in monroe_data.data.contents:
    for word in nltk.word_tokenize(sent):
        word_counter[word.lower()] += 1

In [11]:
word_counter.most_common()

[('green', 10313),
 ('the', 9791),
 ('blue', 8226),
 ('purple', 7842),
 ('not', 6078),
 ('pink', 4249),
 ('one', 3959),
 (',', 3645),
 ('grey', 3626),
 ('bright', 3569),
 ('gray', 2605),
 ('yellow', 2163),
 ('.', 2074),
 ('of', 2017),
 ('is', 1835),
 ('color', 1803),
 ('red', 1782),
 ('brown', 1578),
 ('orange', 1561),
 ('darker', 1545),
 ('more', 1334),
 ('dark', 1308),
 ('dull', 1302),
 ('brightest', 1259),
 ('it', 1212),
 ('a', 1172),
 ('light', 1130),
 ('that', 1018),
 ('brighter', 1000),
 ('or', 952),
 ('neon', 945),
 ('most', 905),
 ('and', 781),
 ('darkest', 706),
 ('teal', 702),
 ('to', 700),
 ('with', 662),
 ('lighter', 653),
 ('olive', 577),
 ('lime', 572),
 ('greenish', 569),
 ('like', 566),
 ('but', 565),
 ("'s", 558),
 ('two', 520),
 ('...', 503),
 ('tan', 497),
 ('i', 496),
 ('!', 495),
 ('sky', 493),
 ('?', 492),
 ('shade', 451),
 ('this', 440),
 ('brownish', 422),
 ('no', 422),
 ('greyish', 416),
 ('target', 403),
 ('pinkish', 402),
 ('in', 399),
 ('hot', 394),
 (')', 3

In [10]:
len(monroe_data.data.contents)

46994

In [28]:
vocab = Vocab()

In [15]:
nltk.word_tokenize("I am a cow.")

['I', 'am', 'a', 'cow', '.']

In [29]:
all_sents = []
for sent in monroe_data.data.contents:
    sent = nltk.word_tokenize(sent)
    unk_sent = vocab.replace_unks(sent, word_counter)
    all_sents.append(unk_sent)
    vocab.add_sentence(unk_sent)

In [30]:
len(vocab)

1792

In [34]:
vocab.get_idx_from_word("adsf")

28

In [35]:
import pickle

In [36]:
with open("listener_vocab.pkl", "wb") as listener_vocab_f:
    pickle.dump(vocab, listener_vocab_f)