In [16]:
# so we can use packages from parent directory
import sys
sys.path.append("..")

In [17]:
import nltk
from collections import Counter
import torch
import numpy as np
import skimage.color

In [18]:
from monroe_data import MonroeData, MonroeDataEntry, Color

In [88]:
import importlib

In [250]:
importlib.reload(monroe_data)

<module 'monroe_data' from '/Users/benjaminnewman/Documents/Stanford/Freshman_2017-2018/WINTER/LINGUIST130A/linguist-130a-final-proj/monroe_data.py'>

In [20]:
class Tokenizer:
    def tokenize(self, sentence):
        pass

class WhitespaceTokenizer(Tokenizer):
    def tokenize(self, sentence):
        return nltk.word_tokenize(sentence)
    
class EndingTokenizer(Tokenizer):
    """
    Segments endings as different words from words that end with them:
    Ex: 'greener' -> 'green', 'er'
    """
    def __init__(self):
        # Endings defined here:
        # https://github.com/futurulus/colors-in-context/blob/2e7b830668cd039830154e7e8f211c6d4415d30f/tokenizers.py#L35
        self.endings = ['er', 'est', 'ish']
        
    def tokenize(self, sentence):
        tokens = []
        for word in nltk.word_tokenize(sentence):
            inserted = False
            for ending in self.endings:
                if word.endswith(ending):
                    tokens.extend([word[:-len(ending)], '+{}'.format(ending)])
                    inserted = True
                    break
            if not inserted:
                tokens.append(word)
        return tokens


In [53]:
class CaptionIndexer:
    def __init__(self):
        self.UNK = '<unk>'
        self.EOS = '<eos>'
        self.SOS = '<sos>'
        
        
        self.word2idx = {}
        self.idx2word = {}
        self.word_count = Counter()
        self.size = 0
        
        
    def add_sentence(self, sentence):
        for word in sentence:
            word = word.lower()
            if not word in self.word2idx:
                self.word2idx[word] = self.size
                self.idx2word[self.size] = word
                self.size += 1
            self.word_count[word] += 1
        
    def get_word_from_idx(self, idx):
        return self.idx2word[idx]
    
    def get_idx_from_word(self, word):
        return self.word2idx.get(word, self.word2idx[self.UNK])
    
    def to_indices(self, sentence, construct=False):
        if construct:
            self.add_sentence(sentence)
            # we know everything is in the map because we just added it
            return [self.word2idx[word] for word in sentence]
        
        return [self.get_idx_from_word(word) for word in sentence]


In [116]:
class CaptionFeaturizer:
    
    def __init__(self, tokenizer=WhitespaceTokenizer, unk_threshold=1):
        self.tokenizer = tokenizer()
        self.caption_indexer = CaptionIndexer()
        self.word_count = None
        
        # hyperparams
        self.unk_threshold = unk_threshold
    
    def to_tensor(self, caption, construct=False):
        _, indexes = self.to_string_features(caption, construct)
        return torch.tensor(indexes, dtype=torch.long).view(-1, 1)
    
    def to_string_features(self, caption, construct=False):
        """
        Params:
        caption:   string hodling caption that will converted to tokens and
                   indices. 
        construct: if we are constructing the featurizer for the first time,
                   this should be true. It performs the unk substitutions 
                   manually based on the contents of self.word_count and 
                   also adds the sentences to the indexer. Should only be
                   true when training for the first time.
                   
        Returns:
        Tuple(tokens, indices). 
        
        tokens is a tokenized version the passed caption,
                unks are replaced, words are lower cased, buffered on both sides by sos/
                eos tags
        indices is a list indices given by the indexer for each token. These can be converted
                to tensor to be fed into the model
        
        """
        caption_tokens = self.tokenizer.tokenize(caption)
        caption_tokens = self.to_model_format(caption_tokens, construct)
        caption_indices = self.caption_indexer.to_indices(caption_tokens, construct)
        caption_tokens = [self.caption_indexer.get_word_from_idx(index) for index in caption_indices]
        return caption_tokens, caption_indices
        
    def to_model_format(self, tokens, construct):
        """
        Put the tokens into the format expected by the models.
        This mainly entails prepending/appending <sos>, <eos>,
        lowercasing all of the words and replacing all uncommon words
        with <unk> (only in the case when we are constructing the
        featurizer for the first time)
        
        Params:
        tokens: 
        construct: if we are constructing the featurizer for the first time,
                   this should be true. It performs the unk substitutions 
                   manually based on the contents of self.word_count and 
                   also adds the sentences to the indexer. Should only be
                   true when training for the first time.
        """
        if construct:
            if self.word_count is None:
                print("FEATURIZER HAS NOT BEEN CONSTRUCTED YET. Call `construct_featurizer`")
            else:
                for i in range(len(tokens)):
                    if self.word_count[tokens[i]] <= self.unk_threshold:
                        tokens[i] = self.caption_indexer.UNK
                        
        tokens = [token.lower() for token in tokens]
        tokens = [self.caption_indexer.SOS] + tokens + [self.caption_indexer.EOS]
        return tokens
    
    def construct_featurizer(self, data_entries):
        """
        data_entries is of type MonroeData. 
        """
        self.word_count = Counter()
        for entry in data_entries:
            caption_tokens = self.tokenizer.tokenize(entry.caption)
            for token in caption_tokens:
                self.word_count[token] += 1
                
        
    
    

In [251]:
from monroe_data import MonroeData, MonroeDataEntry, Color # last two for reading pkl file
#import Vocab

In [None]:
monroe_data_train = monroe_data.MonroeData("train_corpus_monroe.csv", "train_entries_monroe.pkl")

In [252]:
monroe_data_train = monroe_data.MonroeData("train_corpus_monroe.csv", single_speaker=True, ss_method="pool")

In [253]:
%%time
for _ in monroe_data_train.read_data():
    pass


CPU times: user 1min 8s, sys: 373 ms, total: 1min 8s
Wall time: 1min 8s


In [254]:
monroe_data_train[0].colors

[hsl: [226, 81, 50], rgb [24, 73, 232], hsv [226, 89.50276243093923, 90.5],
 hsl: [283, 87, 50], rgb [176, 17, 239], hsv [283, 93.04812834224599, 93.5],
 hsl: [248, 92, 50], rgb [42, 10, 246], hsv [248, 95.83333333333333, 96.0]]

In [255]:
monroe_data_train.save_entries("train_entries_monroe.pkl")

In [256]:
monroe_data_dev = monroe_data.MonroeData("dev_corpus_monroe.csv", single_speaker=True, ss_method="pool")
for _ in monroe_data_dev.read_data():
    pass
monroe_data_dev.save_entries("dev_entries_monroe.pkl")

In [87]:
monroe_data_train[:4]

[]

In [123]:
featurizer = CaptionFeaturizer(EndingTokenizer)

In [124]:
featurizer.construct_featurizer(monroe_data_train)

In [125]:
indices = []
captions = []

for entry in monroe_data_train:
    i, c = featurizer.to_string_features(entry.caption, construct=True)
    indices.append(i)
    captions.append(c)

In [126]:
indices[:4]

[['<sos>', 'the', 'dark', '+er', 'blue', 'one', '<eos>'],
 ['<sos>', 'purple', '<eos>'],
 ['<sos>', 'medium', 'pink', 'the', 'medium', 'dark', 'one', '<eos>'],
 ['<sos>', 'lime', '<eos>']]

In [69]:
captions[4]

[0, 6, 2]

In [127]:
featurizer.to_string_features("the bluest blue of the posedien")

(['<sos>', 'the', 'blu', '+est', 'blue', 'of', 'the', '<unk>', '<eos>'],
 [0, 1, 100, 20, 4, 22, 1, 11, 6])

In [128]:
featurizer.to_tensor("the bluest blue of the posedien")

tensor([[  0],
        [  1],
        [100],
        [ 20],
        [  4],
        [ 22],
        [  1],
        [ 11],
        [  6]])

In [101]:
def color_phi_id(color_list, space):
    """
    Function for turning a list of colors in the given space
    (with normalization marked by "_norm") into a feature function.
    
    This is just the identity feature function, so it's kind of boring,
    but we can imagine doing more complext things too (like fourier
    transform). We pass space as well in case you want to do different
    operations based on the space or only have a feature function work
    for HSL for example
    
    ex:
    color_list = [256, 0, 0] space = 'rgb'
    color_list = [1, 0, 0] space = 'rgb_norm'
    """
    return color_list

In [9]:
def color_phi_fourier(color_list, space, resolution=3):
    if space != "rgb_norm":
        print("Space must be rgb_norm to use fourier transform")
        return None

    resolution = [resolution for _ in color_list]
    colors = np.array([color_list])
    ranges = np.array([256, 256, 256])
    
    xyz = colors / 2

    ax, ay, az = [np.arange(0, g) for g, r in zip(resolution, ranges)]
    gx, gy, gz = np.meshgrid(ax, ay, az)

    arg = (np.multiply.outer(xyz[:, 0], gx) +
           np.multiply.outer(xyz[:, 1], gy) +
           np.multiply.outer(xyz[:, 2], gz))
    #assert arg.shape == (xyz.shape[0],) + tuple(self.resolution), arg.shape
    repr_complex = np.exp(-2j * np.pi * (arg % 1.0)).swapaxes(1, 2).reshape((xyz.shape[0], -1))
    result = np.hstack([repr_complex.real, repr_complex.imag]).astype(np.float32)
    return result[0]

In [10]:
color_phi_fourier([1., 0., 0.], 'rgb_norm', resolution=3)

[1.]


array([ 1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00,
        1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00,
        1.0000000e+00, -1.0000000e+00, -1.0000000e+00, -1.0000000e+00,
       -1.0000000e+00, -1.0000000e+00, -1.0000000e+00, -1.0000000e+00,
       -1.0000000e+00, -1.0000000e+00,  1.0000000e+00,  1.0000000e+00,
        1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  1.0000000e+00,
        1.0000000e+00,  1.0000000e+00,  1.0000000e+00,  0.0000000e+00,
        0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
        0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
       -1.2246469e-16, -1.2246469e-16, -1.2246469e-16, -1.2246469e-16,
       -1.2246469e-16, -1.2246469e-16, -1.2246469e-16, -1.2246469e-16,
       -1.2246469e-16,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
        0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
        0.0000000e+00,  0.0000000e+00], dtype=float32)

In [179]:
RANGES_RGB = (256.0, 256.0, 256.0)
RANGES_HSV = (361.0, 101.0, 101.0)
C_EPSILON = 1e-4
Shsv = True
Sresolution = [3, 3, 3]
def vectorize_all(colors, hsv=None, Shsv = True):
    '''
    >>> normalize = lambda v: np.where(v.round(2) == 0.0, 0.0, v.round(2))
    >>> normalize(FourierVectorizer([2]).vectorize_all([(255, 0, 0), (0, 255, 255)]))
    array([[ 1.,  1.,  1.,  1., -1., -1., -1., -1.,  0.,  0.,  0.,  0.,  0.,
             0.,  0.,  0.],
           [ 1., -1., -1.,  1.,  1., -1., -1.,  1.,  0.,  0.,  0.,  0.,  0.,
             0.,  0.,  0.]], dtype=float32)
    '''
    if hsv is None:
        hsv = Shsv

    colors = np.array([colors])
    assert len(colors.shape) == 3, colors.shape
    assert colors.shape[2] == 3, colors.shape

    ranges = np.array(RANGES_HSV if Shsv else RANGES_RGB)
    print("ranges:", ranges)
    if hsv and not Shsv:
        print("Converting hsv to rgb")
        c_hsv = colors
        color_0_1 = skimage.color.hsv2rgb(c_hsv / (np.array(RANGES_HSV) - 1.0))
    elif not hsv and Shsv:
        print("Converting rgb to hsv")
        c_rgb = colors
        color_0_1 = skimage.color.rgb2hsv(c_rgb / (np.array(RANGES_RGB) - 1.0))
    else:
        print("Just normalize")
        color_0_1 = colors / (ranges - 1.0)

    print("Color (0-1):", color_0_1)
    # Using a Fourier representation causes colors at the boundary of the
    # space to behave as if the space is toroidal: red = 255 would be
    # about the same as red = 0. We don't want this...
    xyz = color_0_1[0] / 2.0
    if Shsv:
        # ...*except* in the case of HSV: H is in fact a polar coordinate.
        xyz[:, 0] *= 2.0

    # ax, ay, az = [np.hstack([np.arange(0, g / 2), np.arange(r - g / 2, r)])
    #               for g, r in zip(self.resolution, ranges)]
    ax, ay, az = [np.arange(0, g) for g, r in zip(Sresolution, ranges)]
    gx, gy, gz = np.meshgrid(ax, ay, az)

    arg = (np.multiply.outer(xyz[:, 0], gx) +
           np.multiply.outer(xyz[:, 1], gy) +
           np.multiply.outer(xyz[:, 2], gz))
    assert arg.shape == (xyz.shape[0],) + tuple(Sresolution), arg.shape
    repr_complex = np.exp(-2j * np.pi * (arg % 1.0)).swapaxes(1, 2).reshape((xyz.shape[0], -1))
    result = np.hstack([repr_complex.real, repr_complex.imag]).astype(np.float32)
    return result

In [180]:
skimage.color.rgb2hsv([[(1., 0., 0.)]])

array([[[0., 1., 1.]]])

In [196]:
normalize(color_phi_fourier([1., 0.5, 0.25], 'rgb_norm', resolution=3))

Colors: [[1.   0.5  0.25]]


array([[ 1.  ,  0.71,  0.  ,  0.  , -0.71, -1.  , -1.  , -0.71,  0.  ,
        -1.  , -0.71,  0.  ,  0.  ,  0.71,  1.  ,  1.  ,  0.71,  0.  ,
         1.  ,  0.71,  0.  ,  0.  , -0.71, -1.  , -1.  , -0.71,  0.  ,
         0.  , -0.71, -1.  , -1.  , -0.71,  0.  ,  0.  ,  0.71,  1.  ,
         0.  ,  0.71,  1.  ,  1.  ,  0.71,  0.  ,  0.  , -0.71, -1.  ,
         0.  , -0.71, -1.  , -1.  , -0.71,  0.  ,  0.  ,  0.71,  1.  ]],
      dtype=float32)

In [198]:
normalize(vectorize_all([(20, 75, 100)], hsv=True, Shsv=False))

ranges: [256. 256. 256.]
Converting hsv to rgb
Color (0-1): [[[1.   0.5  0.25]]]


array([[ 1.  ,  0.71,  0.  ,  0.  , -0.71, -1.  , -1.  , -0.71,  0.  ,
        -1.  , -0.71,  0.  ,  0.  ,  0.71,  1.  ,  1.  ,  0.71,  0.  ,
         1.  ,  0.71,  0.  ,  0.  , -0.71, -1.  , -1.  , -0.71,  0.  ,
         0.  , -0.71, -1.  , -1.  , -0.71,  0.  ,  0.  ,  0.71,  1.  ,
         0.  ,  0.71,  1.  ,  1.  ,  0.71,  0.  ,  0.  , -0.71, -1.  ,
         0.  , -0.71, -1.  , -1.  , -0.71,  0.  ,  0.  ,  0.71,  1.  ]],
      dtype=float32)

In [173]:
normalize(vectorize_all([(0, 100., 100.)], hsv=True, Shsv=True))

ranges: [361. 101. 101.]
Color (0-1): [[[0. 1. 1.]]]


array([[ 1., -1.,  1., -1.,  1., -1.,  1., -1.,  1.,  1., -1.,  1., -1.,
         1., -1.,  1., -1.,  1.,  1., -1.,  1., -1.,  1., -1.,  1., -1.,
         1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.]], dtype=float32)

In [152]:
normalize = lambda v: np.where(v.round(2) == 0.0, 0.0, v.round(2))

In [158]:
normalize(vectorize_all([(255, 0, 0)], hsv=False))

array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.]], dtype=float32)

In [157]:
a.round(2)

array([1., 2., 4.])

In [154]:
a = np.array([1.0001, 2, 4])

In [143]:
a

array([1, 2, 4])

In [144]:
a[1:]/4

array([0.5, 1. ])

In [135]:
class ColorFeaturizer:
    def __init__(self, featurizer, space, **kwargs):
        self.featurizer = featurizer
        self.space = space
        self.featurizer_kwargs = kwargs
    
    
    def to_color_lists(self, colors, normalized):
        # non-standard, but use the space as the variable name
        # to access the color attribute directly
        class_var_name = self.space
        if normalized:
            class_var_name = "{}_norm".format(class_var_name)
        return [color.__dict__[class_var_name] for color in colors], class_var_name
    
    def to_tensor(self, colors, normalized=True):
        """
        Convert colors to tensors where the vectors are the given by applying
        the feature function self.featurizer to the colors 

        returns all colors as |colors| x |phi| matrix
        """
        color_lists, space = self.to_color_lists(colors, normalized) 
        color_lists = [self.featurizer(color_list, space) for color_list in color_lists]
        target = color_lists[0] # target is always first color
        color_tensor = torch.tensor(color_lists, dtype=torch.float) # to get column vectors
        return color_tensor
    
    def shuffle_colors(self, color_tensor):
        """
        Randomly permute colors. Keep track of where the the target ends up
        for training and error analysis
        """
        permutation = torch.randperm(color_tensor.size(0))
        target = torch.argmin(permutation).view(-1) # target always started at 0
        return color_tensor[permutation], target


In [136]:
color_featurizer = ColorFeaturizer(color_phi_id, "rgb")

In [137]:
test_color_tensor = color_featurizer.to_tensor(monroe_data_train.entries[0].colors, normalized = False)

In [111]:
monroe_data_train.entries[0].colors

[hsl: [226, 81, 50], rgb [24, 73, 232],
 hsl: [283, 87, 50], rgb [176, 17, 239],
 hsl: [248, 92, 50], rgb [42, 10, 246]]

In [138]:
test_color_tensor

tensor([[ 24.,  73., 232.],
        [176.,  17., 239.],
        [ 42.,  10., 246.]])

In [139]:
color_featurizer.shuffle_colors(test_color_tensor)

(tensor([[176.,  17., 239.],
         [ 42.,  10., 246.],
         [ 24.,  73., 232.]]), tensor([2]))

In [206]:
color_featurizer_fourier = ColorFeaturizer(color_phi_fourier, "rgb")

In [207]:
test_color_tensor = color_featurizer_fourier.to_tensor(monroe_data_train.entries[0].colors, normalized = True)

In [210]:
test_color_tensor

tensor([[ 1.0000e+00, -9.5694e-01,  8.3147e-01,  6.2486e-01, -8.2459e-01,
          9.5331e-01, -2.1910e-01, -7.3565e-02,  3.5990e-01,  9.5694e-01,
         -1.0000e+00,  9.5694e-01,  3.7132e-01, -6.2486e-01,  8.2459e-01,
         -4.9290e-01,  2.1910e-01,  7.3565e-02,  8.3147e-01, -9.5694e-01,
          1.0000e+00,  8.5797e-02, -3.7132e-01,  6.2486e-01, -7.2425e-01,
          4.9290e-01, -2.1910e-01,  0.0000e+00, -2.9028e-01,  5.5557e-01,
         -7.8074e-01,  5.6573e-01, -3.0201e-01, -9.7570e-01,  9.9729e-01,
         -9.3299e-01, -2.9028e-01, -1.2246e-16,  2.9028e-01, -9.2851e-01,
          7.8074e-01, -5.6573e-01, -8.7009e-01,  9.7570e-01, -9.9729e-01,
         -5.5557e-01,  2.9028e-01,  0.0000e+00, -9.9631e-01,  9.2851e-01,
         -7.8074e-01, -6.8954e-01,  8.7009e-01, -9.7570e-01],
        [ 1.0000e+00, -9.7832e-01,  9.1421e-01,  9.7832e-01, -1.0000e+00,
          9.7832e-01,  9.1421e-01, -9.7832e-01,  1.0000e+00, -5.5557e-01,
          3.7132e-01, -1.7096e-01, -7.1573e-01,  5

In [215]:
normalize(test_color_tensor[0].numpy())

array([ 1.  , -0.96,  0.83,  0.62, -0.82,  0.95, -0.22, -0.07,  0.36,
        0.96, -1.  ,  0.96,  0.37, -0.62,  0.82, -0.49,  0.22,  0.07,
        0.83, -0.96,  1.  ,  0.09, -0.37,  0.62, -0.72,  0.49, -0.22,
        0.  , -0.29,  0.56, -0.78,  0.57, -0.3 , -0.98,  1.  , -0.93,
       -0.29,  0.  ,  0.29, -0.93,  0.78, -0.57, -0.87,  0.98, -1.  ,
       -0.56,  0.29,  0.  , -1.  ,  0.93, -0.78, -0.69,  0.87, -0.98],
      dtype=float32)

In [219]:
monroe_data_train.entries[0].colors[0].hsl

[226, 81, 50]

In [214]:
test_color_tensor[0].numpy()

array([ 1.0000000e+00, -9.5694035e-01,  8.3146960e-01,  6.2485951e-01,
       -8.2458931e-01,  9.5330602e-01, -2.1910124e-01, -7.3564567e-02,
        3.5989505e-01,  9.5694035e-01, -1.0000000e+00,  9.5694035e-01,
        3.7131721e-01, -6.2485951e-01,  8.2458931e-01, -4.9289820e-01,
        2.1910124e-01,  7.3564567e-02,  8.3146960e-01, -9.5694035e-01,
        1.0000000e+00,  8.5797310e-02, -3.7131721e-01,  6.2485951e-01,
       -7.2424710e-01,  4.9289820e-01, -2.1910124e-01,  0.0000000e+00,
       -2.9028466e-01,  5.5557024e-01, -7.8073722e-01,  5.6573182e-01,
       -3.0200595e-01, -9.7570211e-01,  9.9729043e-01, -9.3299282e-01,
       -2.9028466e-01, -1.2246469e-16,  2.9028466e-01, -9.2850608e-01,
        7.8073722e-01, -5.6573182e-01, -8.7008697e-01,  9.7570211e-01,
       -9.9729043e-01, -5.5557024e-01,  2.9028466e-01,  0.0000000e+00,
       -9.9631262e-01,  9.2850608e-01, -7.8073722e-01, -6.8954057e-01,
        8.7008697e-01, -9.7570211e-01], dtype=float32)

In [47]:
# torchtext trial
from torchtext.vocab import Vocab
from torchtext import data

In [19]:
monroe_data_train = MonroeData("../data/csv/train_corpus_monroe.csv", "../data/entries/train_entries_monroe.pkl")

In [21]:
tokenizer = EndingTokenizer()
word_count = Counter()
for entry in monroe_data_train:
    caption_tokens = tokenizer.tokenize(entry.caption)
    for token in caption_tokens:
        word_count[token] += 1

In [23]:
word_count.most_common()

[('green', 3510),
 ('the', 2925),
 ('blue', 2665),
 ('purple', 2392),
 ('~', 1994),
 ('bright', 1824),
 ('+er', 1712),
 ('not', 1483),
 ('+ish', 1452),
 ('pink', 1397),
 ('grey', 1393),
 ('dark', 1247),
 ('one', 1239),
 ('+est', 1236),
 (',', 1090),
 ('gray', 774),
 ('.', 743),
 ('yellow', 740),
 ('color', 706),
 ('light', 693),
 ('of', 655),
 ('is', 626),
 ('brown', 620),
 ('red', 582),
 ('orange', 515),
 ('dull', 496),
 ('The', 461),
 ('more', 424),
 ('a', 412),
 ('that', 333),
 ('or', 293),
 ('most', 284),
 ('it', 266),
 ('neon', 257),
 ('and', 257),
 ('with', 251),
 ('to', 250),
 ('teal', 245),
 ('like', 239),
 ('?', 222),
 ('sky', 204),
 ('lime', 185),
 ('!', 185),
 ('olive', 181),
 ('but', 179),
 ('...', 178),
 ('tan', 177),
 ('shade', 174),
 ('target', 174),
 ('blu', 171),
 ("'s", 168),
 ('in', 166),
 ('two', 151),
 ('this', 134),
 ('box', 129),
 (')', 127),
 ('you', 124),
 ('hot', 124),
 ('grass', 121),
 ('no', 120),
 ('oth', 119),
 ('on', 103),
 ('aqua', 103),
 ('its', 102),
 

In [32]:
tt_vocab = Vocab(word_count, specials=['<s>' '</s>', '<unk>'])

In [34]:
tt_vocab.

0

In [36]:
data_field = data.Field(init_token='<s>', eos_token='</s>', tokenize=tokenizer.tokenize, lower=True)

In [37]:
train_captions = [de.caption for de in monroe_data_train]

In [49]:
data.Example.fromlist(train_captions, fields=data_field)

TypeError: zip argument #1 must support iteration

In [56]:
train_ttdataset = data.TabularDataset(path="../data/csv/train_corpus_monroe.csv", format="csv", fields=[('caption', data_field)])

In [58]:
data_field.build_vocab(train_ttdataset, min_freq=2)

In [60]:
data_field.numericalize(["<s>", "the", "<unk>", "blue", "one", "</s>"])

ValueError: expected sequence of length 3 at dim 1 (got 5)

In [41]:
data_field.build_vocab(train_captions, min_freq=2)

In [44]:
data

TypeError: object of type 'Field' has no len()

In [42]:
data_field.numericalize("the darker blue one")

tensor([[ 7, 13,  5,  4, 20, 14,  6, 21,  5,  6,  4, 18,  8, 15,  5,  4,  9, 10,
          5]])

In [None]:
def construct_featurizer(self, data_entries, construct_idx=True):
    """
    data_entries is of type MonroeData.
    """
    self.word_count = Counter()
    for entry in data_entries:
        caption_tokens = self.tokenizer.tokenize(entry.caption)
        for token in caption_tokens:
            self.word_count[token] += 1

    if construct_idx:
        # just construct the index so we don't have to worry about calling
        # anything with the construct=True argument to to_string_features
        for entry in data_entries:
            _ = self.to_string_features(entry.caption, construct=True)
        self.initialized = True

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

In [64]:
vectorizer = CountVectorizer(tokenizer=tokenizer.tokenize, lowercase=True, min_df=2)

In [66]:
X = vectorizer.fit_transform(train_captions)

In [67]:
X.shape

(15665, 994)