In [18]:
# The aim of this code is to convert the CSV file of
# labels (= missing words), and sentences with missing words
# into a tensor of numbers that can be passed through
# the matching networks code just like the numpy array
# used for the images in the original code based on the
# Onmiglot dataset

# The numbers in the tensor will be the numbers each word
# and refer to in the vocabulary we are building
# We will not embed at this stage because this
# is done inside the matching network. We are, in effect,
# not completing the TorchText proprocessing

# This code is a mainly a mixture of two tutorials:
# http://anie.me/On-Torchtext/
# https://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/

# Comments are a mixture of those from the tutorials (most of them)
# and my own

# Note to self, use Conda environment PyTorch1

In [19]:
import pandas as pd
import numpy as np
import torch
import torchtext
from torchtext import data
from torchtext.data import Iterator
import spacy

In [20]:
# Check what the data looks like
# train2.csv is the same as train.csv, but without the column titles

pd.read_csv("data/train2.csv").head(20)

Unnamed: 0,borgnine,hank azaria who provides the voice of apu commented that <blank_token> had no idea what the hell he was doing
0,borgnine,the unseen person or creature that attacks <bl...
1,borgnine,meanwhile the other junior campers led by er...
2,borgnine,<blank_token> was a guitar player in real life...
3,borgnine,seeing ned flanders get it wrong is great but ...
4,borgnine,rick porter of < unk > N it wrote in that he w...
5,borgnine,meanwhile the other junior campers led by er...
6,borgnine,ernest <blank_token> guest starred in the epis...
7,borgnine,<blank_token> apologized because he felt that ...
8,borgnine,in her book my life as a N year old boy cart...
9,tackles,tadman had seven solo <blank_token> three assi...


In [21]:
# Use spacy to define a function to 
# tokenize, or split up, into individual words,
# the labels and sentences. Note the labels are already
# individual words

spacy_en = spacy.load('en')

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

# We define a Field. This is a class that contains
# information on how you want the data preprocessed. It acts
# like an instruction manual that data.TabularDataset will use.
# We define two fields, one for the sentences and one for the
# labels

TEXT = data.Field(sequential=True, tokenize=tokenizer)
LABEL = data.Field(sequential=False, is_target=True)

type(TEXT)

torchtext.data.field.Field

In [22]:
# The fields know what to do when given raw data.
# Now, we need to tell the fields what data they
# should work on. This is where we use datasets.

# The splits method creates a dataset for the train
# and test data by applying the same processing.

# I assume this next code is OK,
# even though I don't have the column titles `label`
# or 'sentence'. It still seems to work. Before taking them
# out the data, torch text would include 'label' as a label
# and 'sentence' as a sentence.

train, test = data.TabularDataset.splits(
        path='data/', train='train2.csv', test='test2.csv', format='csv',
        fields=[('label', LABEL), ('sentence', TEXT)]) 

In [23]:
# Sense check

print(type(train))

<class 'torchtext.data.dataset.TabularDataset'>


In [24]:
# Torchtext handles mapping words to integers, but
# it has to be told the full range of words it should
# handle.

# We are currently building the vocab from the train
# and test data. This might be incorrect.

TEXT.build_vocab(train, test)
LABEL.build_vocab(train, test)

# This makes torchtext go through all the elements in the
# training and test sets, check the contents corresponding to the TEXT
# field, and register the words in its vocabulary. Torchtext
# has its own class called Vocab for handling the vocabulary.
# The Vocab class holds a mapping from word to id in its stoi
# attribute and a reverse mapping in its itos attribute.

In [25]:
vocab = LABEL.vocab
print(type(vocab.stoi))
print(len(vocab.stoi))
print(vocab.stoi)

<class 'collections.defaultdict'>
9287


defaultdict(<function _default_unk_index at 0x00000270AADFAEA0>, {'<unk>': 0, "'re": 1, "'s": 2, '4th': 3, '9th': 4, 'abandoned': 5, 'accepted': 6, 'act': 7, 'active': 8, 'actually': 9, 'ad': 10, 'additionally': 11, 'administration': 12, 'adoption': 13, 'advanced': 14, 'advantage': 15, 'advice': 16, 'affairs': 17, 'affected': 18, 'africa': 19, 'african': 20, 'ahead': 21, 'airport': 22, 'allowed': 23, 'allows': 24, 'alongside': 25, 'although': 26, 'america': 27, 'american': 28, 'an': 29, 'analysis': 30, 'ancient': 31, 'angle': 32, 'animals': 33, 'annual': 34, 'appear': 35, 'appears': 36, 'aquatic': 37, 'archaeological': 38, 'archbishop': 39, 'architects': 40, 'areas': 41, 'armed': 42, 'armoured': 43, 'army': 44, 'art': 45, 'as': 46, 'aside': 47, 'aspects': 48, 'association': 49, 'attacking': 50, 'attained': 51, 'attempted': 52, 'attempts': 53, 'attention': 54, 'audience': 55, 'australia': 56, 'australian': 57, 'austria': 58, 'authority': 59, 'award': 60, 'awards': 61, 'away': 62, 'band'

In [29]:
vocab = TEXT.vocab
print(type(vocab.stoi))
print(len(vocab.stoi))

<class 'collections.defaultdict'>
27466


In [30]:
# In torchvision and PyTorch, the processing and batching of
# data is handled by DataLoaders. For some reason, torchtext
# has renamed the objects that do the exact same thing to
# Iterators. The basic functionality is the same

train_iter, test_iter = Iterator.splits(
        (train, test),
    
        # (90000,10000) means 90000 for train and 10000 for test,
        # the number of examples in each
        # That is, we only want to create one "batch" for each
        # as we are only doing this process in TorchText to convert
        # our data into a PyTorch tensor and then numpy array
        # object to be passed around
        # the matching networks program in the same way the
        # vision data was passed around in a numpy array
        # The matching networks program already takes care
        # of batching and we don't want to distrub things too much
    
        batch_sizes=(90000,10000))

In [31]:
# Look at the batch

# batch = next(train_iter.__iter__()); batch

In [32]:
type(train_iter.__iter__())

generator

In [33]:
# Currently, the iterator returns a custom datatype
# called torchtext.data.Batch.
# we’ll convert the batch to a tuple in the form
# (x, y) where x is the label tensor
# and y is the sentence

class BatchWrapper:
    def __init__(self, dl, x_var, y_var):
        
        self.dl, self.x_var, self.y_var = dl, x_var, y_var # we pass in the list of attributes for x and y
    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            y = getattr(batch, self.y_var) # we assume only one input in this wrapper

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [37]:
# Running the above BatchWrapper

train_dl = BatchWrapper(train_iter, "label", "sentence")
test_dl = BatchWrapper(test_iter, "label", "sentence")

print(type(train_dl))

<class '__main__.BatchWrapper'>


In [38]:
# Extracting the tensors from the BatchWrapper
# There is a tuple of tensors, one for the labels
# and one for the sentences

# X_test throws an error here.

X_train = next(train_dl.__iter__())
#X_test = next(test_dl.__iter__())

In [39]:
# If you print out the resulting tensor, you see
# TorchText has shuffled the examples, even though
# the defaults are not to shuffle. Even explicitly passing
# arguments not to shuffle does nothing. Here we
# rectify this, sorting the labels, and then
# sorting the tensor columns by the same
# permutaion as the labels

X_train_labels = X_train[0].numpy()
X_train_sentences = X_train[1].numpy()
p = X_train_labels.argsort()
X_train_labels = X_train_labels[p]
X_train_sentences = X_train_sentences[:,p]
print(X_train_sentences.shape)
print(X_train_labels.shape)

# X_test_labels = X_test[0].numpy()
# X_test_sentences = X_test[1].numpy()
# q = X_test_labels.argsort()
# X_test_labels = X_test_labels[q]
# X_test_sentences = X_test_sentences[:,q]
# print(X_test_sentences.shape)
# print(X_test_labels.shape)

(219, 90000)
(90000,)


In [40]:
# Taking tranpose to get data closer to form
# expected by matching network

X_train_sentences = X_train_sentences.transpose()
print(X_train_sentences.shape)

# X_test_sentences = X_test_sentences.transpose()
# print(X_test_sentences.shape)

(90000, 219)


In [41]:
# Reshaping sentences to be of dimension
# (examples per class, number of classes, length of longest sentence)

X_train_sentences = np.reshape(X_train_sentences, (10,9000,219))
print(X_train_sentences.shape)

# X_test_sentences = np.reshape(X_test_sentences, (10,9000,219))
# print(X_test_sentences.shape)

(10, 9000, 219)


In [42]:
# Reshaping sentences to be of dimension
# (number of classes, examples per class, length of longest sentence)

X_train_sentences = X_train_sentences.transpose((1, 0, 2))
print(X_train_sentences.shape)

# X_test_sentences = X_test_sentences.transpose((1, 0, 2))
# print(X_test_sentences.shape)

(9000, 10, 219)


In [43]:
# Reshaping labels to be of dimension
# (examples per class, number of classes)

X_train_labels = np.reshape(X_train_labels, (10,9000))
print(X_train_labels.shape)

# X_test_labels = np.reshape(X_test_labels, (10,9000))
# print(X_test_labels.shape)

(10, 9000)


In [44]:
# Reshaping labels to be of dimension
# (number of classes, examples per class)

X_train_labels = X_train_labels.transpose()
print(X_train_labels.shape)

# X_test_labels = X_test_labels.transpose()
# print(X_test_labels.shape)

(9000, 10)


In [66]:
# Saving data

np.save('X_train_sentences.npy', X_train_sentences)
np.save('X_train_labels.npy', X_train_labels)

# np.save('X_test_sentences.npy', X_test_sentences)
# np.save('X_test_labels.npy', X_test_labels)

In [None]:
# Some experiments

In [None]:
# Reshape

In [None]:
# We convert the tensors to numpy arrays
# for the Matching Networks code so we  don't
# have to change everything that was for numpy arrays
# to PyTorch tensors. We could could then convert
# back to Tensors when we need them

# Cell incomplete, issue with size of train. Should be 90,000, not 9,000 long

X_train = X_train.reshape()
Y_train = Y_train.reshape()
X_test = X_test.reshape()
Y_test = Y_test.reshape()

In [None]:
a = np.array([[1,2,3,4,5,6], [7,8,9,10,11,12], [13,14,15,16,17,18], [19,20,21,22,23,24], 
              [25,26,27,28,29,30], [31,32,33,34,35,36], [37,38,39,40,41,42], [43,44,45,46,47,48]])
print(a)

In [None]:
a = np.reshape(a, (2,4,6))
print(a)

In [None]:
# One hot encoding

In [3]:
batch_size = 5
nb_digits = 10
# Dummy input that HAS to be 2D for the scatter (you can use view(-1,1) if needed)
y = torch.LongTensor(batch_size,1).random_() % nb_digits
# One hot encoding buffer that you create out of the loop and just keep reusing
y_onehot = torch.FloatTensor(batch_size, nb_digits)

# In your for loop
y_onehot.zero_()
y_onehot.scatter_(1, y, 1)

print(y)
print(y_onehot)

tensor([[6],
        [9],
        [4],
        [8],
        [5]])
tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]])


In [76]:
a_orig = np.array([[1, 7, 5, 3],
           [2, 4, 1, 4]])
a_orig = a_orig.transpose()
print(a_orig)

[[1 2]
 [7 4]
 [5 1]
 [3 4]]


In [77]:
type(int(np.amax(a_orig)))

int

In [78]:
a = (np.arange(a_orig.max()) == a_orig[...,None]-1).astype(int)
print(a.shape)
print(a)

(4, 2, 7)
[[[1 0 0 0 0 0 0]
  [0 1 0 0 0 0 0]]

 [[0 0 0 0 0 0 1]
  [0 0 0 1 0 0 0]]

 [[0 0 0 0 1 0 0]
  [1 0 0 0 0 0 0]]

 [[0 0 1 0 0 0 0]
  [0 0 0 1 0 0 0]]]


In [79]:
a = a.sum(0)
print(a.shape)
print(a)

(2, 7)
[[1 0 1 0 1 0 1]
 [1 1 0 2 0 0 0]]


In [None]:
# Another one hot encoding, removing padding (ones)

In [89]:
active_tokens_mask = (a_orig != 1)
print(active_tokens_mask)
filtered = active_tokens_mask * a_orig
print(filtered)

X_Train_onehot = np.zeros((8,a_orig.shape[1]))

for c in range(len(a_orig[1])):
    max_pool = np.zeros((8))
    max_pool[filtered[:,c]] = 1
    max_pool[0]=0
#     print(X_Train_onehot[:,c].shape)
#     print(max_pool.shape)
    X_Train_onehot[:,c] = max_pool
    
print(X_Train_onehot)

[[False  True]
 [ True  True]
 [ True False]
 [ True  True]]
[[0 2]
 [7 4]
 [5 0]
 [3 4]]
[[0. 0.]
 [0. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 0.]
 [1. 0.]]
