# Data Processing

# **0. Set Up**

Before diving into the models, let's configure inline plotting and enable automatic reloading of external modules, so the notebook always runs against up-to-date code.

In [1]:
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.style.use('seaborn-darkgrid')

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

Then, let's load the dependencies:

In [2]:
# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data
from torchtext import datasets

from torch.nn.utils.rnn import pad_packed_sequence, PackedSequence
from torch.utils.data import DataLoader

# Data science
import spacy
import numpy as np
import time
import random

In [3]:
# Fix the seed of every random number generator used in this notebook
# (Python, NumPy, PyTorch) so that successive runs produce the same numbers.
SEED = 2020

for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Ask cuDNN to restrict itself to deterministic algorithms.
torch.backends.cudnn.deterministic = True

# **Baseline**


# 1. Dataset

### 1.1. NoReC fine

In [4]:
from torchtext.datasets import SequenceTaggingDataset


class NoReCfine(SequenceTaggingDataset):
    """Sequence tagging dataset for the NoReC fine corpus.

    Each split is read from a column-based file (one token and its label per
    line) through torchtext's ``SequenceTaggingDataset``.
    """

    def __init__(self, path, fields, encoding="utf-8", separator="\t", **kwargs):
        """Load one split of the corpus.

        Args:
            path: Path to the data file of the split.
            fields: List of ``(name, Field)`` pairs, one per column of the file.
            encoding: Text encoding of the data file.
            separator: String separating the columns on each line.
            **kwargs: Extra keyword arguments forwarded to the parent dataset.
        """
        # Forward every option to the parent class; previously ``encoding``,
        # ``separator`` and ``kwargs`` were accepted but silently dropped.
        super().__init__(path, fields, encoding=encoding, separator=separator, **kwargs)

    @classmethod
    def splits(cls, fields, train_data="data/train.conll", dev_data="data/dev.conll", test_data="data/test.conll", **kwargs):
        """Build the train, development and test datasets.

        Args:
            fields: List of ``(name, Field)`` pairs shared by the three splits.
            train_data: Path to the training file.
            dev_data: Path to the development file.
            test_data: Path to the test file.
            **kwargs: Extra keyword arguments passed to the constructor of every split
                (e.g. ``encoding`` or ``separator``).

        Returns:
            A ``(train, dev, test)`` tuple of datasets.
        """
        # Instantiate through ``cls`` so that subclasses get instances of their own type.
        return tuple(cls(path, fields, **kwargs) for path in (train_data, dev_data, test_data))

In [5]:
import torchtext

# Tokens keep their original casing and are batched together with their lengths.
TEXT = torchtext.data.Field(batch_first=True, include_lengths=True, lower=False)
# Labels do not get an unknown token: every label is expected to be in the vocabulary.
LABEL = torchtext.data.Field(unk_token=None, batch_first=True)

# One (name, field) pair per column of the data files.
FIELDS = [
    ("text", TEXT),
    ("label", LABEL),
]

train_data, eval_data, test_data = NoReCfine.splits(fields=FIELDS)

In [6]:
# Size of each split. The second line used to be mislabelled as "training examples"
# although it reports the evaluation split.
print(f'Number of training examples:   {len(train_data):,}')
print(f'Number of validation examples: {len(eval_data):,}')
print(f'Number of testing examples:    {len(test_data):,}')

# Number of tokens of every training sentence (`train_data.text` iterates over the
# `text` attribute of the examples, so no intermediate list is needed).
text_length = [len(sentence) for sentence in train_data.text]

print(f"\nNumber of sentences in train_data.text: {len(text_length)}")
print(f'Number of words in train_data: {sum(text_length):,}')

Number of training examples: 5,915
Number of training examples: 1,151
Number of testing examples:    895

Number of sentences in train_data.text: 5915
Number of words in train_data: 98,483


In [7]:
# Display the attributes of the first training example, below a short header.
print("What's inside the training data:", vars(train_data[0]), sep="\n")

What's inside the training data:
{'text': ['Lite', 'tight', 'Tempah'], 'label': ['O', 'O', 'B-targ-Negative']}


### 1.2. Vocabulary

In [8]:
from torchtext.vocab import Vectors

# Upper bound on the number of tokens kept in the text vocabulary.
VOCAB_SIZE = 1_200_000
# Pre-trained word embeddings, read from `model.txt`.
VECTORS = Vectors(name='model.txt')

# Create the vocabulary for words embeddings, from the training split only.
# NOTE: `unk_init` is deliberately not passed to `build_vocab`: torchtext only forwards
# it when `vectors` is given as a string alias, so with a ready-made `Vectors` object it
# was silently ignored. Words missing from the pre-trained embeddings therefore get the
# default (zero) vector of `VECTORS`. To initialise them from a normal distribution
# instead, build the embeddings with `Vectors(name=..., unk_init=torch.Tensor.normal_)`.
TEXT.build_vocab(train_data,
                 max_size=VOCAB_SIZE,
                 vectors=VECTORS)

# Create the vocabulary for the labels.
LABEL.build_vocab(train_data)

### 1.3. Data Analysis

In [9]:
# Shape of the full matrix of pre-trained embeddings loaded in VECTORS.
VECTORS.vectors.shape

torch.Size([1182371, 100])

In [10]:
# Shape of the embedding matrix attached to the text vocabulary built on the training data.
TEXT.vocab.vectors.shape

torch.Size([19192, 100])

In [11]:
# Preview the first 20 entries of the text vocabulary, below a header and a blank line.
print(
    "Vocabulary defined on the training data, with the help of pre-trained embeddings:\n",
    TEXT.vocab.itos[:20],
    sep="\n",
)

Vocabulary defined on the training data, with the help of pre-trained embeddings:

['<unk>', '<pad>', '.', ',', 'og', 'er', 'i', 'som', 'en', 'det', 'på', 'å', 'av', 'med', 'til', 'for', '«', '»', 'har', 'den']


In [12]:
# Vocabulary index of a word used to illustrate a token without pre-trained embedding.
oov_index = 8867

# Show the word stored at that index, then the vector attached to it.
print(
    f"The word '{TEXT.vocab.itos[oov_index]}' is not part of the pre-trained embeddings.\n",
    f"PyTorch sets its vector to zero:\n{TEXT.vocab.vectors[oov_index]}",
    sep="\n",
)

The word 'Martel' is not part of the pre-trained embeddings.

PyTorch sets its vector to zero:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])


In [13]:
# Summary of both vocabularies: all the labels, the first 10 text entries,
# and the 10 most frequent words with their counts.
print(
    "Vocabulary informations:\n",
    f"Labels vocabulary:\n{LABEL.vocab.itos}\n",
    f"Text vocabulary:\n{TEXT.vocab.itos[:10]}\n",
    f"Most frequent words:\n{TEXT.vocab.freqs.most_common(10)}",
    sep="\n",
)

Vocabulary informations:

Labels vocabulary:
['<pad>', 'O', 'I-targ-Positive', 'B-targ-Positive', 'I-targ-Negative', 'B-targ-Negative']

Text vocabulary:
['<unk>', '<pad>', '.', ',', 'og', 'er', 'i', 'som', 'en', 'det']

Most frequent words:
[('.', 4585), (',', 4085), ('og', 2878), ('er', 2326), ('i', 2071), ('som', 1699), ('en', 1628), ('på', 1318), ('det', 1318), ('å', 1222)]


# 2. Processing