# Assignment 2 - Part 2
POS Tagging with feed forward Neural Network

In [1]:
!which python3

/home/development/arif/.conda/envs/stable/bin/python3


In [2]:
! which pip

/home/development/arif/.conda/envs/stable/bin/pip


## Importing Libraries

In [3]:
import nltk
from nltk.corpus import brown
import numpy as np
import torch
import random
import os
from torch import nn
from tqdm import tqdm

Seeding randomness

In [4]:
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
random.seed(seed)

In [5]:
len(brown.sents())

57340

In [6]:
len(set(brown.words()))

56057

In [7]:
dataset = brown.tagged_sents(tagset='universal')
print(len(dataset))
for sent in dataset[:10]:
    print(sent)

57340
[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')]
[('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('

## Data Exploration

In [8]:
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer


def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    return words

In [9]:
_, tags = zip(*[pair for sent in dataset for pair in sent])
universal_tags = list(set(tags))
del tags
print(f"POS tags are:\n{universal_tags}\nNumber of POS tags are: {len(universal_tags)}")

POS tags are:
['NUM', 'DET', 'ADJ', 'ADV', 'PRT', '.', 'CONJ', 'VERB', 'X', 'ADP', 'NOUN', 'PRON']
Number of POS tags are: 12


In [10]:
from nltk.tag import untag

Seperating sentences and their tags

In [11]:
sentences = [untag(sent) for sent in dataset]
pos_tags = [list(list(zip(*sent))[1]) for sent in dataset]

len(sentences), len(pos_tags)

(57340, 57340)

Looking at data

In [12]:
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = dataset.iloc[picks]
    display(HTML(df.to_html()))

In [13]:
df = pd.DataFrame.from_dict({
    "sentences" : sentences,
    "pos_tags" : pos_tags
})
show_random_elements(df)

Unnamed: 0,sentences,pos_tags
41905,"[When, Linda, Kay, had, put, up, her, breakfast, dishes, and, mopped, her, linoleum, rugs, ,, she, would, go, to, the, Big, House, .]","[ADV, NOUN, NOUN, VERB, VERB, PRT, DET, NOUN, NOUN, CONJ, VERB, DET, NOUN, NOUN, ., PRON, VERB, VERB, ADP, DET, ADJ, NOUN, .]"
7296,"[The, critics', campaign, finally, inspired, the, first, major, U.S., exhibit, of, Schiele's, works, .]","[DET, NOUN, NOUN, ADV, VERB, DET, ADJ, ADJ, NOUN, NOUN, ADP, NOUN, NOUN, .]"
1639,"[You, can, get, into, an, argument, about, fallout, shelters, at, the, drop, of, a, beer, stein, in, clubs, and, pubs, these, nights, .]","[PRON, VERB, VERB, ADP, DET, NOUN, ADP, NOUN, NOUN, ADP, DET, NOUN, ADP, DET, NOUN, NOUN, ADP, NOUN, CONJ, NOUN, DET, NOUN, .]"
48598,"[``, To, me, you'll, always, be, the, girl, o', my, dreams, ,, an', the, sweetest, flower, that, grows, '', .]","[., ADP, PRON, PRT, ADV, VERB, DET, NOUN, ADP, DET, NOUN, ., CONJ, DET, ADJ, NOUN, DET, VERB, ., .]"
18024,"[Another, poultice, was, made, from, the, inner, bark, of, the, elm, tree, ,, steeped, in, water, until, it, formed, a, sticky, ,, gummy, solution, .]","[DET, NOUN, VERB, VERB, ADP, DET, ADJ, NOUN, ADP, DET, NOUN, NOUN, ., VERB, ADP, NOUN, ADP, PRON, VERB, DET, ADJ, ., ADJ, NOUN, .]"
16049,"[When, a, husband, is, sexually, selfish, and, heedless, of, his, wife's, desires, ,, she, is, cheated, of, the, fulfillment, and, pleasure, nature, intended, for, her, .]","[ADV, DET, NOUN, VERB, ADV, ADJ, CONJ, ADJ, ADP, DET, NOUN, NOUN, ., PRON, VERB, VERB, ADP, DET, NOUN, CONJ, NOUN, NOUN, VERB, ADP, PRON, .]"
14628,"[These, include, :, leaves, of, absences, ,, illnesses, ,, and, layoffs, .]","[DET, VERB, ., NOUN, ADP, NOUN, ., NOUN, ., CONJ, NOUN, .]"
9144,"[More, and, more, boats, move, overland, on, wheels, (, 1.8, million, trailers, are, now, in, use, ), and, Midwesterners, taking, long, weekends, can, travel, south, with, their, craft, .]","[ADJ, CONJ, ADJ, NOUN, VERB, ADV, ADP, NOUN, ., NUM, NUM, NOUN, VERB, ADV, ADP, NOUN, ., CONJ, NOUN, VERB, ADJ, NOUN, VERB, VERB, NOUN, ADP, DET, NOUN, .]"
48265,"[Why, had, I, registered, ?, ?]","[ADV, VERB, PRON, VERB, ., .]"
6717,"[At, Khrushchev's, door, ,, therefore, ,, can, be, placed, the, primary, blame, but, also, at, fault, are, those, who, permitted, themselves, to, be, intimidated, .]","[ADP, NOUN, NOUN, ., ADV, ., VERB, VERB, VERB, DET, ADJ, NOUN, CONJ, ADV, ADP, NOUN, VERB, DET, PRON, VERB, PRON, PRT, VERB, VERB, .]"


## Download word vectors

In [14]:
# import urllib.request
# import zipfile

# url = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
# filename = "crawl-300d-2M.vec.zip"

# # Download the zip file
# urllib.request.urlretrieve(url, filename)

# # Extract the contents of the zip file
# with zipfile.ZipFile(filename, 'r') as zip_ref:
#     zip_ref.extractall('.')

In [15]:
with open("crawl-300d-2M.vec", "r") as vectors_file:
    file_content = vectors_file.read().splitlines()

In [16]:
print(f"1st line in the above file, file_content tells the number of words for which the file has embeddings followed by the dimensions of the embeddings.")
print(file_content[0])

1st line in the above file, file_content tells the number of words for which the file has embeddings followed by the dimensions of the embeddings.
1999995 300


In [17]:
print(f"The rest of the lines have a word at every line followed by 300 numbers for the 300d word embeddings of the word, a sample is shown below.")
print(file_content[424242])

The rest of the lines have a word at every line followed by 300 numbers for the 300d word embeddings of the word, a sample is shown below.
Wreck-It -0.2941 -0.0440 0.6280 0.3474 0.5014 -1.0144 0.5379 -0.2273 -0.0119 -0.2189 0.0294 1.2556 -0.1439 0.2246 0.0659 -0.0031 0.1349 0.3719 -0.5852 -0.3380 0.4387 0.4592 0.3349 -0.4750 -0.3464 0.6705 -0.5252 0.3864 -0.1200 0.1228 -0.1977 0.9876 0.3050 0.2154 0.8681 -0.2432 0.0239 -0.2227 -0.3440 0.5673 0.7380 0.1509 0.0166 0.6669 -0.4747 0.2333 0.0375 0.7208 -0.4009 -0.3855 0.2377 0.2651 1.1744 -0.5010 -0.0412 0.3804 0.2547 0.2091 -0.4663 0.0147 -0.5130 -0.1848 0.1726 0.3357 0.8713 0.7080 0.8966 -0.8118 0.0568 -0.5825 -0.6531 0.2765 -0.1806 0.2960 0.4276 0.3042 -0.2592 0.2431 -0.3561 -0.0708 0.5813 0.2686 0.4225 -0.1953 0.4349 -0.3406 0.0300 -0.5246 -0.1651 -0.4520 0.1233 0.1704 -0.0815 0.2655 -0.2337 -0.4135 -0.0224 0.5407 0.0084 -0.2461 -0.2733 0.2132 -0.0585 -0.1620 0.3127 -0.3421 -0.7229 0.6596 0.4628 -0.2809 -0.4614 0.0984 0.4248 -0.7097 -0.

Making a dictionary mapping from the word to the embedding, called `word2embedding`.

In [18]:
word2embedding = dict()
for line in file_content[1:]:
    word2embedding[line.split()[0]] = torch.tensor(list(map(float, line.split()[1:])))
    
vocab = list(word2embedding.keys())

As `file_content` is a large variable that is no longer needed, we delete it.

In [19]:
del file_content

Checking for duplication is vocabulary `vocab`

In [20]:
len(vocab), len(set(vocab))

(1999995, 1999995)

Notice that same word can have different embeddings for its capitalized and non-capitalized versions.

In [21]:
word2embedding['The']

tensor([ 3.4100e-02,  2.3550e-01, -6.3600e-02, -2.6600e-02,  3.9000e-02,
         1.8200e-02,  1.5850e-01, -3.9070e-01, -4.3700e-02, -4.8400e-02,
        -1.0740e-01,  8.3800e-02, -2.5350e-01, -3.0200e-02, -1.5200e-01,
        -2.3300e-02,  2.1290e-01, -1.2400e-02, -5.9100e-02,  4.3200e-02,
        -2.9000e-03, -6.3700e-02,  8.1700e-02, -5.1700e-02,  5.1900e-02,
         4.9900e-02, -1.5120e-01, -1.5300e-02, -5.8800e-02, -3.3890e-01,
         3.1600e-02,  2.5000e-03,  1.7000e-02,  2.0200e-01,  2.9000e-02,
        -2.1000e-03, -2.6000e-03,  5.3000e-02,  1.3900e-02,  1.2660e-01,
         5.7500e-02, -2.5300e-02, -7.8000e-02, -1.8300e-02, -1.4100e-01,
        -8.2000e-03,  4.2100e-02, -5.5000e-03, -1.9000e-03, -7.8200e-02,
         2.3600e-02,  3.4040e-01, -1.3570e-01, -9.4500e-02, -2.3200e-02,
         4.2600e-02,  5.9800e-02,  2.1380e-01,  1.0600e-02, -8.6500e-02,
         2.4990e-01,  2.7580e-01,  1.0400e-01,  1.2040e-01, -1.4020e-01,
        -1.0300e-02, -2.1500e-01,  2.8000e-03,  1.2

In [22]:
word2embedding['the']

tensor([ 2.3100e-02,  1.7000e-02,  1.5700e-02, -7.7300e-02,  1.0880e-01,
         3.1000e-03, -1.4870e-01, -2.6720e-01, -3.5700e-02, -4.8700e-02,
         8.0700e-02,  1.5320e-01, -7.3900e-02, -2.9100e-02, -4.4500e-02,
        -1.4000e-03,  1.0140e-01,  1.8600e-02, -2.5300e-02,  2.0000e-02,
        -2.6000e-03, -1.7900e-02,  5.0000e-04,  5.4000e-03, -1.3400e-02,
         2.3300e-02, -7.5500e-02, -1.5600e-02,  4.1500e-02, -4.9850e-01,
         4.1000e-02, -6.1600e-02,  4.7000e-03,  3.2500e-02, -1.6200e-02,
        -1.7200e-02,  9.8800e-02,  7.6600e-02, -7.9600e-02, -3.4500e-02,
         1.2400e-02, -1.0070e-01, -2.9200e-02, -7.6200e-02, -1.2610e-01,
        -5.3100e-02,  4.2400e-02,  1.4400e-02, -6.8300e-02,  2.8590e-01,
         3.9900e-02,  2.0100e-02,  3.2400e-01, -6.5600e-02, -4.9700e-02,
         9.0000e-03,  9.0200e-02, -1.3800e-02, -4.1200e-02, -2.9700e-02,
         3.1390e-01, -1.4280e-01,  1.6600e-02, -2.1900e-02, -5.7500e-02,
         1.3590e-01, -1.6550e-01,  1.9000e-03,  3.2

In [23]:
# Looking at data type of embedding to be sure
word2embedding['the'].dtype

torch.float32

Checking if we have word embeddings for individual letters and symbols

In [24]:
import string

list_of_all = [x for x in string.punctuation] + [x for x in string.digits] + [x for x in string.ascii_letters]
not_present = list()
for item in list_of_all:
    if item not in vocab:
        not_present.append(item)

print(f"We have following individual characters:\n{list_of_all}\nfrom which only characters {not_present} donot have embeddings in word2embedding.")

We have following individual characters:
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
from which only characters ['_', '`'] donot have embeddings in word2embedding.


## Preparing data for training

In [25]:
# Make device agnostic code
device = "cuda:5" if torch.cuda.is_available() else "cpu"
print(device)

cuda:5


In [26]:
window_size = 1
X = list()
y = list()

null_token = "[NULL_TOKEN]"
for sentence, pos_tags in zip(sentences, pos_tags):
    for i in range(len(sentence)):
        y.append(pos_tags[i])
        padded_sentence = [null_token] * window_size + sentence + [null_token] * window_size
        X.append(padded_sentence[i : i + 2*window_size + 1])

In [27]:
X[:5], y[:5]

([['[NULL_TOKEN]', 'The', 'Fulton'],
  ['The', 'Fulton', 'County'],
  ['Fulton', 'County', 'Grand'],
  ['County', 'Grand', 'Jury'],
  ['Grand', 'Jury', 'said']],
 ['DET', 'NOUN', 'NOUN', 'ADJ', 'NOUN'])

Setting `[NULL_TOKEN]` embeddings as all zeros

In [28]:
word2embedding[null_token] = torch.zeros_like(word2embedding['The'])

In [29]:
word2embedding[null_token], word2embedding[null_token].dtype

(tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

### Code for handling embeddings of words not in the word embeddings file

In [30]:
import re

def decontracted(phrase):
    # general
    phrase = re.sub(r"n\'t", "", phrase)
    phrase = re.sub(r"\'re", "", phrase)
    phrase = re.sub(r"\'s", "", phrase)
    phrase = re.sub(r"\'d", "", phrase)
    phrase = re.sub(r"\'ll", "", phrase)
    phrase = re.sub(r"\'t", "", phrase)
    phrase = re.sub(r"\'ve", "", phrase)
    phrase = re.sub(r"\'m", "", phrase)
    phrase = re.sub(r"\'$", "", phrase)
    return phrase


test = "Hey I'm Yann, how're you and how's it going ? That's interesting: I'd love to hear more about it."
print(decontracted(test))

Hey I Yann, how you and how it going ? That interesting: I love to hear more about it.


In [31]:
word2embedding["``"] = word2embedding["'"]
word2embedding["''"] = word2embedding["'"]
word2embedding[not_present[0]] = word2embedding["-"]

In [32]:
import re
# Define a regular expression pattern
# Pattern to check if a number only has digits, dollar signs, and commas.

non_alphabet_pattern = r'^[^a-zA-Z]+$'
non_alphabet_word_list = []
for word in list(word2embedding.keys()):
    if re.match(non_alphabet_pattern, word):
        # print(word)
        non_alphabet_word_list.append(word)

word2embedding["[NON_ALPHABET]"] = torch.mean(
    torch.stack([word2embedding[num] for num in non_alphabet_word_list]), 
    dim=0
    )

In [33]:
non_alphabet_word_list

[',',
 '.',
 ')',
 ':',
 '"',
 '(',
 '!',
 '?',
 '/',
 '-',
 '”',
 '“',
 '...',
 ';',
 "'",
 '–',
 '1',
 '2',
 '’',
 '&',
 '3',
 '…',
 '—',
 ']',
 '|',
 '4',
 '[',
 '5',
 '10',
 '*',
 '‘',
 '6',
 '#',
 '..',
 '8',
 '$',
 '7',
 '12',
 '2012',
 '--',
 '2013',
 '2017',
 '+',
 '11',
 '2016',
 '@',
 '>',
 '20',
 '....',
 '2011',
 '2014',
 '15',
 '9',
 '2015',
 '=',
 '2010',
 '16',
 '13',
 '14',
 '•',
 '30',
 '18',
 '2009',
 '17',
 '2008',
 '%',
 '»',
 '0',
 '25',
 '21',
 '22',
 '19',
 '24',
 '2007',
 '23',
 '~',
 '50',
 '2006',
 '26',
 '28',
 '100',
 '27',
 '2005',
 '<',
 '29',
 '40',
 '·',
 '2004',
 '.....',
 '31',
 '}',
 '�',
 '\\',
 '{',
 '2003',
 '2000',
 '£',
 '«',
 '→',
 '^',
 '60',
 '32',
 '2002',
 '®',
 '35',
 '2001',
 '01',
 '07',
 '02',
 '🙂',
 '08',
 '09',
 '06',
 '......',
 '03',
 '05',
 '04',
 '45',
 '80',
 '36',
 '200',
 '←',
 '34',
 '33',
 '70',
 '€',
 '™',
 '1999',
 '42',
 '90',
 '500',
 '38',
 '48',
 '1998',
 '37',
 '44',
 '©',
 '55',
 '300',
 '39',
 '1997',
 '41',
 '1996',


In [34]:
word2embedding["[NON_ALPHABET]"], word2embedding["[NON_ALPHABET]"].shape

(tensor([-2.3801e-02,  1.8670e-01,  1.9960e-01,  7.0248e-03, -8.0534e-03,
         -5.3447e-02,  1.4042e-01, -1.7458e-01,  7.8274e-02,  1.5561e-01,
         -4.5317e-02,  7.2526e-01, -1.1935e-01, -5.1581e-02,  2.1298e-03,
         -2.8344e-02, -9.0454e-02, -6.5800e-02, -1.1985e-01, -4.8639e-02,
         -3.0983e-02,  7.6132e-03,  9.0279e-02,  3.6764e-02,  1.9740e-02,
          6.0368e-02,  3.1835e-02,  5.4218e-02,  8.5114e-02, -1.4278e-01,
          4.6164e-02,  1.1858e-01, -1.2316e-01,  1.9545e-01,  2.7163e-02,
          9.2470e-02, -5.0190e-02, -3.2961e-03,  1.9826e-01, -1.3013e-01,
         -2.2385e-03,  2.2055e-01, -2.1592e-02, -1.2384e-01, -9.1814e-02,
          9.0625e-02, -9.9048e-02,  1.1771e-01, -9.5122e-02,  2.1546e-01,
          1.0088e-01,  3.1869e-01,  8.8194e-01, -1.4286e-02,  6.7230e-03,
         -6.0228e-02, -3.5785e-02,  1.1145e-01,  1.1655e-01, -7.1183e-02,
         -3.9561e-02,  3.9254e-01,  1.1663e-01,  1.3539e-02,  1.9445e-02,
         -1.0353e-01,  5.9360e-02, -4.

A sample of how we'll use subwords seperated by puctuations to find embeddings of large compund words, not present in `vocab`

In [35]:
letters = list("ex-Mrs.")
punctuations = [x for x in string.punctuation]
positions = [i for i, x in enumerate(letters) if x in punctuations]

print(letters)
print(positions)

sub_words = []
for i, j in zip([0] + positions, positions + [None]):
    if i==0:
        sub_words.append("".join(letters[i:j]))
    else:
        sub_words.append(letters[i])
        sub_words.append("".join(letters[i+1:j]))
    
while "" in sub_words:
    sub_words.remove("")
print(sub_words)

sub_words_embeddings = []
for sub_word in sub_words:
    sub_words_embeddings.append(word2embedding[sub_word])

print(torch.mean(torch.stack(sub_words_embeddings), dim=0))

['e', 'x', '-', 'M', 'r', 's', '.']
[2, 6]
['ex', '-', 'Mrs', '.']
tensor([ 7.9575e-02, -3.7300e-02, -2.1530e-01, -1.4200e-02, -1.6500e-03,
        -1.2237e-01,  1.7050e-01, -4.2400e-02, -5.5150e-02, -7.7375e-02,
        -6.5125e-02,  2.7000e-01, -2.2225e-02, -2.6000e-03, -4.0350e-02,
        -5.8175e-02, -2.6825e-02,  1.0875e-02, -8.1950e-02,  9.5975e-02,
         3.1750e-03,  2.5700e-02, -9.6450e-02, -1.6780e-01, -2.9275e-02,
        -3.0750e-03,  5.5250e-03,  7.0350e-02,  9.0525e-02,  1.4958e-01,
         3.6800e-02, -3.3500e-02, -1.0813e-01,  1.6862e-01, -1.0500e-01,
        -3.1500e-02, -5.7125e-02,  0.0000e+00, -3.2725e-02, -6.4325e-02,
        -9.5225e-02, -3.7150e-02,  1.3418e-01,  1.0030e-01,  1.9827e-01,
         3.7025e-02, -1.5350e-02, -7.5175e-02, -6.0300e-02, -2.5732e-01,
        -5.3400e-02,  7.7875e-02,  2.9350e-02, -2.5450e-02,  1.7875e-02,
        -9.3400e-02, -6.8050e-02,  2.2273e-01,  6.0375e-02,  2.5000e-04,
        -7.0575e-02,  1.9107e-01,  4.5425e-02, -7.5250e-0

In [36]:
import string
punctuations = [x for x in string.punctuation]

word2embedding["[UNK]"] = torch.mean(torch.stack([embedding for _, embedding in word2embedding.items()]), dim=0)

set_of_vocab = set(word2embedding.keys())

def get_embeddings(word, word2embedding, words_not_present):
    if word in word2embedding.keys():
        return word2embedding[word]
    elif re.match(non_alphabet_pattern, word):
        return word2embedding["[NON_ALPHABET]"]
    elif decontracted(word) in word2embedding.keys():
        return word2embedding[decontracted(word)]
    elif decontracted(word).lower() in word2embedding.keys():
        return word2embedding[decontracted(word).lower()]
    else:
        letters = list(word)
        positions = [i for i, x in enumerate(letters) if x in punctuations]

        sub_words = []
        for i, j in zip([0] + positions, positions + [None]):
            if i==0:
                sub_words.append("".join(letters[i:j]))
            else:
                sub_words.append(letters[i])
                sub_words.append("".join(letters[i+1:j]))
            
        while "" in sub_words:
            sub_words.remove("")
        
        if set(sub_words).issubset(set_of_vocab):
            return torch.mean(torch.stack([get_embeddings(sub_word, word2embedding, words_not_present) for sub_word in sub_words]), dim=0)
        else:
            words_not_present.append(word)
            return word2embedding["[UNK]"]
            # words_embeddings.append(word2embedding[word])

In [37]:
words_not_present = list()

X_embeddings = list()
for words in tqdm(X):
    words_embeddings = list()
    for word in words:
        words_embeddings.append(get_embeddings(word, word2embedding, words_not_present))
    X_embeddings.append(torch.stack(words_embeddings))

X_embeddings = torch.stack(X_embeddings)

100%|██████████| 1161192/1161192 [00:11<00:00, 102332.93it/s]


In [38]:
with open("words_not_present.txt", "w") as f:
    for word in words_not_present:
        f.write(f"{word}\n")

In [39]:
len(words_not_present)

7165

In [40]:
len(set(words_not_present))

1703

In [41]:
vocab = list(word2embedding.keys())
len(vocab), len(set(vocab))

(2000001, 2000001)

A dictionary to convert pos tags to ids

In [42]:
pos_tag_to_id = dict()
for i, tag in enumerate(sorted(universal_tags)):
    pos_tag_to_id[tag] = i

In [43]:
pos_tag_to_id

{'.': 0,
 'ADJ': 1,
 'ADP': 2,
 'ADV': 3,
 'CONJ': 4,
 'DET': 5,
 'NOUN': 6,
 'NUM': 7,
 'PRON': 8,
 'PRT': 9,
 'VERB': 10,
 'X': 11}

In [44]:
len(X_embeddings), len(y)

(1161192, 1161192)

Actual conversion of labels (pos tags) to ids below:

In [45]:
for i in range(len(y)):
    y[i] = torch.tensor(pos_tag_to_id[y[i]])

In [46]:
y = torch.stack(y)

In [47]:
y.dtype

torch.int64

In [48]:
y[0].dtype

torch.int64

Test-train split

In [49]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_embeddings, 
                                                    y,
                                                    test_size = 0.1,
                                                    random_state =42)

In [50]:
len(X_train), len(X_test), len(y_train), len(y_test)

(1045072, 116120, 1045072, 116120)

In [51]:
type(X_train), type(X_test), type(y_train), type(y_test)

(torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor)

### Final leg of data preparation

In [52]:
type(X_train), len(X_train), type(X_train[0]), len(X_train[0]), type(X_train[0][0]), len(X_train[0][0])

(torch.Tensor, 1045072, torch.Tensor, 3, torch.Tensor, 300)

`X_train` is of dimension `num_of_data_points` x `window_size` x `embedding_size`

In [53]:
# Set the number of epochs
epochs = 3

# Put the data on the target device
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

## Training loop

In [54]:
# Construct a model that subclasses NN.Module

class POS_tagger(nn.Module):
    def __init__(self, embedding_size, window_size, hidden_layer_size, num_tags, device):
        super().__init__()
        
        self.embedding_size = embedding_size
        self.window_size = window_size
        self.device = device
        self.layers_embeddigs_to_hidden = nn.ModuleList([nn.Linear(in_features=embedding_size, out_features=hidden_layer_size, bias=True) for _ in range(2*window_size + 1)])
        self.activation = nn.ReLU()
        self.layer_hidden_to_output = nn.Linear(in_features=hidden_layer_size, out_features=num_tags, bias=True)
        # self.output_probabilities = nn.Softmax(dim=1)
        
        
        
    # Define a `forward()` method that outlines the forward pass (or forward computation) of the model
    def forward(self, batched_embeddings):
        inputs_to_hidden = torch.stack([
            torch.stack([layer(embedding) for embedding, layer in zip(embeddings, self.layers_embeddigs_to_hidden)])
            for embeddings in batched_embeddings
            ]).to(self.device)
        return self.layer_hidden_to_output(self.activation(torch.mean(inputs_to_hidden, dim=1)))
        # return self.output_probabilities(self.layer_embeddigs_to_word(self.layer_context_to_embeddings(context_embeddings_average)))            # x -> layer_1 -> layer_2 -> output

In [55]:
pos_tagger = POS_tagger(embedding_size=300, window_size=1, hidden_layer_size=128, num_tags=len(universal_tags), device=device).to(device)

In [56]:
device

'cuda:5'

In [57]:
# for parameter in pos_tagger.layers_embeddigs_to_hidden[0].parameters():
#     print(parameter)

In [58]:
print(pos_tagger)

POS_tagger(
  (layers_embeddigs_to_hidden): ModuleList(
    (0): Linear(in_features=300, out_features=128, bias=True)
    (1): Linear(in_features=300, out_features=128, bias=True)
    (2): Linear(in_features=300, out_features=128, bias=True)
  )
  (activation): ReLU()
  (layer_hidden_to_output): Linear(in_features=128, out_features=12, bias=True)
)


In [59]:
pos_tagger.state_dict()

OrderedDict([('layers_embeddigs_to_hidden.0.weight',
              tensor([[ 0.0441,  0.0479, -0.0135,  ..., -0.0045, -0.0192, -0.0187],
                      [ 0.0019, -0.0122, -0.0199,  ..., -0.0508,  0.0403,  0.0067],
                      [-0.0311,  0.0302, -0.0546,  ..., -0.0237, -0.0143, -0.0223],
                      ...,
                      [ 0.0527, -0.0576,  0.0222,  ..., -0.0255, -0.0458,  0.0570],
                      [-0.0353,  0.0326,  0.0238,  ...,  0.0176, -0.0237,  0.0533],
                      [-0.0546,  0.0252, -0.0439,  ..., -0.0310,  0.0087,  0.0520]],
                     device='cuda:5')),
             ('layers_embeddigs_to_hidden.0.bias',
              tensor([ 0.0218, -0.0260, -0.0268,  0.0015,  0.0040,  0.0442, -0.0430, -0.0533,
                       0.0353, -0.0284, -0.0435,  0.0147, -0.0560,  0.0334, -0.0252,  0.0453,
                      -0.0549, -0.0550, -0.0233,  0.0488, -0.0500, -0.0354,  0.0525,  0.0450,
                       0.0050, -0.0485,  0

Training loop setup

In [60]:
# Create a loss function
loss_fn = nn.CrossEntropyLoss() # CrossEntropyLoss -> sigmoid activation is built-in

# Create an optimizer
optimizer = torch.optim.Adam(params=pos_tagger.parameters(),
                            lr = 1e-3)

In [61]:
torch.cuda.memory_allocated()

0

In [62]:
torch.cuda.max_memory_allocated()

0

In [63]:
# Calculate accuracy
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()      # torch.eq(y_true, y_pred) checks how many of y_true are actually equal to y_pred
    accuracy = correct / len(y_pred) * 100
    return accuracy

In [64]:
# torch.tensor(torch.nn.Softmax(y_logits))

In [65]:
batch_size = 256
# Building the training and evaluation loop
for epoch in range(epochs):
    epoch_loss = 0.0
    ## Training
    pos_tagger.train()
    with tqdm(np.random.permutation(range(0, len(X_train), batch_size))) as progress_bar:
        for i in progress_bar:
            # 1. Forward pass
            y_logits = pos_tagger(X_train[i:i+batch_size]).squeeze()      # squeeze removes an extra one dimension from a tensor
            # y_pred = torch.argmax(torch.nn.Softmax(y_logits), dim=1)   # Can't make it work with sotmax for some reason
            y_pred = torch.argmax(y_logits, dim=1)
        
            # 2. Calculate loss / accuracy
            loss = loss_fn(y_logits,        # nn.CrossEntropyLoss expects raw logits as input
                        y_train[i:i+batch_size])
        
            epoch_loss += loss
            
            acc = accuracy_fn(y_true=y_train[i:i+batch_size],
                            y_pred=y_pred)
        
            # 3. Optimizer zero grad
            optimizer.zero_grad()
        
            # 4. Loss backward (back propagation)
            loss.backward()
        
            # 5. Optimizer step (gradient descent)
            optimizer.step()
            
            progress_bar.set_postfix(batch_loss=loss.item())
        
        
        ### Testing
        pos_tagger.eval()
        with torch.inference_mode():
            # 1. Forward pass
            test_logits = pos_tagger(X_test).squeeze()      # squeeze removes an extra one dimension from a tensor
            test_pred = torch.argmax(test_logits, dim=1)
            
            # 2. Calculate loss / accuracy
            test_loss = loss_fn(test_logits,        # nn.CrossEntropyLoss expects raw logits as input
                                y_test)
            
            test_acc = accuracy_fn(y_true=y_test,
                            y_pred=test_pred)
            
        # if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f} | Acc: {acc:.2f}% | Test loss: {test_loss:.5f} | Test acc: {test_acc:.2f}%")

100%|██████████| 4083/4083 [05:55<00:00, 11.48it/s, batch_loss=0.0624]


Epoch: 0 | Loss: 0.06243 | Acc: 97.27% | Test loss: 0.08928 | Test acc: 97.17%


100%|██████████| 4083/4083 [06:08<00:00, 11.08it/s, batch_loss=0.0812]


Epoch: 1 | Loss: 0.08118 | Acc: 97.27% | Test loss: 0.07083 | Test acc: 97.71%


100%|██████████| 4083/4083 [05:40<00:00, 11.98it/s, batch_loss=0.0574]


Epoch: 2 | Loss: 0.05736 | Acc: 97.27% | Test loss: 0.06310 | Test acc: 97.98%


## Code for Demo of POS Tagging

In [66]:
id_to_pos_tag = {id: tag for tag, id in pos_tag_to_id.items()}
id_to_pos_tag

{0: '.',
 1: 'ADJ',
 2: 'ADP',
 3: 'ADV',
 4: 'CONJ',
 5: 'DET',
 6: 'NOUN',
 7: 'NUM',
 8: 'PRON',
 9: 'PRT',
 10: 'VERB',
 11: 'X'}

In [67]:
from nltk.tokenize import word_tokenize

In [70]:
def get_pos_tags(sentence):
    tokens = word_tokenize(sentence)
    test = list()

    null_token = "[NULL_TOKEN]"
    for i in range(len(tokens)):
        padded_sentence = [null_token] * window_size + tokens + [null_token] * window_size
        test.append(padded_sentence[i : i + 2*window_size + 1])
    
    test_embeddings = list()
    words_not_present = list()

    for words in test:
        words_embeddings = list()
        for word in words:
            words_embeddings.append(get_embeddings(word, word2embedding, words_not_present))
        test_embeddings.append(torch.stack(words_embeddings))

    test_embeddings = torch.stack(test_embeddings)
    
    pos_tagger.eval()
    with torch.inference_mode():
        # 1. Forward pass
        test_logits = pos_tagger(test_embeddings.to(device)).squeeze()      # squeeze removes an extra one dimension from a tensor
        test_pred = torch.argmax(test_logits, dim=1)
        
    test_pred = [id_to_pos_tag[id] for id in test_pred.cpu().detach().numpy()]
    
    df_test = pd.DataFrame.from_dict({
        "tokens": tokens,
        "POS Tags": test_pred
    })
    
    display(HTML(df_test.to_html()))

Sample input sentence:

Good muffins cost $3.88 in New York. Please buy me ... two od them. Thanks.

In [68]:
sentence = input("Please enter sentence to perform POS Tagging:\n")

In [71]:
get_pos_tags(sentence)

Unnamed: 0,tokens,POS Tags
0,Good,ADJ
1,muffins,NOUN
2,cost,VERB
3,$,NOUN
4,3.88,NUM
5,in,ADP
6,New,ADJ
7,York,NOUN
8,.,.
9,Please,VERB


In [None]:
get_pos_tags(sentence)

In [67]:
word_tokenize(sentence)

['Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 '...',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [108]:
tokens = word_tokenize(sentence)
window_size = 1
test = list()

null_token = "[NULL_TOKEN]"
for i in range(len(tokens)):
    padded_sentence = [null_token] * window_size + tokens + [null_token] * window_size
    test.append(padded_sentence[i : i + 2*window_size + 1])
    
test

[['[NULL_TOKEN]', 'Good', 'muffins'],
 ['Good', 'muffins', 'cost'],
 ['muffins', 'cost', '$'],
 ['cost', '$', '3.88'],
 ['$', '3.88', 'in'],
 ['3.88', 'in', 'New'],
 ['in', 'New', 'York'],
 ['New', 'York', '.'],
 ['York', '.', 'Please'],
 ['.', 'Please', 'buy'],
 ['Please', 'buy', 'me'],
 ['buy', 'me', '...'],
 ['me', '...', 'two'],
 ['...', 'two', 'of'],
 ['two', 'of', 'them'],
 ['of', 'them', '.'],
 ['them', '.', 'Thanks'],
 ['.', 'Thanks', '.'],
 ['Thanks', '.', '[NULL_TOKEN]']]

In [86]:
len(words_embeddings)

3

In [88]:
test_embeddings = list()
words_not_present = list()

for words in test:
    words_embeddings = list()
    for word in words:
        words_embeddings.append(get_embeddings(word, word2embedding, words_not_present))
    test_embeddings.append(torch.stack(words_embeddings))

test_embeddings = torch.stack(test_embeddings)

In [89]:
len(test_embeddings), type(test_embeddings), len(test_embeddings[0]), type(test_embeddings[0])

(19, torch.Tensor, 3, torch.Tensor)

In [90]:
test_embeddings.shape

torch.Size([19, 3, 300])

In [92]:
pos_tagger.eval()
with torch.inference_mode():
    # 1. Forward pass
    test_logits = pos_tagger(test_embeddings.to(device)).squeeze()      # squeeze removes an extra one dimension from a tensor
    test_pred = torch.argmax(torch.sigmoid(test_logits), dim=1)

In [93]:
test_pred

tensor([ 1,  6, 10,  6,  7,  2,  1,  6,  0,  9, 10,  8,  0,  7,  2,  8,  0,  6,
         0], device='cuda:5')

In [97]:
id_to_pos_tag = {id: tag for tag, id in pos_tag_to_id.items()}
id_to_pos_tag

{0: '.',
 1: 'ADJ',
 2: 'ADP',
 3: 'ADV',
 4: 'CONJ',
 5: 'DET',
 6: 'NOUN',
 7: 'NUM',
 8: 'PRON',
 9: 'PRT',
 10: 'VERB',
 11: 'X'}

In [99]:
test_pred

tensor([ 1,  6, 10,  6,  7,  2,  1,  6,  0,  9, 10,  8,  0,  7,  2,  8,  0,  6,
         0], device='cuda:5')

In [102]:
test_pred = [id_to_pos_tag[id] for id in test_pred.cpu().detach().numpy()]
test_pred

['ADJ',
 'NOUN',
 'VERB',
 'NOUN',
 'NUM',
 'ADP',
 'ADJ',
 'NOUN',
 '.',
 'PRT',
 'VERB',
 'PRON',
 '.',
 'NUM',
 'ADP',
 'PRON',
 '.',
 'NOUN',
 '.']

In [109]:
len(tokens), len(test_pred)

(19, 19)

In [110]:
df_test = pd.DataFrame.from_dict({
    "tokens": tokens,
    "POS Tags": test_pred
})
display(HTML(df_test.to_html()))

Unnamed: 0,tokens,POS Tags
0,Good,ADJ
1,muffins,NOUN
2,cost,VERB
3,$,NOUN
4,3.88,NUM
5,in,ADP
6,New,ADJ
7,York,NOUN
8,.,.
9,Please,PRT
