# Discrete Word Representations

In [1]:
## Based on material at:
# - https://github.com/biplav-s/course-nl/blob/master/l6-probabilistic/Explorations%20in%20Vector%20Representation.ipynb
# - https://github.com/biplav-s/course-nl/blob/master/l7-language/code/Context%20with%20TF-IDF.ipynb

In [2]:
# A function that will encode a-z, space and '.'
def do_integer_encoding(data):
    """Integer-encode ``data`` one character at a time, then decode it back.

    Only the lowercase letters a-z, the space and the period are part of the
    alphabet; each character is encoded as its position in that alphabet.

    Args:
        data: iterable of characters (typically a string) to encode.

    Returns:
        A tuple ``(integer_encoded, char_decoded)``: the list of integer codes
        and the list of characters obtained by decoding those codes again.

    Raises:
        KeyError: if ``data`` contains a character outside the alphabet.
    """
    # universe of possible input values; a character's code is its index here
    alphabet = 'abcdefghijklmnopqrstuvwxyz .'

    # char -> int lookup table
    index_of = {symbol: position for position, symbol in enumerate(alphabet)}

    # encode: unknown characters surface as KeyError from the lookup
    integer_encoded = []
    for symbol in data:
        integer_encoded.append(index_of[symbol])

    # decode: indexing the alphabet is the inverse of the lookup above
    char_decoded = [alphabet[code] for code in integer_encoded]

    return integer_encoded, char_decoded

In [3]:
# Try the encoder on a short sample string and show the round trip
small_data = "hello world"
enc, dec = do_integer_encoding(small_data)
print("data = ", small_data, ", enc = ", enc, sep="")
print("decoded data: " + "dec = ", "".join(dec), sep="")

data = hello world, enc = [7, 4, 11, 11, 14, 26, 22, 14, 17, 11, 3]
decoded data: dec = hello world


In [4]:
# See with a sample string containing out-of-scope characters (the digits).
# do_integer_encoding raises KeyError for the first such character. The error
# is the point of this cell, so it is caught and displayed instead of being
# left to abort a "Restart Kernel -> Run All".
small_data = "hello world 12"
try:
    # the function returns (encoded, decoded); unpack both as in the cell above
    enc, dec = do_integer_encoding(small_data)
    print ("data = " + small_data + ", enc = " + str(enc))
except KeyError as err:
    # str() of a KeyError is the repr of the missing key, e.g. '1'
    print ("KeyError: " + str(err))

KeyError: '1'

In [5]:
# Forcing out-of-scope characters into the alphabet loses information.
# Example: uppercase letters have to be lowercased before encoding, so the
# decoded text no longer carries the original capitalization.
medium_data = (
    "This is an important document. It contains the contract governing "
    "your deposit relationship with the Bank and required legal "
    "disclosures. Please have it translated. "
)

enc, dec = do_integer_encoding(medium_data.lower())

print("data = ", medium_data, "\n enc = ", enc, sep="")
print("decoded data: " + "dec = ", "".join(dec), sep="")

data = This is an important document. It contains the contract governing your deposit relationship with the Bank and required legal disclosures. Please have it translated. 
 enc = [19, 7, 8, 18, 26, 8, 18, 26, 0, 13, 26, 8, 12, 15, 14, 17, 19, 0, 13, 19, 26, 3, 14, 2, 20, 12, 4, 13, 19, 27, 26, 8, 19, 26, 2, 14, 13, 19, 0, 8, 13, 18, 26, 19, 7, 4, 26, 2, 14, 13, 19, 17, 0, 2, 19, 26, 6, 14, 21, 4, 17, 13, 8, 13, 6, 26, 24, 14, 20, 17, 26, 3, 4, 15, 14, 18, 8, 19, 26, 17, 4, 11, 0, 19, 8, 14, 13, 18, 7, 8, 15, 26, 22, 8, 19, 7, 26, 19, 7, 4, 26, 1, 0, 13, 10, 26, 0, 13, 3, 26, 17, 4, 16, 20, 8, 17, 4, 3, 26, 11, 4, 6, 0, 11, 26, 3, 8, 18, 2, 11, 14, 18, 20, 17, 4, 18, 27, 26, 15, 11, 4, 0, 18, 4, 26, 7, 0, 21, 4, 26, 8, 19, 26, 19, 17, 0, 13, 18, 11, 0, 19, 4, 3, 27, 26]
decoded data: dec = this is an important document. it contains the contract governing your deposit relationship with the bank and required legal disclosures. please have it translated. 


In [6]:
# Scikit has support for label encoding
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
from sklearn.preprocessing import LabelEncoder

# But LabelEncoder needs an array of words (tokens) as input, so we tokenize first

In [7]:
# We use spaCy to tokenize the text into words.
# 'en_core_web_sm' is spaCy's small English pipeline; the model package must
# already be installed in this environment for spacy.load() to find it.
import spacy
nlp = spacy.load('en_core_web_sm')

# NumPy provides the array type used to hold the tokens
import numpy as np

In [8]:
# Tokenize the longer sample text with spaCy and keep the text of every token
doc = nlp(medium_data)
values = np.array([tok.text for tok in doc])
print(values)

['This' 'is' 'an' 'important' 'document' '.' 'It' 'contains' 'the'
 'contract' 'governing' 'your' 'deposit' 'relationship' 'with' 'the'
 'Bank' 'and' 'required' 'legal' 'disclosures' '.' 'Please' 'have' 'it'
 'translated' '.']


In [9]:
# Fit the encoder on the tokens, then map every token to its integer label
label_encoder = LabelEncoder().fit(values)
integer_encoded = label_encoder.transform(values)
print(integer_encoded)

[ 4 15  5 14 11  0  2  7 20  8 12 23  9 18 22 20  1  6 19 17 10  0  3 13
 16 21  0]


In [10]:
# And decode: inverse_transform maps the integer labels back to the original tokens
inverted = label_encoder.inverse_transform(integer_encoded)
print(inverted)

['This' 'is' 'an' 'important' 'document' '.' 'It' 'contains' 'the'
 'contract' 'governing' 'your' 'deposit' 'relationship' 'with' 'the'
 'Bank' 'and' 'required' 'legal' 'disclosures' '.' 'Please' 'have' 'it'
 'translated' '.']


In [11]:
# Joining the tokens with spaces recreates the text, though not character for
# character: punctuation comes back as a separate token ("document ." vs "document.")
print (" ".join(inverted))

This is an important document . It contains the contract governing your deposit relationship with the Bank and required legal disclosures . Please have it translated .


# Contextual Word Representation

In [12]:
# Based on the scikit-learn documentation
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Toy corpus: five short documents, three of which share the word "document"
corpus = [
    "An alpha document.",
    "A beta document.",
    "Guten Morgen!",
    "Gamma manuscript is old.",
    "Whither my document?",
]

In [14]:
# Single word (unigram) representation: one column per distinct word
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. tolist() keeps the printed output a plain list.
print(vectorizer.get_feature_names_out().tolist())
print(X.toarray())

# Notice 'document' gets a single dimension (column) although it occurs in 3 documents

['alpha', 'an', 'beta', 'document', 'gamma', 'guten', 'is', 'manuscript', 'morgen', 'my', 'old', 'whither']
[[1 1 0 1 0 0 0 0 0 0 0 0]
 [0 0 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 1 0 0 0]
 [0 0 0 0 1 0 1 1 0 0 1 0]
 [0 0 0 1 0 0 0 0 0 1 0 1]]


In [15]:
# N-gram representation (2- and 3-grams; word based)

vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 3))
X2 = vectorizer2.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. tolist() keeps the printed output a plain list.
print(vectorizer2.get_feature_names_out().tolist())
print(X2.toarray())

['alpha document', 'an alpha', 'an alpha document', 'beta document', 'gamma manuscript', 'gamma manuscript is', 'guten morgen', 'is old', 'manuscript is', 'manuscript is old', 'my document', 'whither my', 'whither my document']
[[1 1 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 1 1 0 1 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 1 1]]


In [16]:
# N-gram representation (2-grams only, since ngram_range=(2,2); char based)
vectorizer3 = CountVectorizer(analyzer='char', ngram_range=(2,2))
X3 = vectorizer3.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. tolist() keeps the printed output a plain list.
print(vectorizer3.get_feature_names_out().tolist())
print(X3.toarray())

[' a', ' b', ' d', ' i', ' m', ' o', 'a ', 'al', 'am', 'an', 'be', 'cr', 'cu', 'd.', 'do', 'en', 'er', 'et', 'ga', 'ge', 'gu', 'ha', 'he', 'hi', 'ip', 'is', 'it', 'ld', 'lp', 'ma', 'me', 'mm', 'mo', 'my', 'n ', 'n!', 'nt', 'nu', 'oc', 'ol', 'or', 'ph', 'pt', 'r ', 'rg', 'ri', 's ', 'sc', 't ', 't.', 't?', 'ta', 'te', 'th', 'um', 'us', 'ut', 'wh', 'y ']
[[1 0 1 0 0 0 1 1 0 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0
  1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0]
 [0 1 1 0 0 0 2 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 2 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1
  0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0]
 [0 0 0 1 1 1 1 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 2 0 1 0 0 0 0
  0 1 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0]
 [0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 1 0 0
  1 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 1 1]]


# Contextual Representation Using IDF

In [17]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# TfidfVectorizer weights term counts by Inverse Document Frequency, i.e., by the
# relative occurrence of words across the documents: a word shared by several
# documents ('document') scores lower than one unique to a document ('alpha').
# Hence, context here comes from word frequency.

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. tolist() keeps the printed output a plain list.
print(vectorizer.get_feature_names_out().tolist())
print(X.toarray())

['alpha', 'an', 'beta', 'document', 'gamma', 'guten', 'is', 'manuscript', 'morgen', 'my', 'old', 'whither']
[[0.63907044 0.63907044 0.         0.42799292 0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.83088075 0.55645052 0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.70710678
  0.         0.         0.70710678 0.         0.         0.        ]
 [0.         0.         0.         0.         0.5        0.
  0.5        0.5        0.         0.         0.5        0.        ]
 [0.         0.         0.         0.42799292 0.         0.
  0.         0.         0.         0.63907044 0.         0.63907044]]


In [19]:
# We can use relative word occurrence (the TF-IDF vectors) to measure similarity between documents
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
# Compare the first document's TF-IDF vector against every other document's
for i in range(1, len(corpus)):
    print("similarity of doc-1 (", corpus[0],
          ") with doc-", i + 1,
          " (", corpus[i], ") is = ",
          cosine_similarity(X[0], X[i]),
          sep="")

similarity of doc-1 (An alpha document.) with doc-2 (A beta document.) is = [[0.23815688]]
similarity of doc-1 (An alpha document.) with doc-3 (Guten Morgen!) is = [[0.]]
similarity of doc-1 (An alpha document.) with doc-4 (Gamma manuscript is old.) is = [[0.]]
similarity of doc-1 (An alpha document.) with doc-5 (Whither my document?) is = [[0.18317794]]
