# Contextual Word Representation

In [1]:
# Based on the scikit-learn documentation
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Sample text to experiment with, taken from the EPA page on drinking-water
# regulations:
# -- https://www.epa.gov/sdwa/drinking-water-regulations-and-contaminants
# The text starts and ends with a newline (hence the empty first/last items).
data = "\n".join([
    "",
    "National Secondary Drinking Water Regulations (NSDWRs)",
    "NSDWRs (or secondary standards) are non-enforceable guidelines regulating contaminants that may cause cosmetic effects (such as skin or tooth discoloration) or aesthetic effects (such as taste, odor, or color) in drinking water. "
    "EPA recommends secondary standards to water systems but does not require systems to comply with the standard. "
    "However, states may choose to adopt them as enforceable standards.",
    "",
])
print(data)


National Secondary Drinking Water Regulations (NSDWRs)
NSDWRs (or secondary standards) are non-enforceable guidelines regulating contaminants that may cause cosmetic effects (such as skin or tooth discoloration) or aesthetic effects (such as taste, odor, or color) in drinking water. EPA recommends secondary standards to water systems but does not require systems to comply with the standard. However, states may choose to adopt them as enforceable standards.



In [3]:
# Import package
import re

In [4]:
# Quick hack to build a corpus: cut the text at every newline or period, so
# each title line / sentence becomes one "document". Empty strings produced
# by adjacent delimiters are still present at this point.
pattern = "[\n.]"
corpus = re.compile(pattern).split(data)
print(corpus)

['', 'National Secondary Drinking Water Regulations (NSDWRs)', 'NSDWRs (or secondary standards) are non-enforceable guidelines regulating contaminants that may cause cosmetic effects (such as skin or tooth discoloration) or aesthetic effects (such as taste, odor, or color) in drinking water', ' EPA recommends secondary standards to water systems but does not require systems to comply with the standard', ' However, states may choose to adopt them as enforceable standards', '', '']


In [5]:
# Drop the empty strings left behind by the split (filter(None, ...) keeps
# only truthy, i.e. non-empty, items).
corpus = list(filter(None, corpus))
print(corpus)

['National Secondary Drinking Water Regulations (NSDWRs)', 'NSDWRs (or secondary standards) are non-enforceable guidelines regulating contaminants that may cause cosmetic effects (such as skin or tooth discoloration) or aesthetic effects (such as taste, odor, or color) in drinking water', ' EPA recommends secondary standards to water systems but does not require systems to comply with the standard', ' However, states may choose to adopt them as enforceable standards']


In [6]:
# Single-word (unigram) bag-of-words representation:
# one row per document in `corpus`, one column per distinct word.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() is used to keep printing a plain Python list of words.
print(vectorizer.get_feature_names_out().tolist())
print(X.toarray())

# Notice that each distinct word gets exactly one dimension (column), no
# matter how often it occurs: e.g. 'water' appears in three of the documents
# but is still a single feature; only the counts in that column differ.

['adopt', 'aesthetic', 'are', 'as', 'but', 'cause', 'choose', 'color', 'comply', 'contaminants', 'cosmetic', 'discoloration', 'does', 'drinking', 'effects', 'enforceable', 'epa', 'guidelines', 'however', 'in', 'may', 'national', 'non', 'not', 'nsdwrs', 'odor', 'or', 'recommends', 'regulating', 'regulations', 'require', 'secondary', 'skin', 'standard', 'standards', 'states', 'such', 'systems', 'taste', 'that', 'the', 'them', 'to', 'tooth', 'water', 'with']
[[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0
  0 0 0 0 0 0 0 0 1 0]
 [0 1 1 2 0 1 0 1 0 1 1 1 0 1 2 1 0 1 0 1 1 0 1 0 1 1 4 0 1 0 0 1 1 0 1 0
  2 0 1 1 0 0 0 1 1 0]
 [0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 1 1 0
  0 2 0 0 1 0 2 0 1 1]
 [1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
  0 0 0 0 0 1 1 0 0 0]]


In [7]:
# N-gram representation: word-based 2-grams and 3-grams.
# Each feature is a sequence of two or three consecutive words.

vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 3))
X2 = vectorizer2.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# get_feature_names_out() and convert to a list to keep the printed format.
print(vectorizer2.get_feature_names_out().tolist())
print(X2.toarray())

['adopt them', 'adopt them as', 'aesthetic effects', 'aesthetic effects such', 'are non', 'are non enforceable', 'as enforceable', 'as enforceable standards', 'as skin', 'as skin or', 'as taste', 'as taste odor', 'but does', 'but does not', 'cause cosmetic', 'cause cosmetic effects', 'choose to', 'choose to adopt', 'color in', 'color in drinking', 'comply with', 'comply with the', 'contaminants that', 'contaminants that may', 'cosmetic effects', 'cosmetic effects such', 'discoloration or', 'discoloration or aesthetic', 'does not', 'does not require', 'drinking water', 'drinking water regulations', 'effects such', 'effects such as', 'enforceable guidelines', 'enforceable guidelines regulating', 'enforceable standards', 'epa recommends', 'epa recommends secondary', 'guidelines regulating', 'guidelines regulating contaminants', 'however states', 'however states may', 'in drinking', 'in drinking water', 'may cause', 'may cause cosmetic', 'may choose', 'may choose to', 'national secondary',

# Using TF-IDF

In [8]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
from sklearn.feature_extraction.text import TfidfVectorizer


In [9]:
# The TF-IDF vectorizer weights each word by its term frequency scaled by the
# inverse document frequency, i.e., by the relative occurrence of the word
# across the documents. Hence, context is captured by word frequency.
# NOTE: this rebinds `vectorizer` and `X` from the count-based cell above;
# later cells that use `X` operate on the TF-IDF matrix.

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# get_feature_names_out() and convert to a list to keep the printed format.
print(vectorizer.get_feature_names_out().tolist())
print(X.toarray())

['adopt', 'aesthetic', 'are', 'as', 'but', 'cause', 'choose', 'color', 'comply', 'contaminants', 'cosmetic', 'discoloration', 'does', 'drinking', 'effects', 'enforceable', 'epa', 'guidelines', 'however', 'in', 'may', 'national', 'non', 'not', 'nsdwrs', 'odor', 'or', 'recommends', 'regulating', 'regulations', 'require', 'secondary', 'skin', 'standard', 'standards', 'states', 'such', 'systems', 'taste', 'that', 'the', 'them', 'to', 'tooth', 'water', 'with']
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.39137817 0.         0.         0.         0.
  0.         0.         0.         0.49641358 0.         0.
  0.39137817 0.         0.         0.         0.         0.49641358
  0.         0.31685436 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.31685436 0.        ]
 [0.         0.14713048 0.14713048 0.23199872 0.         0.147130

In [10]:
# We can use the relative word occurrence vectors to measure the (cosine) similarity between documents
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
# Compare the first document with each later one: cosine similarity between
# row 0 of X and the row of the other document (X is the TF-IDF matrix when
# the cells are run in order). The result is printed as a 1x1 array.
for doc_no, doc_text in enumerate(corpus[1:], start=2):
    score = cosine_similarity(X[0], X[doc_no - 1])
    print(f"similarity of doc-1 ({corpus[0]}) with \n -> doc-{doc_no} "
          f"({doc_text}) is = {score}")


similarity of doc-1 (National Secondary Drinking Water Regulations (NSDWRs)) with 
 -> doc-2 (NSDWRs (or secondary standards) are non-enforceable guidelines regulating contaminants that may cause cosmetic effects (such as skin or tooth discoloration) or aesthetic effects (such as taste, odor, or color) in drinking water) is = [[0.15031176]]
similarity of doc-1 (National Secondary Drinking Water Regulations (NSDWRs)) with 
 -> doc-3 ( EPA recommends secondary standards to water systems but does not require systems to comply with the standard) is = [[0.09611996]]
similarity of doc-1 (National Secondary Drinking Water Regulations (NSDWRs)) with 
 -> doc-4 ( However, states may choose to adopt them as enforceable standards) is = [[0.]]
