## We look at different vector representations for words

In [1]:
# Based on the scikit-learn documentation
# Imports
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# Toy corpus of four short, near-identical sentences. It is the input to every
# vectorizer in the cells below.
corpus = [
    "This is an apple",
    "These are apples",
    "This is an apples",
    "There are apply",
]

In [3]:
# Second toy corpus with mixed punctuation and one non-English sentence.
# NOTE(review): not referenced by any cell visible in this part of the notebook.
corpus2 = [
    "An alpha document.",
    "A beta document.",
    "Guten Morgen!",
    "Gamma manuscript is old.",
    "Whither my document?",
    "one two alpha beta",
]

In [4]:
# Single-word (bag-of-words) representation: each row of X is a document and
# each column counts one vocabulary word.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out() (available since 1.0), returns an ndarray, so
# .tolist() keeps the plain-list printout.
print(vectorizer.get_feature_names_out().tolist())
print(X.toarray())

['an', 'apple', 'apples', 'apply', 'are', 'is', 'there', 'these', 'this']
[[1 1 0 0 0 1 0 0 1]
 [0 0 1 0 1 0 0 1 0]
 [1 0 1 0 0 1 0 0 1]
 [0 0 0 1 1 0 1 0 0]]




In [5]:
# Dense copy of the word-count matrix; the first document's row is the cell output.
res = X.toarray()
res[0, :]

array([1, 1, 0, 0, 0, 1, 0, 0, 1])

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Cosine similarity of the first document against every document (itself included).
# NOTE(review): the printed label says "Distance", but the value is a similarity
# score -- the document compared with itself prints 1.
reference_row = res[0].reshape(1, -1)  # reshape to 2-D (1, n_features) for cosine_similarity
for i, row in enumerate(res):
    score = cosine_similarity(reference_row, row.reshape(1, -1))
    print ("Distance with doc - ", i , " is - ", score)

Distance with doc -  0  is -  [[1.]]
Distance with doc -  1  is -  [[0.]]
Distance with doc -  2  is -  [[0.75]]
Distance with doc -  3  is -  [[0.]]


In [8]:
# Word n-gram representation: features are sequences of 2 and 3 consecutive words.
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 3))
X2 = vectorizer2.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out() (available since 1.0), returns an ndarray, so
# .tolist() keeps the plain-list printout.
print(vectorizer2.get_feature_names_out().tolist())
print(X2.toarray())

['an apple', 'an apples', 'are apples', 'are apply', 'is an', 'is an apple', 'is an apples', 'there are', 'there are apply', 'these are', 'these are apples', 'this is', 'this is an']
[[1 0 0 0 1 1 0 0 0 0 0 1 1]
 [0 0 1 0 0 0 0 0 0 1 1 0 0]
 [0 1 0 0 1 0 1 0 0 0 0 1 1]
 [0 0 0 1 0 0 0 1 1 0 0 0 0]]




In [9]:
# Rebind res to the dense word n-gram counts; the first document's row is the cell output.
res = X2.toarray()
res[0, :]

array([1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1])

In [10]:
# Cosine similarity of the first document against every document (itself included).
# NOTE(review): the printed label says "Distance", but the value is a similarity
# score -- the document compared with itself prints 1.
reference_row = res[0].reshape(1, -1)  # reshape to 2-D (1, n_features) for cosine_similarity
for i, row in enumerate(res):
    score = cosine_similarity(reference_row, row.reshape(1, -1))
    print ("Distance with doc - ", i , " is - ", score)

Distance with doc -  0  is -  [[1.]]
Distance with doc -  1  is -  [[0.]]
Distance with doc -  2  is -  [[0.6]]
Distance with doc -  3  is -  [[0.]]


In [11]:
# Character n-gram representation. ngram_range=(2,2) yields 2-character grams
# only (no 3-grams); spaces count as characters.
vectorizer3 = CountVectorizer(analyzer='char', ngram_range=(2,2))
X3 = vectorizer3.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out() (available since 1.0), returns an ndarray, so
# .tolist() keeps the plain-list printout.
print(vectorizer3.get_feature_names_out().tolist())
print(X3.toarray())

[' a', ' i', 'an', 'ap', 'ar', 'e ', 'er', 'es', 'he', 'hi', 'is', 'le', 'ly', 'n ', 'pl', 'pp', 're', 's ', 'se', 'th']
[[2 1 1 1 0 0 0 0 0 1 2 1 0 1 1 1 0 2 0 1]
 [2 0 0 1 1 2 0 2 1 0 0 1 0 0 1 1 1 0 1 1]
 [2 1 1 1 0 0 0 1 0 1 2 1 0 1 1 1 0 2 0 1]
 [2 0 0 1 1 2 1 0 1 0 0 0 1 0 1 1 2 0 0 1]]




In [12]:
# Rebind res to the dense character-bigram counts; the first document's row is the cell output.
res = X3.toarray()
res[0, :]

array([2, 1, 1, 1, 0, 0, 0, 0, 0, 1, 2, 1, 0, 1, 1, 1, 0, 2, 0, 1])

In [13]:
# Cosine similarity of the first document against every document (itself included).
# NOTE(review): the printed label says "Distance", but the value is a similarity
# score -- the document compared with itself prints 1.
reference_row = res[0].reshape(1, -1)  # reshape to 2-D (1, n_features) for cosine_similarity
for i, row in enumerate(res):
    score = cosine_similarity(reference_row, row.reshape(1, -1))
    print ("Distance with doc - ", i , " is - ", score)

Distance with doc -  0  is -  [[1.]]
Distance with doc -  1  is -  [[0.42857143]]
Distance with doc -  2  is -  [[0.97700842]]
Distance with doc -  3  is -  [[0.39036003]]


## Contextual Representation Using TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# TfidfVectorizer scales each term count by its inverse document frequency, so
# words that occur in many documents weigh less than words that occur in few.
# Note: this rebinds `vectorizer` and `X`, which earlier held the plain
# count-based model and matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out() (available since 1.0), returns an ndarray, so
# .tolist() keeps the plain-list printout.
print(vectorizer.get_feature_names_out().tolist())
print(X.toarray())

['an', 'apple', 'apples', 'apply', 'are', 'is', 'there', 'these', 'this']
[[0.46580855 0.59081908 0.         0.         0.         0.46580855
  0.         0.         0.46580855]
 [0.         0.         0.52640543 0.         0.52640543 0.
  0.         0.66767854 0.        ]
 [0.5        0.         0.5        0.         0.         0.5
  0.         0.         0.5       ]
 [0.         0.         0.         0.61761437 0.48693426 0.
  0.61761437 0.         0.        ]]




In [16]:
# We can use the relative word occurrence (TF-IDF weights) to measure similarity between documents

In [17]:
from sklearn.metrics.pairwise import cosine_similarity 

In [18]:
# Compare the first document with each later one by the cosine similarity of
# their rows in X. "doc-1" in the message is corpus[0]; the number printed
# after "with" is the 0-based index of the other document.
first_text = corpus[0]
for i, other_text in enumerate(corpus[1:], start=1):
    score = cosine_similarity(X[0], X[i])
    print(f"similarity of doc-1 ({first_text}) with {i}({other_text}) is = {score!s}")

similarity of doc-1 (This is an apple) with 1(These are apples) is = [[0.]]
similarity of doc-1 (This is an apple) with 2(This is an apples) is = [[0.69871282]]
similarity of doc-1 (This is an apple) with 3(There are apply) is = [[0.]]
