# We look at TF/IDF as a way to represent words and context

In [1]:
# Based on the scikit-learn documentation
# Imports
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# Toy corpus: five short documents, used by every vectorizer example below.
corpus = [
    'An alpha document.',
    'A beta document.',
    'Guten Morgen!',
    'Gamma manuscript is old.',
    'Whither my document?',
]

In [3]:
# Single word (bag-of-words) representation: fit a vocabulary on `corpus`
# and print it, followed by the per-document term-count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. It returns a NumPy array, so
# .tolist() is used to keep the printed vocabulary a plain Python list.
print(vectorizer.get_feature_names_out().tolist())
# fit_transform returns a sparse matrix; densify it only for display.
print(X.toarray())

['alpha', 'an', 'beta', 'document', 'gamma', 'guten', 'is', 'manuscript', 'morgen', 'my', 'old', 'whither']
[[1 1 0 1 0 0 0 0 0 0 0 0]
 [0 0 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 1 0 0 0]
 [0 0 0 0 1 0 1 1 0 0 1 0]
 [0 0 0 1 0 0 0 0 0 1 0 1]]


In [4]:
# N-gram representation (2- and 3-; word based): features are sequences of
# two or three consecutive words instead of single words.
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 3))
X2 = vectorizer2.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. It returns a NumPy array, so
# .tolist() is used to keep the printed vocabulary a plain Python list.
print(vectorizer2.get_feature_names_out().tolist())
# fit_transform returns a sparse matrix; densify it only for display.
print(X2.toarray())

['alpha document', 'an alpha', 'an alpha document', 'beta document', 'gamma manuscript', 'gamma manuscript is', 'guten morgen', 'is old', 'manuscript is', 'manuscript is old', 'my document', 'whither my', 'whither my document']
[[1 1 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 1 1 0 1 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 1 1]]


In [5]:
# N-gram representation (2-grams only; char based): features are pairs of
# consecutive characters, including spaces and punctuation.
# ngram_range=(2, 2) restricts the vocabulary to bigrams; use (2, 3) to
# include character trigrams as well.
vectorizer3 = CountVectorizer(analyzer='char', ngram_range=(2, 2))
X3 = vectorizer3.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. It returns a NumPy array, so
# .tolist() is used to keep the printed vocabulary a plain Python list.
print(vectorizer3.get_feature_names_out().tolist())
# fit_transform returns a sparse matrix; densify it only for display.
print(X3.toarray())

[' a', ' b', ' d', ' i', ' m', ' o', 'a ', 'al', 'am', 'an', 'be', 'cr', 'cu', 'd.', 'do', 'en', 'er', 'et', 'ga', 'ge', 'gu', 'ha', 'he', 'hi', 'ip', 'is', 'it', 'ld', 'lp', 'ma', 'me', 'mm', 'mo', 'my', 'n ', 'n!', 'nt', 'nu', 'oc', 'ol', 'or', 'ph', 'pt', 'r ', 'rg', 'ri', 's ', 'sc', 't ', 't.', 't?', 'ta', 'te', 'th', 'um', 'us', 'ut', 'wh', 'y ']
[[1 0 1 0 0 0 1 1 0 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0
  1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0]
 [0 1 1 0 0 0 2 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 2 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1
  0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0]
 [0 0 0 1 1 1 1 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 2 0 1 0 0 0 0
  0 1 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0]
 [0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 1 0 0
  1 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 1 1]]


# Contextual Representation Using IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# The TF-IDF vectorizer weights each term by its Inverse Document Frequency,
# i.e., the relative occurrence of the word across the documents. Hence,
# context is captured through word frequency.
# NOTE: this rebinds `vectorizer` and `X` from the count-based cell above;
# the similarity loop further down relies on `X` holding these TF-IDF rows.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. It returns a NumPy array, so
# .tolist() is used to keep the printed vocabulary a plain Python list.
print(vectorizer.get_feature_names_out().tolist())
# fit_transform returns a sparse matrix; densify it only for display.
print(X.toarray())

['alpha', 'an', 'beta', 'document', 'gamma', 'guten', 'is', 'manuscript', 'morgen', 'my', 'old', 'whither']
[[0.63907044 0.63907044 0.         0.42799292 0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.83088075 0.55645052 0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.70710678
  0.         0.         0.70710678 0.         0.         0.        ]
 [0.         0.         0.         0.         0.5        0.
  0.5        0.5        0.         0.         0.5        0.        ]
 [0.         0.         0.         0.42799292 0.         0.
  0.         0.         0.         0.63907044 0.         0.63907044]]


In [8]:
# We can use relative word occurrence (the TF-IDF vectors in X) to measure similarity between documents

In [9]:
from sklearn.metrics.pairwise import cosine_similarity 

In [10]:
# Compare the first document against every other document in the corpus,
# printing the cosine similarity of their rows in X for each pair.
reference_text = corpus[0]
reference_row = X[0]
for i, other_text in enumerate(corpus[1:], start=1):
    score = cosine_similarity(reference_row, X[i])
    pieces = [
        "similarity of doc-1 (", reference_text,
        ") with ", str(i),
        "(", other_text,
        ") is = ", str(score),
    ]
    print("".join(pieces))

similarity of doc-1 (An alpha document.) with 1(A beta document.) is = [[0.23815688]]
similarity of doc-1 (An alpha document.) with 2(Guten Morgen!) is = [[0.]]
similarity of doc-1 (An alpha document.) with 3(Gamma manuscript is old.) is = [[0.]]
similarity of doc-1 (An alpha document.) with 4(Whither my document?) is = [[0.18317794]]
