## Numericalization with scikit-learn

In [56]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

import string

In [38]:
# Toy corpus: four short, heavily overlapping sentences plus one tweet-like
# string containing a handle, a hashtag and an emoticon.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
    '@user This one is a tweet #meta ;)',
]

In [39]:
# Plain bag-of-words counter. The arguments below are CountVectorizer's
# defaults, written out explicitly: word-level analysis over single tokens.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1))

In [40]:
# Learn the vocabulary from the corpus and build the sparse document-term
# count matrix in one step.
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() whenever the installed version provides it.
# list(...) keeps the printed vocabulary in the same list form on both APIs.
if hasattr(vectorizer, 'get_feature_names_out'):
    print(list(vectorizer.get_feature_names_out()))
else:
    print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'meta', 'one', 'second', 'the', 'third', 'this', 'tweet', 'user']


In [41]:
# Densify the sparse count matrix so every cell is visible:
# one row per document, one column per vocabulary term.
doc_term_counts = X.toarray()
print(doc_term_counts)

[[0 1 1 1 0 0 0 1 0 1 0 0]
 [0 2 0 1 0 0 1 1 0 1 0 0]
 [1 0 0 1 0 1 0 1 1 1 0 0]
 [0 1 1 1 0 0 0 1 0 1 0 0]
 [0 0 0 1 1 1 0 0 0 1 1 1]]


In [42]:
# IPython-only introspection: `??` displays the source/docstring of
# CountVectorizer. Exploratory aid; not valid syntax in plain Python.
CountVectorizer??

In [49]:
# Tweet-aware tokenization: handles (@user) are stripped, while hashtags and
# emoticons are kept as single tokens.
tokenizer = TweetTokenizer(strip_handles=True, preserve_case=False)
tokenize_funct = tokenizer.tokenize

# Tokens to discard: English stop words plus every single punctuation character.
word_blacklist = [*stopwords.words('english'), *string.punctuation]

# Count vectorizer that delegates tokenization to NLTK and drops the blacklist.
vectorizer_tweet = CountVectorizer(stop_words=word_blacklist, tokenizer=tokenize_funct)

In [50]:
# Fit the tweet-aware vectorizer and build its sparse document-term matrix.
X1 = vectorizer_tweet.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() whenever the installed version provides it.
# list(...) keeps the printed vocabulary in the same list form on both APIs.
if hasattr(vectorizer_tweet, 'get_feature_names_out'):
    print(list(vectorizer_tweet.get_feature_names_out()))
else:
    print(vectorizer_tweet.get_feature_names())

['#meta', ';)', 'document', 'first', 'one', 'second', 'third', 'tweet']


In [60]:
# Dense view of the tweet-tokenized counts:
# one row per document, one column per surviving token.
tweet_term_counts = X1.toarray()
print(tweet_term_counts)

[[0 0 1 1 0 0 0 0]
 [0 0 2 0 0 1 0 0]
 [0 0 0 0 1 0 1 0]
 [0 0 1 1 0 0 0 0]
 [1 1 0 0 1 0 0 1]]


In [61]:
# ngram_range=(2, 2) makes every feature a pair of consecutive words (bigram).
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X2 = vectorizer2.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() whenever the installed version provides it.
# list(...) keeps the printed vocabulary in the same list form on both APIs.
if hasattr(vectorizer2, 'get_feature_names_out'):
    print(list(vectorizer2.get_feature_names_out()))
else:
    print(vectorizer2.get_feature_names())

['and this', 'document is', 'first document', 'is the', 'is this', 'is tweet', 'one is', 'second document', 'the first', 'the second', 'the third', 'third one', 'this document', 'this is', 'this one', 'this the', 'tweet meta', 'user this']


In [136]:
# TF-IDF weighting with the default word tokenizer, dropping the same
# stop-word/punctuation blacklist used for the tweet vectorizer.
vectorizer_tfidf = TfidfVectorizer(stop_words=word_blacklist)
X3 = vectorizer_tfidf.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() whenever the installed version provides it.
# list(...) keeps the printed vocabulary in the same list form on both APIs.
if hasattr(vectorizer_tfidf, 'get_feature_names_out'):
    print(list(vectorizer_tfidf.get_feature_names_out()))
else:
    print(vectorizer_tfidf.get_feature_names())

['document', 'first', 'meta', 'one', 'second', 'third', 'tweet', 'user']


In [137]:
# Dense TF-IDF weights: one row per document, one column per vocabulary term.
tfidf_weights = X3.toarray()
tfidf_weights

array([[0.63871058, 0.76944707, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.80130969, 0.        , 0.        , 0.        , 0.59824977,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.62791376, 0.        ,
        0.77828292, 0.        , 0.        ],
       [0.63871058, 0.76944707, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.52335825, 0.42224214, 0.        ,
        0.        , 0.52335825, 0.52335825]])

## Computing similarities

In [138]:
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics.pairwise import cosine_similarity

In [139]:
# Project the free-text query into the TF-IDF space learned from the corpus.
# transform() reuses the fitted vocabulary; it does not refit.
query = 'looking for the first document'
Xquery = vectorizer_tfidf.transform([query])

query_weights = Xquery.toarray()
print(query_weights)

[[0.63871058 0.76944707 0.         0.         0.         0.
  0.         0.        ]]


In [140]:
# Score every document against the query in a single call. The result has
# shape (n_docs, 1), so flatten it to get one plain float per document.
# Formatting a (1, 1) array with %f relies on implicit array-to-scalar
# conversion, which NumPy >= 1.25 deprecates.
query_similarities = cosine_similarity(X3, Xquery).ravel()
for i, similarity in enumerate(query_similarities):
    print("Doc %d - query similarity: %f ('%s')" % (i, similarity, corpus[i]))

Doc 0 - query similarity: 1.000000 ('This is the first document.')
Doc 1 - query similarity: 0.511805 ('This document is the second document.')
Doc 2 - query similarity: 0.000000 ('And this is the third one.')
Doc 3 - query similarity: 1.000000 ('Is this the first document?')
Doc 4 - query similarity: 0.000000 ('@user This one is a tweet #meta ;)')


In [141]:
import numpy as np
from scipy.spatial.distance import cdist

# Cosine distance between each document (rows of X3) and the query, computed
# on dense copies of the sparse matrices. The result has one row per document
# and a single column.
docs_dense = X3.toarray()
query_dense = Xquery.toarray()
dist = cdist(docs_dense, query_dense, metric='cosine')
dist

array([[0.        ],
       [0.48819503],
       [1.        ],
       [0.        ],
       [1.        ]])

In [142]:
# Document indices ordered from smallest cosine distance (best match) to
# largest; sorting along axis 0 keeps the column shape of `dist`.
similarity_rank = dist.argsort(axis=0)
similarity_rank

array([[0],
       [3],
       [1],
       [2],
       [4]])

In [143]:
# Report the documents from best to worst match, each followed by its cosine
# distance to the query.
print("Searching for '%s'..." % query)
print("Ranked results:")
for doc_idx in similarity_rank.ravel():
    print("\t", corpus[doc_idx], dist[doc_idx])

Searching for 'looking for the first document'...
Ranked results:
	 This is the first document. [0.]
	 Is this the first document? [0.]
	 This document is the second document. [0.48819503]
	 And this is the third one. [1.]
	 @user This one is a tweet #meta ;) [1.]
