### Preprocessing

In [1]:
# Converting raw text into a bag of words
# example of using countvector
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
content = ["How to format my hard disk", " Hard disk format problems "]
X = vectorizer.fit_transform(content)
vectorizer.get_feature_names()

['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']

In [2]:
print(X.toarray().transpose())

[[1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 0]]


In [3]:
posts =["This is a toy post about machine learning. Actually, it contains not much interesting stuff.",
       "Imaging databases can get huge.", "Most imaging databases save images permanently.", 
       "Imaging databases store images.",
        "Imaging databases store images. Imaging databases store images.Imaging databases store images."]
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print("#samples: %d, #features: %d" % (num_samples,num_features))

#samples: 5, #features: 24


In [4]:
print(vectorizer.get_feature_names())

['about', 'actually', 'can', 'contains', 'databases', 'get', 'huge', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'save', 'store', 'stuff', 'this', 'toy']


In [5]:
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)
print(new_post_vec.toarray())

  (0, 4)	1
  (0, 8)	1
[[0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [6]:
import scipy as sp
def dist_raw(v1, v2):
    delta = v1-v2
    return sp.linalg.norm(delta.toarray())

In [7]:
import sys
best_doc = None
best_dist = sys.maxsize
best_i = None
samples = [i for i in range(num_samples)]
for i, post in enumerate(samples):
    if posts[post] == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_raw(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s"%(i, d, posts[post]))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))

=== Post 0 with dist=4.00: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=1.73: Imaging databases can get huge.
=== Post 2 with dist=2.00: Most imaging databases save images permanently.
=== Post 3 with dist=1.41: Imaging databases store images.
=== Post 4 with dist=5.10: Imaging databases store images. Imaging databases store images.Imaging databases store images.
Best post is 3 with dist=1.41


### Normalizing word count vectors

In [8]:
def dist_norm(v1, v2):
    v1_normalized = v1/sp.linalg.norm(v1.toarray())
    v2_normalized = v2/sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())

In [9]:
import sys
best_doc = None
best_dist = sys.maxsize
best_i = None
samples = [i for i in range(num_samples)]
for i, post in enumerate(samples):
    if posts[post] == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s"%(i, d, posts[post]))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))

=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=0.86: Imaging databases can get huge.
=== Post 2 with dist=0.92: Most imaging databases save images permanently.
=== Post 3 with dist=0.77: Imaging databases store images.
=== Post 4 with dist=0.77: Imaging databases store images. Imaging databases store images.Imaging databases store images.
Best post is 3 with dist=0.77


In [10]:
#Removing less important words

vectorizer = CountVectorizer(min_df=1, stop_words=posts)
sorted(vectorizer.get_stop_words())[0:20]

['Imaging databases can get huge.',
 'Imaging databases store images.',
 'Imaging databases store images. Imaging databases store images.Imaging databases store images.',
 'Most imaging databases save images permanently.',
 'This is a toy post about machine learning. Actually, it contains not much interesting stuff.']

In [11]:
import sys
best_doc = None
best_dist = sys.maxsize
best_i = None
samples = [i for i in range(num_samples)]
for i, post in enumerate(samples):
    if posts[post] == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s"%(i, d, posts[post]))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))

=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=0.86: Imaging databases can get huge.
=== Post 2 with dist=0.92: Most imaging databases save images permanently.
=== Post 3 with dist=0.77: Imaging databases store images.
=== Post 4 with dist=0.77: Imaging databases store images. Imaging databases store images.Imaging databases store images.
Best post is 3 with dist=0.77


### Extending the vectorizer with NLTK's stemmer

In [12]:
import nltk
import nltk.stem
s = nltk.stem.SnowballStemmer('english')
s.stem("graphics")

'graphic'

In [13]:
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer,self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
vectorizer = StemmedCountVectorizer(min_df=1,stop_words=posts)
X1_train = vectorizer.fit_transform(posts)

In [14]:
import scipy as sp
def tfidf(term, doc, corpus):
    tf = doc.count(term) / len(doc)
    num_docs_with_term = len([d for d in corpus if term in d])
    idf = sp.log(len(corpus) / num_docs_with_term)
    return tf * idf

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer,self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
vectorizer = StemmedTfidfVectorizer(min_df=1,stop_words='english', decode_error='ignore')