In [92]:
#################################
### First Similarity Measurement 
#################################

#################################
### ??? Normailized and raw ??? 
#################################

from sklearn.feature_extraction.text import CountVectorizer

import scipy as sp

def dist_raw(v1,v2):
    delta = v1-v2
    return sp.linalg.norm(delta.toarray())

# Normalizing word count vectors
# calculate the vector distance not on the raw vectors but on the normalized instead
def dist_norm(v1, v2):
    v1_normalized = v1/sp.linalg.norm(v1.toarray())
    v2_normalized = v2/sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())

import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

# Convert a collection of text documents to a matrix of token counts
# When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold
# 只考虑词汇在文本中出现的频率
vectorizer = CountVectorizer(min_df=1)
content = ["How to format my hard disk", " Hard disk format problem "]
# fit_transform(trainData)对部分数据先拟合fit，找到该part的整体指标，如均值、方差、最大值最小值等等（根据具体转换的目的），然后对该trainData进行转换transform，从而实现数据的标准化、归一化等等。
X = vectorizer.fit_transform(content)
print(X)
print(X.toarray())
print(vectorizer.get_feature_names())
# transpose 数组转置
print(X.toarray().transpose())

os.listdir('./data/toy/')

  (0, 3)	1
  (0, 6)	1
  (0, 1)	1
  (0, 4)	1
  (0, 2)	1
  (0, 0)	1
  (1, 1)	1
  (1, 2)	1
  (1, 0)	1
  (1, 5)	1
[[1 1 1 1 1 0 1]
 [1 1 1 0 0 1 0]]
['disk', 'format', 'hard', 'how', 'my', 'problem', 'to']
[[1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 0]]


['02.txt', '05.txt', '01.txt', '04.txt', '03.txt']

##  <font color='red'>fit and transform</font> 
<ol>
  <li>fit: 求得训练集X的均值，方差，最大值，最小值,这些训练集X固有的属性</li>
  <li>transform: 在fit的基础上，进行标准化，降维，归一化等操作（看具体用的是哪个工具，如PCA，StandardScaler等）</li>
  <li>fit_transform: fit_transform是fit和transform的组合，既包括了训练又包含了转换</li>
  <li>This is question two.</li>
</ol>

## <font color='red' CountVectorizer>CoundVectorizer </font>
CountVectorizer举例，sklearn的CountVectorizer库是根据输入数据获取词频矩阵（稀疏矩阵）

In [94]:
import os

# read files
DIR = "./data/toy/"
posts = [open(os.path.join(DIR, f)).read() for f in os.listdir(DIR)]
print(posts)
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1)
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

X_train = vectorizer.fit_transform(posts)

num_samples, num_features = X_train.shape
print("#samples: %d, #features: %d" % (num_samples,num_features))
print(vectorizer.get_feature_names())

new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)
print(new_post_vec.toarray())
new_post_vec_normalized = new_post_vec/sp.linalg.norm(new_post_vec.toarray())
print(new_post_vec_normalized)
print(new_post_vec_normalized.toarray())

['Imaging databases provide storage capabilities.', 'Imaging databases store data. Imaging databases store data. Imaging databases store data.', 'This is a toy post about machine learning. Actually, it contains not much interesting stuff.', 'Imaging databases store data.', 'Most imaging databases save images permanently.\n']
#samples: 5, #features: 17
['actual', 'capabl', 'contain', 'data', 'databas', 'imag', 'interest', 'learn', 'machin', 'perman', 'post', 'provid', 'save', 'storag', 'store', 'stuff', 'toy']
  (0, 4)	1
  (0, 5)	1
[[0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]]
  (0, 4)	0.7071067811865475
  (0, 5)	0.7071067811865475
[[0.         0.         0.         0.         0.70710678 0.70710678
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]]


## <font color='red'>Numpy linalg 库</font>
向量范数、矩阵范数、行列式、矩阵逆、幂
numpy.linalg模块包含线性代数的函数。使用这个模块，我们可以计算逆矩阵、求特征值、解线性方程组以及求解行列式等。
## <font color='red'>sys.maxsize:</font>
## <font color='red'>() and []</font>
## <font color='red'>* and **: 打包和解包</font>


In [95]:
import sys
best_doc = None
best_dist = sys.maxsize
best_i = None
#for i, post in enumerate(num_samples):
for i in range(0, num_samples):
    post = posts[i]
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    if d < best_dist:
        best_dist = d
        best_i = i
    print("=== Post %i with dist=%.2f:%s"%(i,d,post))
print("Best post is %i with dist=%.2f"%(best_i, best_dist))


=== Post 0 with dist=0.86:Imaging databases provide storage capabilities.
=== Post 1 with dist=0.77:Imaging databases store data. Imaging databases store data. Imaging databases store data.
=== Post 2 with dist=1.41:This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 3 with dist=0.77:Imaging databases store data.
=== Post 4 with dist=0.63:Most imaging databases save images permanently.

Best post is 4 with dist=0.63


In [80]:
# remove less import words --> "test processing"
vectorizer = CountVectorizer(min_df=1, stop_words='english')
sorted(vectorizer.get_stop_words())[0:20]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst']

In [88]:
import nltk.stem
s = nltk.stem.SnowballStemmer('english')
s.stem("graphics")
s.stem("imagine")
s.stem("bought")

'bought'

In [91]:
# Extending the vectorizer with NLTK's stemmer
# super 用于调用父类(超类)的一个方法
# Python使用lambda关键字创造匿名函数  

import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')
     
