In [1]:
#encoding=utf-8
# 基本部件的引用声明
import sys
import re
import codecs
import os
import shutil
import jieba
import jieba.analyse
import graphlab
import numpy as np
from array import array
from collections import Counter

### 首先是从文件中的读取以及分词


此处用的是 *Jieba* 中文分词器，在这里我们将中文文本分词之后以空格分隔各词然后输出到目标文件夹

函数定义中需传入 **文件目录的绝对地址** 以及 *文本总数量*

格式要求：
   * 文本文件以UTF-8格式编码，文本名称为 'xxxx(四位数字).txt'
   * 传入目标目录为绝对地址
   * 文本总数量 `file_count` 是文件编号的上界（不含）：函数实际读取 0001 至 file_count-1 号文本，例如传入 15 时读取 14 篇
   * 结果写入 '传入目录名_Result' 文件夹；该文件夹不存在时会自动创建，已存在时会先被清空再重建

In [2]:
def read_file_cut(path , file_count):
    """Segment numbered UTF-8 text files with jieba and write the tokens out.

    For every input line of ``<path>/NNNN.txt`` the jieba tokens (accurate
    mode, ``cut_all=False``) are joined with single spaces and written as one
    CRLF-terminated line to ``<path>_Result/NNNN.txt``.

    Args:
        path: Source directory, without a trailing slash.
        file_count: Exclusive upper bound of the file numbers. Files numbered
            1 .. file_count - 1 are processed, so pass (number of files + 1).

    Side effects:
        ``<path>_Result/`` is removed if it already exists and recreated
        empty. A progress line is printed per file and once at the end.
    """
    source_dir = path + "/"
    result_dir = path + "_Result/"

    # Always start from an empty output directory so stale results of an
    # earlier run cannot be mixed with the new ones.
    if os.path.isdir(result_dir):
        shutil.rmtree(result_dir, True)
    os.makedirs(result_dir)

    for num in range(1, file_count):
        name = "%04d" % num
        # codecs.open decodes/encodes UTF-8 for us, and the with-block closes
        # both handles even when reading or segmentation raises.
        with codecs.open(source_dir + name + ".txt", 'r', 'utf-8') as source, \
                codecs.open(result_dir + name + ".txt", 'w', 'utf-8') as result:
            for line in source:
                # Strip the line ending of EVERY line; otherwise the newline
                # becomes a token of its own. Blank lines (including a blank
                # first line) no longer stop the processing of the file.
                line = line.rstrip('\r\n')
                seglist = jieba.cut(line , cut_all = False)  # accurate mode
                result.write(' '.join(seglist) + '\r\n')
        print('End file: ' + str(num))

    print('End All')

### 接下来这个函数是用来提取*词数向量*的

* 在分析过程中，可以频繁的调用该函数，需要提供文件的 **绝对路径**
* 返回值为一个 *Dict<单词，出现次数>*
* 返回的Dict没有依据单词出现的次数排序
* 如果用向量实现并不现实，所以在此处我们使用Dict来实现词数向量的功能

In [3]:
def calculate_word_count(path):
    """Count how often each whitespace-separated token occurs in a file.

    Args:
        path: Path of a text file whose tokens are separated by whitespace.

    Returns:
        A plain ``dict`` mapping each token to its number of occurrences.
        The dict is not ordered by frequency; an empty file yields ``{}``.
    """
    # Read inside a with-block so the handle is closed deterministically.
    with open(path , "r") as handle:
        txt = handle.read()

    # split() without arguments already discards empty strings and all
    # surrounding whitespace, so no extra strip/filter is required.
    return dict(Counter(txt.split()))

### 接下来是计算两个词数向量间的距离

* euc_dist 计算两个词数向量之间的欧几里得距离 返回值为浮点数
* cos_dist 计算两个词数向量之间的余弦距离 返回值为浮点数
* jac_dist 计算两个词数向量之间的加权杰卡德距离 返回值为浮点数

注意：此处只是直接对词数向量做 Brute-Force 距离计算，很容易把一些常见词（如 **"的"** , **"和"** , **"是"** , **"了"** 等）错误地当作判断两个文本文档相似的主要依据。如果需要更精确的结果，应该使用 **TF-IDF**：使用者在计算距离前应先调用 *calc_tf_idf* 方法。

* 注意, *calc_tf_idf* 方法需要使用者传入包含所有文本（whole documents）词数词典的 **SArray（SArray of dict）**，返回值同样是元素为 dict 的 SArray（每行对应原文本中各单词的 TF-IDF 权重）。
* 示例使用方法：
    ```python
        docs['TF_IDF'] = calc_tf_idf(docs['wc_dict'])
    ```
    这样处理之后还是可以通过
    ```python
    docs[docs['dName'] == 'docnamehere'][0]
    ```
    找到某行。

In [4]:
def euc_dist(dict_a , dict_b):
    """Return the Euclidean distance between two word-count dictionaries.

    Thin wrapper that forwards both arguments, in order, to
    ``graphlab.distances.euclidean``.
    """
    euclidean = graphlab.distances.euclidean
    return euclidean(dict_a , dict_b)

def cos_dist(dict_a , dict_b):
    """Return the cosine distance between two word-count dictionaries.

    Thin wrapper that forwards both arguments, in order, to
    ``graphlab.distances.cosine``.
    """
    cosine = graphlab.distances.cosine
    return cosine(dict_a , dict_b)

def jac_dist(dict_a , dict_b):
    """Return the weighted Jaccard distance between two word-count dictionaries.

    Thin wrapper that forwards both arguments, in order, to
    ``graphlab.distances.weighted_jaccard``.
    """
    weighted_jaccard = graphlab.distances.weighted_jaccard
    return weighted_jaccard(dict_a , dict_b)

def calc_tf_idf(whole_document_dict):
    """Return the TF-IDF transform of a whole document collection.

    Thin wrapper around ``graphlab.text_analytics.tf_idf``; the notebook
    passes an SArray holding one word-count dict per document.
    """
    tf_idf = graphlab.text_analytics.tf_idf
    return tf_idf(whole_document_dict)

### 给某一个特定的新闻找到10个左右Nearest Neighbour

由于我们在数据库中存放的是微博，所以我们简单的给定 *新闻文档* ，然后给其适配最适合的10个微博。

In [5]:
# Works the same for raw counts and TF-IDF weights: news_vec must be a dict and
# whole_document_vec a collection of dicts (the notebook passes an SArray).
def find_NN(news_vec , whole_document_vec , k = 10):
    """Rank all documents by Euclidean distance to ``news_vec``.

    Args:
        news_vec: Word vector (dict) of the query document.
        whole_document_vec: Collection of word-vector dicts to search.
        k: Maximum number of nearest neighbours to return (default 10, the
            value that used to be hard-coded).

    Returns:
        An SFrame sorted by ascending distance with at most ``k`` rows and the
        columns ``original`` (the document vector), ``cosine`` (the distance)
        and ``id`` (0-based position in ``whole_document_vec``).

        NOTE: despite its name, the ``cosine`` column holds the Euclidean
        distance computed by ``euc_dist``; the column name is kept so existing
        consumers of the result keep working.
    """
    process_vec = graphlab.SFrame()
    process_vec['original'] = whole_document_vec

    # One distance per document, in the same order as the input collection.
    distances = [euc_dist(doc_vec , news_vec) for doc_vec in whole_document_vec]
    process_vec['cosine'] = distances
    process_vec['id'] = list(range(len(distances)))

    ranked = process_vec.sort('cosine' , ascending=True)

    # Fewer candidates than requested: return everything that is available.
    if len(ranked) <= k:
        return ranked
    return ranked[0:k]

## 基础处理的函数部分完成，下面测试函数的正确性

In [6]:
# Smoke test for read_file_cut on the local test corpus.
# NOTE: this is an absolute, machine-specific path; adjust it before running elsewhere.
src_path = "/home/souler/ml-wst/classification/test"
# read_file_cut treats the count as an exclusive upper bound, so 15 means
# files 0001 .. 0014 (14 documents) are segmented.
src_count = 15

read_file_cut(src_path , src_count)

Building prefix dict from the default dictionary ...
2016-07-15 17:53:25,488 [DEBUG] jieba, 111: Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
2016-07-15 17:53:25,491 [DEBUG] jieba, 131: Loading model from cache /tmp/jieba.cache
Loading model cost 0.480 seconds.
2016-07-15 17:53:25,970 [DEBUG] jieba, 163: Loading model cost 0.480 seconds.
Prefix dict has been built succesfully.
2016-07-15 17:53:25,972 [DEBUG] jieba, 164: Prefix dict has been built succesfully.


End file: 1
End file: 2
End file: 3
End file: 4
End file: 5
End file: 6
End file: 7
End file: 8
End file: 9
End file: 10
End file: 11
End file: 12
End file: 13
End file: 14
End All


In [7]:
# Word-counter test: build one word-count dict per segmented document.
# Documents are numbered from 0001 up to (but excluding) src_count.
res_path = src_path + '_Result/'
res_dict = []

for tmp_counter in range(1, src_count):
    tmp_file = "%04d" % tmp_counter
    tmp_src_path = res_path + tmp_file + ".txt"
    tmp_dict = calculate_word_count(tmp_src_path)
    res_dict.append(tmp_dict)

In [8]:
# Sanity check: 14 documents were segmented above, so 14 dicts are expected.
print("Result_Dictionary Should be 14")
print("Your Result Dictionary Length: %d" % len(res_dict))
print("")
print("第一篇文章的词数向量(字典)")
print(res_dict[0])

# Skip the dump above if you do not need to inspect it.
# The keys are shown as UTF-8 byte escapes; to read one as Chinese, decode it:
# x.decode('utf-8')

Result_Dictionary Should be 14
Your Result Dictionary Length: 14

第一篇文章的词数向量(字典)
{'\xe5\xb0\x8f\xe6\x97\xb6': 2, '\xe8\xbe\x93\xe5\x85\xa5': 2, '\xe9\x99\x8d\xe8\x87\xb3': 1, '\xe6\x9f\x90\xe7\xa7\x8d': 1, '\xe7\xba\xa7': 1, '\xe5\x91\xbd\xe8\xbf\x90': 1, '\xe8\xbd\xbb\xe5\x9e\x8b': 4, '\xe6\x9b\xb4\xe6\x96\xb0': 1, '\xe5\x9f\xba\xe4\xba\x8e': 1, '\xe5\xae\xb9\xe7\xba\xb3': 1, '\xe4\xb9\x8b': 1, '\xe7\xba\xa6\xe5\x90\x88': 4, '5500': 1, '\xe7\xa7\x91\xe6\x8a\x80': 1, '\xe4\xb8\x89\xe5\xb9\xb4': 1, '\xe5\x9c\xb0': 2, '\xe4\xb8\x8d': 4, '\xe6\x89\x8b\xe6\x9c\xba': 1, '\xe5\x9c\xa8': 4, '\xe8\xbf\x99\xe4\xba\x9b': 1, '\xe6\x96\xb0\xe5\xa2\x9e': 2, '\xe5\xbe\x97\xe5\x88\x86': 1, '\xe4\xbc\xa0\xe7\xbb\x9f': 1, '\xe8\xbf\x9b\xe8\xa1\x8c': 1, '\xe6\x9c\xac\xe8\xb4\xa8': 1, '\xe5\xbc\xba\xe5\xa4\xa7': 1, '\xe9\xab\x98\xe7\xab\xaf': 2, '\xe8\xae\xbe\xe5\xa4\x87': 3, '\xe5\xa4\xb1\xe6\x9c\x9b': 1, '\xe7\xa9\xba\xe9\x97\xb4': 2, '\xe5\x87\x86\xe7\xa1\xae': 1, '999': 2, '\xe4\xba\x8e': 1, '\xe4\xb

In [9]:
# Distance checks on the raw word counts.
# res_dict[0], res_dict[1] and res_dict[11] are documents 1, 2 and 12.

# Euclidean distance
print("测试欧几里得距离：")
print("第一篇文章和第二篇文章的欧几里得距离是: %f" % euc_dist(res_dict[0] , res_dict[1]))
print("第十二篇文章和第二篇文章的欧几里得距离是: %f" % euc_dist(res_dict[11] , res_dict[1]))
print("----------------------------------------------------------------------")

# Cosine distance
print("测试余弦距离：")
print("第一篇文章和第二篇文章的余弦距离是: %f" % cos_dist(res_dict[0] , res_dict[1]))
print("第十二篇文章和第二篇文章的余弦距离是: %f" % cos_dist(res_dict[1] , res_dict[11]))
print("----------------------------------------------------------------------")

# Weighted Jaccard distance
print("测试带权杰卡德距离：")
print("第一篇文章和第二篇文章的带权杰卡德距离是: %f" % jac_dist(res_dict[0] , res_dict[1]))
print("第十二篇文章和第二篇文章的带权杰卡德距离是: %f" % jac_dist(res_dict[1] , res_dict[11]))
print("----------------------------------------------------------------------")

测试欧几里得距离：
This non-commercial license of GraphLab Create is assigned to ouyf5@mail2.sysu.edu.cn and will expire on November 25, 2016. For commercial licensing options, visit https://turi.com/buy/.


2016-07-15 17:54:00,682 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.5 started. Logging: /tmp/graphlab_server_1468576419.log


第一篇文章和第二篇文章的欧几里得距离是: 132.491509
第十二篇文章和第二篇文章的欧几里得距离是: 55.695601
----------------------------------------------------------------------
测试余弦距离：
第一篇文章和第二篇文章的余弦距离是: 0.333527
第十二篇文章和第二篇文章的余弦距离是: 0.407858
----------------------------------------------------------------------
测试带权杰卡德距离：
第一篇文章和第二篇文章的带权杰卡德距离是: 0.929072
第十二篇文章和第二篇文章的带权杰卡德距离是: 0.908810
----------------------------------------------------------------------


In [10]:
# TF-IDF check: convert the word-count dicts into an SArray and re-weight them.
res_sarray = graphlab.SArray(res_dict)
res_tf_idf = calc_tf_idf(res_sarray)


# Repeat the three distance checks on the TF-IDF weighted vectors.
print("测试TF-IDF下的欧几里得距离：")
print("第一篇文章和第二篇文章的欧几里得距离是: %f" % euc_dist(res_tf_idf[0] , res_tf_idf[1]))
print("第十二篇文章和第二篇文章的欧几里得距离是: %f" % euc_dist(res_tf_idf[1] , res_tf_idf[11]))
print("----------------------------------------------------------------------")

print("测试TF-IDF下的余弦距离：")
print("第一篇文章和第二篇文章的余弦距离是: %f" % cos_dist(res_tf_idf[0] , res_tf_idf[1]))
print("第十二篇文章和第二篇文章的余弦距离是: %f" % cos_dist(res_tf_idf[1] , res_tf_idf[11]))
print("----------------------------------------------------------------------")

print("测试TF-IDF下的带权杰卡德距离：")
print("第一篇文章和第二篇文章的带权杰卡德距离是: %f" % jac_dist(res_tf_idf[0] , res_tf_idf[1]))
print("第十二篇文章和第二篇文章的带权杰卡德距离是: %f" % jac_dist(res_tf_idf[1] , res_tf_idf[11]))
print("----------------------------------------------------------------------")

测试TF-IDF下的欧几里得距离：
第一篇文章和第二篇文章的欧几里得距离是: 93.607117
第十二篇文章和第二篇文章的欧几里得距离是: 75.697636
----------------------------------------------------------------------
测试TF-IDF下的余弦距离：
第一篇文章和第二篇文章的余弦距离是: 0.930494
第十二篇文章和第二篇文章的余弦距离是: 0.969091
----------------------------------------------------------------------
测试TF-IDF下的带权杰卡德距离：
第一篇文章和第二篇文章的带权杰卡德距离是: 0.972891
第十二篇文章和第二篇文章的带权杰卡德距离是: 0.977019
----------------------------------------------------------------------


In [11]:
# Nearest-neighbour test: rank every TF-IDF vector against the first document's own vector.
res_nn = find_NN(res_tf_idf[0] , res_tf_idf)
# The same query on the raw word counts (disabled):
#res_nn_1 = find_NN(res_dict[0] , res_dict)

In [12]:
# Display the ranked neighbours; the 'cosine' column holds the Euclidean distance computed in find_NN.
res_nn

original,cosine,id
{'\xe5\xb0\x8f\xe6\x97\xb 6': 3.080890081894298 ...,0.0,0
{'\xe5\x94\xae\xe4\xbb\xb 7': 1.029619417181158 ...,93.3766286501,6
"{'\xe6\x9c\x89': 0.6931471805599453, ...",93.4476774541,5
"{'\xe6\x9c\x89': 0.6931471805599453, ' ...",93.6071171083,1
{'\xe4\xb8\x89\xe6\x98\x9 f': 1.540445040947149 ...,94.3435572299,7
"{'\xe6\x9c\x89': 0.6931471805599453, ' ...",94.3776712023,8
"{'\xe6\x9c\x89': 2.0794415416798357, ...",94.6005774246,2
{'\xe4\xb8\x89\xe6\x98\x9 f': 1.540445040947149 ...,94.7059817988,9
"{'\xe6\x9c\x88': 2.0794415416798357, ' ...",94.7657640934,3
"{'\xe6\x9c\x88': 1.3862943611198906, ' ...",95.299432307,4


### 函数测试完成，下面是初步match方法

In [13]:
# Load the word-count dict of every segmented document, ordered by file number.
# The collection can optionally be converted to TF-IDF weights.
def generate_docset_vec(path , count , TF_SELECTOR = False):
    """Build the word-vector collection for a segmented document set.

    Reads ``<path>_Result/0001.txt`` .. ``<path>_Result/<count>.txt`` (the
    count is inclusive here) through ``calculate_word_count``.

    Args:
        path: Source directory; ``'_Result/'`` is appended to locate the
            segmented files.
        count: Number of the last file to load.
        TF_SELECTOR: When true, return the TF-IDF transform of the collection
            instead of the raw word counts.

    Returns:
        ``graphlab.SArray`` of the word-count dicts, or the result of
        ``calc_tf_idf`` on that SArray when ``TF_SELECTOR`` is true.
    """
    result_dir = path + '_Result/'
    word_counts = [
        calculate_word_count(result_dir + ("%04d" % file_number) + ".txt")
        for file_number in range(1, count + 1)
    ]
    doc_vectors = graphlab.SArray(word_counts)

    if not TF_SELECTOR:
        return doc_vectors
    return calc_tf_idf(doc_vectors)

In [14]:
# Small helper that exists only to keep pre_process short.
def judge_and_cut(path , count , name):
    """Segment the documents under ``path`` if the directory exists.

    Args:
        path: Source directory handed on to ``read_file_cut``.
        count: File count handed on to ``read_file_cut`` unchanged.
        name: Human-readable label used in the error message.

    Returns:
        ``True`` when ``path`` is not a directory (an error line is printed
        and nothing is segmented), ``False`` after a successful run.
        Note that ``True`` signals FAILURE.
    """
    if not os.path.isdir(path):
        print("Error: " + name + " path Error!")
        return True

    read_file_cut(path , count)
    return False

In [15]:
# Main process.
# The result is an SFrame holding each news document's word vector together
# with its matching weibo documents.
def pre_process(weibo_abs_path , weibo_count , news_abs_path , news_count):
    """Segment both corpora and match every news document with its nearest weibos.

    Args:
        weibo_abs_path: Directory containing the weibo text files.
        weibo_count: File count in ``read_file_cut`` convention, i.e. an
            exclusive upper bound (files 1 .. weibo_count - 1 are used).
        news_abs_path: Directory containing the news text files.
        news_count: File count for the news corpus, same convention.

    Returns:
        ``False`` when either directory does not exist; otherwise an SFrame
        with the columns ``raw_dict`` (TF-IDF vector of each news document)
        and ``assign_weibos`` (the ``find_NN`` result for that document).
    """
    # Segment both corpora; judge_and_cut returns True on FAILURE.
    if judge_and_cut(weibo_abs_path , weibo_count , "Weibo") or judge_and_cut(news_abs_path , news_count , "News"):
        return False

    # generate_docset_vec appends '_Result/' itself, so it must receive the
    # SOURCE directory (passing '<path>_Result/' made it look in
    # '<path>_Result/_Result/'). Its count is inclusive while read_file_cut
    # only produced files 1 .. count - 1, hence the "- 1".
    weibo_dicts = generate_docset_vec(weibo_abs_path , weibo_count - 1 , True)
    news_dicts = generate_docset_vec(news_abs_path , news_count - 1 , True)

    # One row per news document that was actually loaded.
    news_data = graphlab.SFrame()
    news_data['raw_dict'] = news_dicts

    # Iterate over the loaded vectors instead of range(news_count), which ran
    # one index past the end of news_dicts.
    news_nns = [find_NN(news_vec , weibo_dicts) for news_vec in news_dicts]

    # NOTE(review): find_NN returns an SFrame per news document; confirm that
    # GraphLab accepts SFrames as column values, otherwise store e.g. the
    # neighbour ids instead.
    news_data['assign_weibos'] = news_nns
    return news_data
