## TF-IDF的主要思想：
### 如果某個詞或短語在一篇文章中出現的頻率TF高，並且在其他文章中很少出現，
### 則認為此詞或者短語具有很好的類別區分能力，適合用來分類。

In [1]:
import re
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import jieba
import time
import sys

### 連接MongoDB

In [2]:
# Open a connection to the MongoDB server on this machine and pick the
# 'ptt' collection inside the 'mongo' database; later cells query it.
client = MongoClient(host='127.0.0.1', port=27017)
database = client['mongo']
collection = database['ptt']

### 連接中文詞庫及自建詞庫

In [3]:
# Point jieba at the main dictionary file, then layer the custom
# user dictionaries on top of it, in the same order as before.
jieba.set_dictionary('dict.txt.big')
for _user_dict_path in ("dict_keyw.txt", "dict_cbdic.txt"):
    jieba.load_userdict(_user_dict_path)

Building prefix dict from C:\Users\JAMES\dict.txt.big ...
Loading model from cache C:\Users\JAMES\AppData\Local\Temp\jieba.ubc5c2e475c644ff0bba030882b6c29ac.cache
Loading model cost 1.865 seconds.
Prefix dict has been built succesfully.


In [4]:
def tfIdf(content):
    """Compute a dense TF-IDF matrix for a collection of documents.

    Args:
        content: iterable of strings, one per document. The caller in this
            file passes jieba tokens joined with '/'.

    Returns:
        A ``(weight, features)`` tuple: ``weight`` is the dense array
        produced by ``fit_transform(...).toarray()`` (one row per document),
        and ``features`` is the list of feature names in column order.
    """
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(content)
    weight = matrix.toarray()
    # get_feature_names() was removed in scikit-learn 1.2. Prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        # Convert to a plain list so callers keep receiving the same type.
        features = vectorizer.get_feature_names_out().tolist()
    else:
        features = vectorizer.get_feature_names()
    print ("特徵值數量:",len(features))
    return weight, features

In [5]:
def getTopWeight(weight, features, resp_count, top_n):
    """Pick the highest-weighted feature names for each document.

    Args:
        weight: 2-D array-like indexed as ``weight[row]``; one row per document.
        features: sequence of feature names, indexed by column.
        resp_count: number of leading rows of ``weight`` to process.
        top_n: how many feature names to keep per row.

    Returns:
        A list with ``resp_count`` entries; each entry is the list of up to
        ``top_n`` feature names ordered from highest to lowest weight.
    """
    def _best_names(row):
        # argsort is ascending, so reverse it to get the largest weights first.
        descending = np.argsort(row)[::-1]
        return [features[col] for col in descending[:top_n]]

    return [_best_names(weight[doc]) for doc in range(resp_count)]

In [12]:
# Query the posts that mention both tags, segment each post with jieba,
# then compute TF-IDF and keep the top-weighted words of every post.
# time.clock() was removed in Python 3.8; perf_counter() is the
# replacement for measuring elapsed intervals.
time_start = time.perf_counter()
content = []

tag1 = "颱風"
tag2 = "台灣"

# Each tag is used as a regex against the post body; $and requires both to match.
querry_resp = collection.find({"$and":[
            {"content":{"$regex":tag1}},
            {"content":{"$regex":tag2}},
        ]})
print("查詢結果的第一筆資料標題:",querry_resp[0]["title"])

# Segment every post and re-join the tokens with '/' so the vectorizer
# receives one pre-tokenised string per post.
for post in querry_resp:
    summary = post['content']
    content.append('/'.join(jieba.cut(summary)))

# Total number of posts. Cursor.count() was removed in PyMongo 4, and the
# number of segmented documents is exactly the number of TF-IDF rows that
# getTopWeight() will index below.
resp_count = len(content)
print ("查詢結果資料總筆數:", resp_count)
time_step1 = time.perf_counter()
print("querry+jieba 的時間", time_step1-time_start,"秒")

weight, features = tfIdf(content)
time_step2 = time.perf_counter()
print("計算tf-idf的時間",time_step2-time_step1,"秒")
top_num = 20
tf_idf_resp = getTopWeight(weight, features, resp_count, top_num)
time_step3 = time.perf_counter()
print("排序與取值的時間",time_step3-time_step2,"秒")

查詢結果的第一筆資料標題: Re: [問卦] 台中有成為新的首都的潛力嗎?
查詢結果資料總筆數: 208
querry+jieba 的時間 2.974297946845013 秒
特徵值數量: 14962
計算tf-idf的時間 0.14979071771517738 秒
排序與取值的時間 0.06742327870597364 秒


In [13]:
import numpy as np

# Summarise how often each word shows up across the per-post top-word lists.
print("總共有",resp_count * top_num,"個詞")

# dic maps word -> number of top-word lists it appears in (counting repeats).
dic = {}
for top_words in tf_idf_resp:
    for word in top_words:
        dic[word] = dic.get(word, 0) + 1
print("其中共有",len(dic),"個不重複的詞")

# (word, count) pairs, most frequent first; ties keep first-seen order.
lis = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)

# Mean number of occurrences per distinct word.
total = float(sum(dic.values()))
mean = total / len(dic)
print("每個詞平均重複出現",mean,"次")

# Population standard deviation of the occurrence counts.
err = 0
for _, times in lis:
    err += (times - mean) ** 2
sd = (err / len(dic)) ** 0.5
print("標準差",sd,"次")
print("-----------------------")
print("列出次數最高的前20個詞與次數:")
for word, times in lis[:20]:
    print(word, times)
print("-----------------------")

# Chebyshev's inequality: at least 1 - 1/k**2 of the data lies within k
# standard deviations of the mean, i.e. at least 75% for k=2 and about 89%
# for k=3 (k=1 gives no guarantee).
# Hence at most 25% of the words can have a count above mean + 2*sd.
treshold = mean + 2 * sd
print("門檻設定為mean的2個標準差範圍",treshold,"次")

# Number of words above the threshold (kept as a float, as printed below).
count = float(sum(1 for _, times in lis if times > treshold))
print("共",count,"個詞大於門檻")

# Fraction of distinct words above the threshold; a later cell passes this
# value to apriori() as the minimum support.
rank = count / len(dic)
print("第",count,"的詞實際排在前",rank*100,"%")

總共有 4160 個詞
其中共有 3423 個不重複的詞
每個詞平均重複出現 1.21530820917324 次
標準差 0.8272170256640892 次
-----------------------
列出次數最高的前20個詞與次數:
颱風假 15
柯文哲 12
補班 12
放假 11
地震 11
日本 10
柯p 10
資方 9
颱風 9
就是 8
勞工 8
完整 8
沒有 8
台灣 8
台北 7
市長 7
補課 7
公司 6
現在 6
一個 6
-----------------------
門檻設定為mean的2個標準差範圍 2.8697422605014182 次
共 140.0 個詞大於門檻
第 140.0 的詞實際排在前 4.08997955010225 %


## 購物籃分析

In [14]:
# Run the market-basket (frequent itemset) analysis on the per-post top
# words, using `rank` as the minimum support, and print what comes back.
# time.clock() was removed in Python 3.8; perf_counter() replaces it.
time_step4 = time.perf_counter()
D = apriori(tf_idf_resp,rank)
time_step5 = time.perf_counter()
print("購物籃分析的時間",time_step5-time_step4,"秒")
print()

print("傳回",len(D),"組關聯分析結果:")
print("與查詢詞",tag1,tag2,"常出現的詞:")

for ele in D:
    # Always join the whole itemset so that two-word itemsets are shown in
    # full; a one-word itemset still prints as just that word.
    print(",".join(ele))

購物籃分析的時間 0.05496893096915301 秒

傳回 9 組關聯分析結果:
與查詢詞 颱風 台灣 常出現的詞:
地震
放假
日本
柯p
柯文哲
補班
資方
颱風
颱風假


In [10]:
def apriori(D, minSup):
    """Find the itemsets whose support in ``D`` reaches ``minSup``.

    Args:
        D: list of transactions; each transaction is a list of items that
            are hashable and mutually sortable (strings in this file).
        minSup: minimum support, as a fraction of ``len(D)``.

    Returns:
        A list of itemsets (each a sorted list of items) in the order they
        were discovered. Every itemset appears at most once.
    """
    # Occurrence count of every item; only used to pre-filter the one-item
    # candidates before the main loop re-checks support per transaction.
    item_counts = {}
    for T in D:
        for I in T:
            item_counts[I] = item_counts.get(I, 0) + 1

    n = len(D)
    keys = sorted(
        [item] for item, seen in item_counts.items() if seen / n >= minSup
    )

    all_keys = []
    while keys:
        # getCutKeys recounts support itself and ignores its second
        # argument, so no precomputed counts are built here.
        cutKeys = getCutKeys(keys, None, minSup, D)
        for key in cutKeys:
            # The join step can produce the same large itemset in two
            # consecutive rounds; report it only once.
            if key not in all_keys:
                all_keys.append(key)
        keys = aproiri_gen(cutKeys)
    return all_keys

def getC(D, keys):
    """Count the transactions that contain each candidate itemset.

    Args:
        D: list of transactions (each a collection of items).
        keys: list of itemsets (each an iterable of items).

    Returns:
        A list of integers parallel to ``keys``: entry ``i`` is the number
        of transactions in ``D`` that contain every item of ``keys[i]``.
    """
    return [
        sum(1 for T in D if all(item in T for item in key))
        for key in keys
    ]
    
def getCutKeys(keys, C, minSup, D):
    """Keep only the itemsets whose support in ``D`` is at least ``minSup``.

    Args:
        keys: list of candidate itemsets; it is filtered in place.
        C: unused; accepted only so existing call sites keep working.
        minSup: minimum support, as a fraction of ``len(D)``.
        D: list of transactions (each a collection of items).

    Returns:
        The same ``keys`` list object, now holding only the itemsets that
        meet the minimum support, in their original order.
    """
    total = len(D)

    def support(key):
        # With no transactions nothing has been observed, so treat the
        # support as zero instead of dividing by zero.
        if not total:
            return 0.0
        hits = sum(1 for T in D if all(item in T for item in key))
        return hits / total

    # Slice assignment filters in a single pass while still mutating the
    # caller's list, instead of calling list.remove() once per rejected key.
    keys[:] = [key for key in keys if support(key) >= minSup]
    return keys

def keyInT(key, T):
    """Return True when every item of itemset ``key`` is present in transaction ``T``."""
    return all(item in T for item in key)

def aproiri_gen(keys1):
    """Join step: build new candidate itemsets from pairs of existing ones.

    Args:
        keys1: list of itemsets (each a list of mutually sortable items).

    Returns:
        A list of the distinct sorted unions of every pair of unequal
        itemsets in ``keys1``, in order of first appearance.
    """
    candidates = []
    for pos, left in enumerate(keys1):
        # Pairing each itemset only with the ones after it is enough: the
        # mirrored pair yields the same sorted union, which is skipped as a
        # duplicate anyway.
        for right in keys1[pos + 1:]:
            if left == right:
                continue
            union = []
            for item in [*left, *right]:
                if item not in union:
                    union.append(item)
            union.sort()
            if union not in candidates:
                candidates.append(union)
    return candidates