In [92]:
import jieba
import jieba.analyse
import sys, math, os, json
from os.path import isfile, join
from os import walk, listdir
import numpy as np

#繁簡轉換
from langconv import *

from sklearn import feature_extraction
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# 加載 jieba 用戶字典、停用詞
jieba.load_userdict('userdict.txt')
jieba.analyse.set_stop_words('Chinese_stop.txt')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/hs/lfrgdf0j4fv056xs2p3y5_r40000gn/T/jieba.cache
Loading model cost 0.771 seconds.
Prefix dict has been built succesfully.


In [3]:
english_stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", '\n']

In [4]:
# 建立 stopwords 表
with open('Chinese_stop.txt', 'r', encoding = 'utf-8') as f:
    chinese_stop = []
    for word in f:
            # 換行符號只算一個字元
            chinese_stop.append(word[:-1])
    f.close()

In [5]:
# load stories
def loadstories():
    stories_in_folder = [f for f in listdir('stories/') if isfile(join('stories/',f))]

    words_in_each_stories = []

    for story in stories_in_folder:
        with open('stories/' + story, 'r',encoding='utf-8', errors='replace') as f:
            story_temp = f.read()
        f.close()

        story_temp = Converter('zh-hant').convert(story_temp)
        story_words = jieba.cut(story_temp,cut_all = True)
        word_list = ''
        for word in story_words:
            if word in chinese_stop or word.lower() in english_stop:
                continue
            else:
                word_list += word
                word_list += " "
#             print(word_list)
        words_in_each_stories.append(word_list)
    return stories_in_folder, words_in_each_stories

In [107]:
def getweight(v,q):
    corpus = v
    # corpus = [storywords0,storywords1,storywords2]

    #該類會將文本中的詞語轉換為詞頻矩陣，矩陣元素a[i][j] 表示j詞在i類文本下的詞頻
    vectorizer = CountVectorizer()

    #該類會統計每個詞語的tf-idf權值
    transformer = TfidfTransformer()

    #第一個fit_transform是計算tf-idf，第二個fit_transform是將文本轉為詞頻矩陣
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    #獲取詞袋模型中的所有詞語
    word = vectorizer.get_feature_names()

    #將tf-idf矩陣抽取出來，元素a[i][j]表示j詞在i類文本中的tf-idf權重
    all_w = tfidf.toarray()

    
    #########
    q_words_cut = jieba.cut(q,cut_all='True')
    q_words_string = ''
    for w in q_words_cut:
        q_words_string += w
        q_words_string += ' '

    q_words = [q_words_string]
    # print(q_words)

    q_w = vectorizer.transform(q_words).toarray()
    
    
    return all_w, q_w


    #打印每類文本的tf-idf詞語權重，第一個for遍歷所有文本，第二個for便利某一類文本下的詞語權重
    # for i in range(len(weight)):
    #     print(u"-------這裡輸出第",i,u"類文本的詞語tf-idf權重------")
    #     for j in range(len(word)):
    #         #找出特定相關性以上的詞語
    #         if weight[i][j] > 0.05 :
    #             print(word[j],weight[i][j])

In [64]:
# user 的搜尋語句
def userquery(q):
    vectorizer = CountVectorizer()
    
    q_words_cut = jieba.cut(q,cut_all='True')
    q_words_string = ''
    for word in q_words_cut:
        q_words_string += word
        q_words_string += ' '

    q_words = [q_words_string]
    print(q_words)

    q_tfidf = vectorizer.transform(q_words).toarray()
    return q_tfidf

In [110]:
def getStories(weight_all, q_tfidf, storylist):

    highests = {'a':0}
    
    for index, vector in enumerate(weight_all):    
        vs = cosine_similarity([q_tfidf[0],vector])
        sim = vs[0,1]
        min_in_highests_key = min(highests,key = highests.get)
        min_in_highests = highests[min_in_highests_key]
        # print(min_in_highests_key,min_in_highests)
        if sim > min_in_highests:
            highests[index] = sim
            if len(highests) > 5:
                highests.pop(min_in_highests_key)
        else:
            continue
    # print(highests)
    for h in highests:
        highests[h] = np.float64(highests[h]).item()
        
    result = []
    for h in highests:
        highests[h] = np.float64(highests[h]).item()
        tmp = { 'number': h,
                'title': storylist[h][:-4],
                'score': highests[h]
                }
        result.append(tmp)
        # print(h, highests[h], storylist[h])

    return result

In [58]:
sl , w = loadstories()

In [111]:
q = '今天要吃什麼，想要學一點東西'
# weight_q = userquery(q)
weight_all,q_tfidf = getweight(w,q)
h = getStories(weight_all, q_tfidf, sl)

In [119]:
for rec in h:
    for s in l['stories']:
        if rec['title'] == s['slug']:
            rec['url'] = s['url']
        else:
            continue

In [120]:
h

[{'number': 75,
  'title': '新手設計師成長日記-配色',
  'score': 0.0786686164749629,
  'url': 'http://medium.com/p/aaa2e6d149b5'},
 {'number': 127,
  'title': 'ui設計筆記工具篇-flow-神器-動態做好code-就完成',
  'score': 0.08556185657471671,
  'url': 'http://medium.com/p/5d4e456a8b8f'},
 {'number': 134,
  'title': '新手上路-ui設計師的求職問答集',
  'score': 0.08163062535461371,
  'url': 'http://medium.com/p/c767b8471064'},
 {'number': 139,
  'title': '在-sketch-中運用-iconfont',
  'score': 0.13313415067687795,
  'url': 'http://medium.com/p/4fae5b023bd'},
 {'number': 159,
  'title': '所以-你想學ux-上-ux的起源',
  'score': 0.07737379709287504,
  'url': 'http://medium.com/p/c637f33f86a4'}]

In [118]:
l['stories'][0]

{'createdAt': 1533626576651,
 'internalReferrerViews': 0,
 'creatorId': 'ca09b47c1f3e',
 'title': '使用者與產品的初次約會（上篇） — 理論與心法',
 'createdAtTime': '2018-08-07 15:22:57',
 'dataUpdateAt': '2018-08-09 11:45:18',
 'readingTime': 1.5198113207547168,
 'slug': '使用者與產品的初次約會-上篇-理論與心法',
 'firstPublishedAtBucket': 'August 2018',
 'creatorUrl': 'http://medium.com/u/ca09b47c1f3e',
 'claps': 1108,
 'tags': ['設計', '設計思考', 'ux', '产品设计', '成長駭客'],
 'postId': 'ba932933d048',
 'views': 830,
 'firstPublishedAtTime': '2018-08-07 15:58:50',
 'latestPublishedAtTime': '2018-08-08 01:54:08',
 'reads': 321,
 'upvotes': 89,
 'firstPublishedAt': 1533628729900,
 'url': 'http://medium.com/p/ba932933d048',
 'friendsLinkViews': 0,
 'collectionId': '1e6cb3374f6'}

In [95]:
with open('2018-08-09_stories_list.json','r') as f:
    l = json.load(f)
f.close()

In [113]:
l['stories'][0]

{'createdAt': 1533626576651,
 'internalReferrerViews': 0,
 'creatorId': 'ca09b47c1f3e',
 'title': '使用者與產品的初次約會（上篇） — 理論與心法',
 'createdAtTime': '2018-08-07 15:22:57',
 'dataUpdateAt': '2018-08-09 11:45:18',
 'readingTime': 1.5198113207547168,
 'slug': '使用者與產品的初次約會-上篇-理論與心法',
 'firstPublishedAtBucket': 'August 2018',
 'creatorUrl': 'http://medium.com/u/ca09b47c1f3e',
 'claps': 1108,
 'tags': ['設計', '設計思考', 'ux', '产品设计', '成長駭客'],
 'postId': 'ba932933d048',
 'views': 830,
 'firstPublishedAtTime': '2018-08-07 15:58:50',
 'latestPublishedAtTime': '2018-08-08 01:54:08',
 'reads': 321,
 'upvotes': 89,
 'firstPublishedAt': 1533628729900,
 'url': 'http://medium.com/p/ba932933d048',
 'friendsLinkViews': 0,
 'collectionId': '1e6cb3374f6'}

### 把文章詞表存到 storywordsi 的變數中
```
for i in range(3):
    filename = 'story'+ str(i+1) +'.txt'
    f = open(filename,'r')
    story = f.read()
    f.close()
    
    story = Converter('zh-hant').convert(story)
    
    wordlist = jieba.cut(story,cut_all = True)
    locals()['storywords' + str(i)] = ''
    for word in wordlist:
        if "\n" in word or word in stopwords:
            continue
        else:
            locals()['storywords' + str(i)] += word
            locals()['storywords' + str(i)] += " "
```