In [85]:
import json
import pickle
from collections import Counter

import jieba
import numpy as np
import requests

class Ltn_crawler(object):
    """Crawler for the LTN breaking-news AJAX endpoint.

    Instance fields:
      category -- category slugs substituted into the URL template.
      url      -- URL template with ``{category}`` and ``{page}`` placeholders.
      timeout  -- seconds to wait for each HTTP response.
    """

    def __init__(self, timeout=10):
        """Store the category list, the URL template and the request timeout."""
        self.category = ['politics', 'society', 'life', 'world', 'novelty']
        self.url = "https://news.ltn.com.tw/ajax/breakingnews/{category}/{page}"
        # Without a timeout, requests.get() can block forever on a stalled connection.
        self.timeout = timeout

    def request_url(self, url):
        """Fetch ``url`` and return its body parsed as JSON.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            ValueError: if the body is not valid JSON.
        """
        req = requests.get(url, timeout=self.timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
        req.raise_for_status()
        content = req.content.decode("UTF-8")
        return json.loads(content)

    def crawl_page(self, max_pages=25):
        """Download pages ``1..max_pages`` of every category.

        Args:
            max_pages: number of pages requested per category (default 25,
                the range the notebook originally hard-coded).

        Returns:
            A flat list with the entries found under each response's
            ``'data'`` key, in request order.
        """
        news_items = []
        for category in self.category:
            for page in range(1, max_pages + 1):
                page_news = self.request_url(self.url.format(category=category, page=page))
                data = page_news['data']
                if not data:
                    # Empty page: nothing to collect, keep going with the next one.
                    continue
                if isinstance(data, list):
                    news_items.extend(data)
                else:
                    # 'data' may also arrive as a mapping; only its values are kept.
                    news_items.extend(data.values())
        return news_items

## 爬取相關資料

In [61]:
# Instantiate the crawler and download every configured category/page.
# crawl_page() issues live HTTP requests, so this cell needs network access.
crawler = Ltn_crawler()
news_data = crawler.crawl_page()

In [71]:
# Tokenise every headline with jieba (cut_all=False); one token list per news item.
cut_word = [list(jieba.cut(item['title'], cut_all=False)) for item in news_data]

In [151]:
# Vocabulary: every distinct token across all headlines, sorted so that a
# token's position can be used as a stable column index.
word_set = sorted({token for tokens in cut_word for token in tokens})

In [154]:
# Count every token in one pass with a Counter instead of re-flattening the
# corpus and calling list.count() once per vocabulary entry.
# sorted() is stable, so words with equal counts stay in word_set order.
token_counts = Counter(token for tokens in cut_word for token in tokens)
word_freq = sorted(((word, token_counts[word]) for word in word_set), key=lambda x: x[1], reverse=True)

In [155]:
# Inspect the ranking; in the captured output the top entries are whitespace
# and punctuation tokens, not content words.
word_freq

[(' ', 2260),
 ('「', 539),
 ('」', 539),
 ('：', 474),
 ('！', 370),
 ('、', 180),
 ('？', 152),
 ('》', 146),
 ('被', 125),
 ('長', 107),
 ('的', 95),
 ('人', 91),
 ('2', 79),
 ('了', 79),
 ('網友', 77),
 ('中國', 75),
 ('年', 69),
 ('不', 67),
 ('在', 65),
 ('是', 64),
 ('6', 63),
 ('陳', 63),
 ('3', 60),
 ('中', 60),
 ('韓國瑜', 58),
 ('月', 55),
 ('1', 52),
 ('...', 50),
 ('武漢', 50),
 ('肺炎', 49),
 ('香港', 49),
 ('/', 48),
 ('補選', 48),
 ('高雄市', 48),
 ('5', 47),
 ('台', 47),
 ('與', 47),
 ('4', 46),
 ('台灣', 46),
 ('萬', 46),
 ('對', 45),
 ('他', 43),
 ('我', 42),
 ('高市', 42),
 ('7', 39),
 ('8', 39),
 ('大', 39),
 ('後', 39),
 ('男', 39),
 ('︰', 39),
 ('再', 38),
 ('源', 38),
 ('10', 37),
 ('《', 36),
 ('到', 36),
 ('復', 36),
 ('軍', 36),
 ('黃', 36),
 ('國民黨', 35),
 ('有', 34),
 ('會', 33),
 ('疫情', 33),
 ('警察', 33),
 ('遭', 33),
 ('高雄', 33),
 ('就', 32),
 ('美國', 32),
 ('讓', 32),
 ('韓粉', 32),
 ('日', 30),
 ('日本', 30),
 ('美', 30),
 ('\r', 29),
 ('也', 29),
 ('嗆', 29),
 ('9', 28),
 ('台南', 28),
 ('崑', 28),
 ('議員', 28),
 ('軍情', 28),
 (

## 文字-文章向量表示

In [90]:
# Bag-of-words matrix: one row per headline, one column per vocabulary word,
# each cell holding how often that word occurs in that headline.
article_rows = []
for tokens in cut_word:
    article_rows.append([tokens.count(term) for term in word_set])
all_article_vector = np.array(article_rows)

## 經由將文章轉換為向量後，透過cosine similarity 計算文章相似度

In [119]:
from sklearn.metrics.pairwise import cosine_similarity

def article_title_similarity(article_index, top=5):
    """Recommend the headlines most similar to one headline (bag-of-words cosine).

    Args:
        article_index: position of the query article in ``news_data`` and
            ``all_article_vector``.
        top: size of the ranking including the query itself, so ``top - 1``
            recommendations are listed.

    Returns:
        A display string: the query headline followed by one line per
        recommendation with its cosine similarity.

    Raises:
        IndexError: if ``article_index`` is out of range.
    """
    # Normalise negative indices and fail fast on out-of-range ones.
    article_index = range(len(all_article_vector))[article_index]
    # One vectorised call scores the query against every article, instead of
    # one cosine_similarity() call per article.
    scores = cosine_similarity([all_article_vector[article_index]], all_article_vector)[0]
    # Exclude the query explicitly rather than assuming it sorts first: an
    # all-zero vector scores 0 against itself, and an identical vector at a
    # lower index would otherwise take its place.
    candidates = [i for i in range(len(scores)) if i != article_index]
    # sorted() is stable, so equally similar articles keep index order.
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)[:max(top - 1, 0)]
    lines = [' 相似度：'.join([news_data[i]['title'], str(scores[i])]) for i in ranked]
    return "選擇標題：{} \n推薦選項：\n{}".format(news_data[article_index]['title'], '\n'.join(lines))

In [120]:
# Show the bag-of-words recommendations for the article at index 50.
print(article_title_similarity(50))

選擇標題：韓國瑜解職 新聞局粉專LOGO換回前朝「彩帶高」 
推薦選項：
韓國瑜「真愛高雄  珍重再見」音樂會 湧入逾8千粉絲 相似度：0.3779644730092272
中壢突冒「天坑」 轎車慘變「卡車」 相似度：0.372677996249965
近千韓粉頂大太陽守候韓國瑜「畢業典禮」 聽「放心去飛」想哭 相似度：0.3611575592573077
韓國瑜今天「畢業音樂會」 一早韓粉先卡位 相似度：0.3481553119113957


## 透過文章的關係，來做詞相似度

In [129]:
def get_index_vec(index, vec):
    """Return element ``index`` of every row in ``vec`` as a list (one column)."""
    column = []
    for row in vec:
        column.append(row[index])
    return column

def word_similarity(word="柯p", top=5):
    """Print the words whose per-article counts are closest to those of ``word``.

    Each word is represented by its column of ``all_article_vector`` (its
    count in every article); columns are compared with cosine similarity.

    Args:
        word: query word; must be present in ``word_set``.
        top: number of (word, similarity) pairs to print, the query included.

    Returns:
        None. The ranking is printed; an unknown word prints a notice instead.
    """
    if word not in word_set:
        print("沒有這個詞")
        return None
    index = word_set.index(word)
    # Transpose once so each row is a word, then score the query against all
    # words in a single call instead of one cosine_similarity() call per word.
    word_columns = all_article_vector.T
    scores = cosine_similarity(word_columns[index:index + 1], word_columns)
    # sorted() is stable, so equally similar words keep word_set order.
    ranked = sorted(range(len(word_set)), key=lambda i: scores[0, i], reverse=True)[:top]
    # Keep each score as a 1x1 array so the printed format is unchanged.
    answer = [(word_set[i], scores[:, i:i + 1]) for i in ranked]
    print(answer)

In [131]:
# Words that appear in the same articles as the query word.
word_similarity(word="中國學生")

[('中國學生', array([[1.]])), ('工具', array([[1.]])), ('當作', array([[1.]])), ('英議員', array([[1.]])), ('提議', array([[0.57735027]]))]


## 透過字詞間先後關係來表示向量

In [148]:
def get_word_index(word):
    """Return the position of ``word`` in ``word_set`` (ValueError if absent)."""
    position = word_set.index(word)
    return position
# Successor-count matrix: all_word_vec[i][j] is how many times word_set[j]
# immediately follows word_set[i] inside a headline.
# Built in a single pass over adjacent token pairs with a dict lookup, instead
# of rescanning the whole corpus (plus a linear word_set.index) per word.
word_position = {token: position for position, token in enumerate(word_set)}
all_word_vec = [[0] * len(word_set) for _ in word_set]
for tokens in cut_word:
    # zip stops at the shorter sequence, so the last token has no successor.
    for current, following in zip(tokens, tokens[1:]):
        all_word_vec[word_position[current]][word_position[following]] += 1


In [149]:
def vec_word_similarity(word="你", top=5):
    """Print the words whose successor-count vectors are closest to ``word``'s.

    Vectors come from ``all_word_vec`` and are compared pairwise with cosine
    similarity. Prints the ``top`` (word, similarity) pairs, the query
    included; an unknown word prints a notice instead. Returns None.
    """
    if word not in word_set:
        print("沒有這個詞")
        return None
    target_vec = all_word_vec[word_set.index(word)]
    scored = []
    for position in range(len(word_set)):
        scored.append((position, cosine_similarity([target_vec], [all_word_vec[position]])))
    # list.sort is stable, like sorted(): ties stay in word_set order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    print([(word_set[position], score) for position, score in scored[:top]])

In [156]:
# Words that tend to be followed by the same words as the query word.
vec_word_similarity(word="中國")

[('中國', array([[1.]])), ('日環食', array([[0.38924947]])), ('單', array([[0.34465617]])), ('福', array([[0.34377583]])), ('反共', array([[0.34112115]]))]


## TF-IDF

In [160]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Re-join the jieba tokens with spaces so CountVectorizer can split them again,
# then turn the raw counts into TF-IDF weights.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more characters, so single-character words are presumably dropped
# here; confirm this is intended for Chinese text.
joined_titles = [' '.join(tokens) for tokens in cut_word]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(joined_titles)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)

In [161]:
# Dense view of the TF-IDF matrix; the truncated printout shows only zeros.
print(tfidf.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [165]:
def article_tfidf_title_similarity(article_index, top=5):
    """Recommend the headlines most similar to one headline (TF-IDF cosine).

    Args:
        article_index: position of the query article in ``news_data`` and in
            the rows of ``tfidf``.
        top: size of the ranking including the query itself, so ``top - 1``
            recommendations are listed.

    Returns:
        A display string: the query headline followed by one line per
        recommendation with its cosine similarity.

    Raises:
        IndexError: if ``article_index`` is out of range.
    """
    # Normalise negative indices and fail fast on out-of-range ones.
    article_index = range(tfidf.shape[0])[article_index]
    # Score the query row against every row in one call. The previous version
    # called tfidf.toarray() inside the comprehension, densifying the whole
    # matrix once per article.
    scores = cosine_similarity(tfidf[[article_index]], tfidf)[0]
    # Exclude the query explicitly rather than assuming it sorts first: an
    # all-zero row scores 0 against itself.
    candidates = [i for i in range(len(scores)) if i != article_index]
    # sorted() is stable, so equally similar articles keep index order.
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)[:max(top - 1, 0)]
    lines = [' 相似度：'.join([news_data[i]['title'], str(scores[i])]) for i in ranked]
    return "選擇標題：{} \n推薦選項：\n{}".format(news_data[article_index]['title'], '\n'.join(lines))

In [166]:
# Same query article as the bag-of-words demo, now ranked with TF-IDF weights.
print(article_tfidf_title_similarity(50))

選擇標題：韓國瑜解職 新聞局粉專LOGO換回前朝「彩帶高」 
推薦選項：
解職後首日  韓國瑜搭友人車6度弔唁許崑源 相似度：0.2245385234405864
解職首日 韓國瑜第六度弔唁許崑源 相似度：0.2158675899599956
韓國瑜解職首日 8點8分搭車外出 相似度：0.20797013294255576
韓國瑜被罷免解職 國民黨議員李雅靜表態參加高市長補選 相似度：0.17789804868164724
