## 如何通过词向量获取相近意思的单词

In [3]:
"""
读取文件并替换掉NA
"""

import pandas as pd
csv_path = '/data/tusers/lixiangr/caill/NLP/data/sqlResult_1558435.csv'
# Decode the file as GB18030 and blank out missing cells in one step.
content = pd.read_csv(csv_path, encoding='gb18030').fillna("")
content.head()
content.columns.tolist()
# One list entry per row of the "content" column.
news_content = content["content"].tolist()

In [None]:
"""
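Define the regex-based tokenizer (used to strip special symbols) and the jieba word-segmentation helper.
The regex class \\w matches digits, upper/lower-case letters and the underscore (in Python 3 also CJK and other Unicode word characters);
the regex class \\d matches digits only.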
"""

import jieba
def cut(string):
    """Segment *string* with jieba and return the pieces joined by single spaces."""
    pieces = jieba.cut(string)
    return " ".join(pieces)


cut("这是一个测试例子，结果怎么样")

import re
def token(string):
    """Return every maximal run of word characters found in *string*.

    ``\\w`` already covers digits, so the former ``[\\d|\\w]`` class added
    nothing except a literal ``|``, which therefore leaked into the tokens.
    The pattern is now a raw string, so the backslash is no longer an invalid
    string escape.

    Args:
        string: text to scan.

    Returns:
        list[str]: the runs of word characters, in order of appearance;
        empty when *string* contains none.
    """
    return re.findall(r"\w+", string)


token("token('这是一个测试\n\n\n')")

In [None]:
"""
去除特殊符号，如"(", ")", ",", "\r", "\n"
将结巴分词得到的list进行连接得到纯文本
"""

# For every article: token() drops the special symbols, ' '.join() rebuilds
# plain text from the surviving pieces, and cut() segments that text.
news_content = [cut(' '.join(token(a))) for a in news_content]

# Write one segmented article per line.  The encoding is pinned to UTF-8 so
# the file does not depend on the platform locale; the corpus is read back
# through gensim's LineSentence, which decodes lines as UTF-8.
with open("/data/tusers/lixiangr/caill/NLP/data/news-sentences-cut.txt", "w", encoding="utf-8") as f:
    f.writelines(n + "\n" for n in news_content)

In [None]:
"""
Word2Vec的参数：
size: 是指输出的词的向量维数，默认为100。大的size需要更多的训练数据,但是效果会更好. 推荐值为几十到几百。
workers: 参数控制训练的并行数。
"""

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings; gensim 4
# renamed them to `vector_size=` and `wv.key_to_index` -- adjust when upgrading.
news_word2vec = Word2Vec(LineSentence("/data/tusers/lixiangr/caill/NLP/data/news-sentences-cut.txt"), size = 35, workers = 8)
len(news_word2vec.wv.vocab)

# Similarity queries go through `.wv`: calling most_similar() on the model
# object itself is deprecated in gensim 3.x and removed in gensim 4.
news_word2vec.wv.most_similar("葡萄牙", topn = 20)
news_word2vec.wv.most_similar("捷克", topn = 20)
news_word2vec.wv.most_similar("说", topn = 20)
news_word2vec.wv.most_similar("怒斥", topn = 20)
news_word2vec.wv.most_similar("认为", topn = 20)

#### 数据越多，效果就会越好

In [None]:
from collections import defaultdict

# Neighbour lists are memoised per word, so a word that is queued many times
# is only looked up in the model once.
def get_related_words(initial_words, model, max_size=500, topn=20):
    """Expand a set of seed words breadth-first through their nearest neighbours.

    Each word taken from the queue has its ``topn`` most similar words
    appended to the queue, and its own counter is incremented, so a word that
    is reached from many other words ends up with a high count.

    Args:
        initial_words: iterable of seed words; it is copied, never modified.
        model: a trained Word2Vec-style model (vectors under ``.wv``) or any
            object that itself provides ``most_similar(word, topn=...)``.
        max_size: stop once this many distinct words have been collected.
        topn: number of neighbours requested for each expanded word.

    Returns:
        defaultdict(int): word -> number of times it was taken from the queue.
    """
    from collections import deque  # local import keeps this block self-contained

    # Use the `model` argument (not a module-level global) for every lookup.
    vectors = getattr(model, "wv", model)

    unseen = deque(initial_words)  # private copy: the caller's list stays intact
    seen = defaultdict(int)
    neighbours = {}  # word -> neighbour list, or None when the lookup failed

    while unseen and len(seen) < max_size:
        if len(seen) % 50 == 0:
            print('seen length : {}'.format(len(seen)))

        node = unseen.popleft()
        if node not in neighbours:
            try:
                neighbours[node] = [s for s, _ in vectors.most_similar(node, topn=topn)]
            except KeyError:
                # The lookup rejected the word (e.g. a seed that is not in the
                # model's vocabulary): remember that and skip it.
                neighbours[node] = None
        if neighbours[node] is None:
            continue

        unseen.extend(neighbours[node])
        seen[node] += 1  # the increment can be replaced by a weight

    return seen

In [None]:
# Expand outward from two seed words through the trained model.
related_words = get_related_words(["说", "表示"], news_word2vec)
# Rank the collected words by their counter value, highest first.
sorted(related_words.items(), key = lambda x: x[1], reverse = True)

## TF-IDF关键词

In [None]:
import math

# Number of documents a word appears in; the more documents contain the
# word, the smaller its IDF becomes.
def document_frequency(word):
    """Count the entries of ``news_content`` for which ``word in entry`` holds.

    The test is Python's ``in`` operator applied to each entry, so for string
    entries it is a substring match rather than a whole-token match.
    """
    return sum(word in doc for doc in news_content)

# log10 of (total number of documents / number of documents containing the word)
def idf(word):
    """Return the inverse document frequency of *word* over ``news_content``.

    A word found in no document used to raise ZeroDivisionError.  Its document
    frequency is now floored at 1, which gives it the largest IDF the corpus
    can produce, log10(N).  An empty corpus yields 0.0 instead of an error.

    Args:
        word: the term to score.

    Returns:
        float: log10(N / max(df, 1)), or 0.0 when the corpus is empty.
    """
    total_documents = len(news_content)
    if total_documents == 0:
        return 0.0
    return math.log10(total_documents / max(document_frequency(word), 1))

# Raw count of a word inside one document.
def tf(word, document):
    """
    Return how many whitespace-separated tokens of @document equal @word.
    """
    return document.split().count(word)

"""
某个单词在目标文本中的出现次数（越多越重要）*总的文本数与出现这个单词的文本数的比值（越大就代表出现某个单词的文本数少）
返回的是目标文本中每一个单词的tf-idf值
"""
def get_keywords_of_a_ducment(document):
    """Return every distinct word of *document* with its TF-IDF score.

    All term frequencies are counted in a single pass over the document
    instead of re-splitting it once per distinct word.  Words with equal
    scores keep their order of first appearance, so the result no longer
    depends on set iteration order.

    Args:
        document: whitespace-separated tokens as one string.

    Returns:
        list[tuple[str, float]]: (word, tf * idf) pairs, highest score first.
    """
    from collections import Counter  # local import keeps this block self-contained

    term_counts = Counter(document.split())

    tfidf = [(w, count * idf(w)) for w, count in term_counts.items()]
    tfidf.sort(key=lambda pair: pair[1], reverse=True)

    return tfidf


# Spot checks of document_frequency / idf / tf on a few words and documents.
document_frequency('的')
news_content[0]
idf('的') < idf('小米')
# Raw text of row 11, followed by TF and IDF values for words checked against it.
content['content'][11]
tf('银行', news_content[11])
tf('创业板', news_content[11])
idf('创业板')
idf('银行')
idf('短期')
tf('短期', news_content[11])

news_content[0]
news_content[11]
# Profile the keyword extraction (IPython magic), then keep the ranked
# keywords of document 101 for later use.
%prun get_keywords_of_a_ducment(news_content[0])
machine_new_keywords = get_keywords_of_a_ducment(news_content[101])
news_content[101]

## 词云(WordCloud)

In [None]:
import wordcloud
import matplotlib.pyplot as plt
%matplotlib inline

# The font file can be downloaded from https://github.com/Computing-Intelligence/datasource
# (the font path is passed positionally here).
wc = wordcloud.WordCloud('/data/tusers/lixiangr/caill/NLP/data/datasource/SourceHanSerifSC-Regular.otf')

news_content[4]
help(wc.generate_from_frequencies)
# Turn the ranked (word, score) pairs of document 101 into a word -> score mapping.
machine_new_keywords_dict = {w: score for w, score in machine_new_keywords}
plt.imshow(wc.generate_from_frequencies(machine_new_keywords_dict))
# Ranked keywords of document 4.
shenzhen_social_news = get_keywords_of_a_ducment(news_content[4])
shenzhen_social_news

In [None]:
from PIL import Image
import numpy as np
police_mask = np.array(Image.open('/data/tusers/lixiangr/caill/NLP/week5_recode/wordvec.png'))
wordcloud_with_mask = wordcloud.WordCloud(font_path='/data/tusers/lixiangr/caill/NLP/data/datasource/SourceHanSerifSC-Regular.otf', mask=police_mask)

# Render the same top-20 keywords twice, first with the plain cloud and then
# with the masked one, saving each figure to its own PDF.
for cloud, pdf_path in (
    (wc, "/data/tusers/lixiangr/caill/NLP/week5_recode/wordcloud_1.pdf"),
    (wordcloud_with_mask, "/data/tusers/lixiangr/caill/NLP/week5_recode/wordcloud_2.pdf"),
):
    plt.switch_backend('agg')
    # Keep Chinese text and the minus sign rendering correctly.
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.imshow(cloud.generate_from_frequencies(dict(shenzhen_social_news[:20])))
    plt.savefig(pdf_path)
    plt.close()

## TF-IDF向量化

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import random

vectorized = TfidfVectorizer(max_features = 10000)

sample_num = 5000
sub_samples = news_content[:sample_num]
X = vectorized.fit_transform(sub_samples)
X.shape

# `vocabulary_` is a fitted attribute: it only exists after fit_transform(),
# so it is inspected here rather than right after construction.
vectorized.vocabulary_

# Positions of the non-zero entries of the first document's vector.
np.where(X[0].toarray())

document_id_1, document_id_2 = random.randint(0, 1000), random.randint(0, 1000)
news_content[document_id_1]
news_content[document_id_2]

vector_of_d_1 = X[document_id_1].toarray()[0]
vector_of_d_2 = X[document_id_2].toarray()[0]

random_choose = random.randint(0, 1000)
news_content[random_choose]

In [None]:
from scipy.spatial.distance  import cosine

def distance(v1, v2):
    """Return the cosine distance between *v1* and *v2* as computed by ``scipy.spatial.distance.cosine``."""
    result = cosine(v1, v2)
    return result

distance([1, 1], [2, 2])

distance(X[random_choose].toarray()[0], X[document_id_1].toarray()[0])
distance(X[random_choose].toarray()[0], X[document_id_2].toarray()[0])

# Rank every vectorised document by its distance to the reference document.
# The reference row is densified once instead of once per comparison, and the
# index range follows the matrix size instead of a hard-coded 5000.
reference_vector = X[random_choose].toarray()[0]
sorted(range(X.shape[0]), key = lambda i: distance(reference_vector, X[i].toarray()[0]))

# bin() returns the binary representation of an integer.
bin(49 & 38)

## 建立搜索引擎
- Input: Words
- Output: Documents

In [None]:
def naive_search(keywords):
    """Return the positions in ``news_content`` of the entries that contain every keyword.

    Each entry is tested with ``keyword in entry`` for all keywords, i.e. one
    pass over all D documents times w keywords: O(D * w).
    """
    matches = []
    for doc_id, doc in enumerate(news_content):
        if all(keyword in doc for keyword in keywords):
            matches.append(doc_id)
    return matches


news_ids = naive_search('美军 司令 航母'.split())

### Input word -> the documents which contain this word

In [None]:
from functools import reduce
from operator import and_

X.shape
# Dense term-document matrix: one row per vocabulary entry, one column per document.
transposed_x = X.transpose().toarray()
word_2_id = vectorized.vocabulary_
word_2_id['今天']
id_2_word = {d: w for w, d in word_2_id.items()}
# Row numbers are looked up through word_2_id instead of being hard-coded
# (previously 6195 / 7922 / 2769, presumably the ids one particular fit
# printed for these words), so the cell stays valid when the vocabulary changes.
set(np.where(transposed_x[word_2_id['今天']])[0])
word_2_id['美军']
word_2_id['司令']
usa_force = set(np.where(transposed_x[word_2_id['美军']])[0])
commander = set(np.where(transposed_x[word_2_id['司令']])[0])
# Documents that contain both words.
usa_force & commander
# reduce(and_, ...) intersects any number of sets.
d1, d2, d3 = {1, 2, 3}, {4, 5, 6, 3, 2}, {1, 3, 4}
reduce(and_, [d1, d2, d3])

def search_engine(query):
    """
    Return the ids of the documents that contain every query word, most similar first.

    @query is the searched words, split by whitespace
    @return is the list of matching document ids ranked by tf-idf cosine
            distance to the query (closest first); an empty list when the
            query is empty or contains a word outside the fitted vocabulary
    """
    words = query.split()
    if not words:
        # reduce() over an empty list would raise TypeError.
        return []

    try:
        candidates_ids = [word_2_id[w] for w in words]
    except KeyError:
        # The word has no row in the term-document matrix, so no document can
        # be matched on it.
        return []

    query_vec = vectorized.transform([' '.join(words)]).toarray()[0]
    documents_ids = [
        set(np.where(transposed_x[_id])[0]) for _id in candidates_ids
    ]
    # Only documents that contain all of the query words remain.
    merged_documents = reduce(and_, documents_ids)
    # toarray() returns a (1, n) matrix; take row 0 so that both arguments of
    # distance() are 1-D, as everywhere else in this file.
    sorted_docuemtns_id = sorted(merged_documents, key=lambda i: distance(query_vec, X[i].toarray()[0]))
    return sorted_docuemtns_id

# Vocabulary positions that are non-zero in the tf-idf vector of a sample query.
np.where(vectorized.transform(['美联储 加息 次数']).toarray()[0])
# Sample article text, printed as-is.
text = """新华社洛杉矶４月８日电（记者黄恒）美国第三舰队８日发布声明说，该舰队下属的“卡尔·文森”航母战斗群当天离开新加坡，改变原定驶往澳大利亚的任务计划，转而北上，前往西太平洋朝鲜半岛附近水域展开行动。\n　　该舰队网站主页发布的消息说，美军太平洋司令部司令哈里·哈里斯指示“卡尔·文森”航母战斗群向北航行。这一战斗群包括“卡尔·文森”号航空母舰、海军第二航空队、两艘“阿利·伯克”级导弹驱逐舰和一艘“泰孔德罗加”级导弹巡洋舰。\n　　“卡尔·文森”号航母的母港位于美国加利福尼亚州的圣迭戈，今年１月初前往西太平洋地区执行任务，并参与了日本及韩国的军事演习。\n　　美国有线电视新闻网援引美国军方官员的话说，“‘卡尔·文森’号此次行动是为了对近期朝鲜的挑衅行为作出回应”。（完）"""
print(text)

import re
text = """美国有线电视新闻网援引美国军方官员的话说"""
pat = r'(新闻|官员)'
# The replacement is a raw string so that `\g<1>` reaches the regex engine
# without relying on Python tolerating the invalid string escape `\g`.
re.compile(pat).sub(repl=r"**\g<1>**", string=text)
def get_query_pat(query):
    """Compile a pattern that matches any word of the whitespace-separated *query*.

    Every word is passed through ``re.escape`` so that it is matched
    literally; characters such as ``+``, ``.`` or ``(`` in a query no longer
    break or alter the pattern.

    Args:
        query: search words separated by whitespace.

    Returns:
        A compiled pattern with exactly one capture group.  For an empty or
        all-whitespace query the pattern never matches (it used to match the
        empty string at every position).
    """
    keywords = query.split()
    if not keywords:
        # `(?!)` can never match; the surrounding group keeps `\g<1>` valid in
        # replacement templates.
        return re.compile(r'((?!))')
    return re.compile('({})'.format('|'.join(re.escape(k) for k in keywords)))


get_query_pat('美军 司令 航母')

def highlight_keywords(pat, document):
    """Wrap every match of *pat* in *document* with ``**`` markers.

    Args:
        pat: compiled pattern whose first capture group is the text to keep.
        document: the text to annotate.

    Returns:
        str: *document* with each match replaced by ``**<group 1>**``.
    """
    # Raw string: `\g<1>` is a regex group reference, not a string escape.
    return pat.sub(repl=r"**\g<1>**", string=document)

# Demo: highlight the query words in the raw text of row 22987.
highlight_keywords(get_query_pat('美军 司令 航母'), content['content'][22987])

from IPython.display import display, Markdown

def search_engine_with_pretty_print(query):
    """Run search_engine(query) and display every hit as Markdown with the query words highlighted.

    Args:
        query: search words separated by whitespace.
    """
    candidates_ids = search_engine(query)
    # The pattern depends only on the query, so it is built once rather than
    # once per result.
    query_pat = get_query_pat(query)
    for i, _id in enumerate(candidates_ids):
        title = '## Search Result {}'.format(i)
        c = highlight_keywords(query_pat, content['content'][_id])

        display(Markdown(title + '\n' + c))
        
search_engine_with_pretty_print('春节 假期')
# NOTE: a bare `search_engine()` call used to sit here; `query` is a required
# argument, so it raised TypeError and aborted the rest of the cell.
#%%timeit
search_engine('美联储 加息 次数')

content['content'][2189]

## PageRank

In [None]:
import networkx as nx
import random
from string import ascii_uppercase
ascii_uppercase


def genearte_random_website():
    """Return a random host name: 3 to 5 uppercase ASCII letters, a dot, then 'com', 'cn' or 'net'."""
    name_length = random.randint(3, 5)
    letters = [random.choice(ascii_uppercase) for _ in range(name_length)]
    suffix = random.choice(['com', 'cn', 'net'])
    return ''.join(letters) + '.' + suffix


genearte_random_website()
websites = [genearte_random_website() for _ in range(25)]

websites

random.sample(websites, 10)

# Give five of the sites a random sample of neighbour sites (site position,
# number of neighbours).
website_connection = {
    websites[source]: random.sample(websites, link_count)
    for source, link_count in ((0, 10), (1, 5), (3, 7), (4, 2), (5, 1))
}

# NOTE(review): Graph is undirected, so every listed pair becomes a two-way
# connection -- confirm that this is intended for the PageRank demo.
website_network = nx.graph.Graph(website_connection)
plt.figure(3, figsize=(12, 12))
nx.draw_networkx(website_network, font_size=10)

# Sites ordered by PageRank score, highest first.
sorted(nx.pagerank(website_network).items(), key=lambda x: x[1], reverse=True)