In [30]:
# 文本处理+词云

In [None]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import jieba
import requests
from bs4 import BeautifulSoup
import re

import nltk
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

def send_request(url, timeout=10):
    """Fetch a web page and return the text of all its <p> elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests can block forever on an unresponsive host.

    Returns:
        A single string holding the text of every <p> tag in document
        order, each followed by a newline. Empty string if the page has
        no <p> tags.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response status.
    """
    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"}

    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of silently analysing the text
    # of an error page.
    response.raise_for_status()

    # Hand the raw bytes to BeautifulSoup so it can detect the encoding.
    soup = BeautifulSoup(response.content, "lxml")

    # Join the text of every paragraph, one paragraph per line.
    return ''.join(
        paragraph.get_text() + '\n' for paragraph in soup.find_all('p')
    )

def deal_page(text):
    """Segment text into words, print the top ten, and return all counts.

    Args:
        text: The text to segment with jieba.

    Returns:
        The nltk.FreqDist (a dict-like object) built from every segmented
        token that is at least two characters long.
    """
    # Segment in jieba's full mode (cut_all=True) and discard tokens
    # shorter than two characters.
    tokens = []
    for token in jieba.cut(text, cut_all=True):
        if len(token) < 2:
            continue
        tokens.append(token)

    # FreqDist takes the token list and yields a dict-like frequency table.
    frequencies = nltk.FreqDist(tokens)

    # The ten most frequent tokens, as (token, count) pairs.
    ranking = frequencies.most_common(10)

    print("统计高频词的排名：")
    for token, occurrences in ranking:
        print("{} : {}".format(token, occurrences))

    # Return the full distribution, not just the top ten.
    return frequencies

def word_could(freq_dist,
               image_path="./test.jpg",
               font_path=r'C:\Windows\Fonts\msyh.ttc',
               output_path="word.png"):
    """Draw a word cloud from word frequencies, show it, and save it.

    Args:
        freq_dist: Dict-like mapping of word -> frequency.
        image_path: Image used both as the cloud's mask and as the source
            of its colors.
        font_path: Font file used to render the words. The default is the
            Microsoft YaHei font at its Windows location; pass another
            path on other systems.
        output_path: File the finished word cloud is written to.
    """
    # Load the mask image as an array; the context manager closes the
    # underlying file handle once the pixels have been copied.
    with Image.open(image_path) as image:
        background_image = np.array(image)

    # Configure the word cloud.
    wc = WordCloud(font_path=font_path,  # font used to render the words
                   background_color="white",  # background color
                   max_words=200,  # maximum number of words shown
                   mask=background_image,  # image used as the mask
                   max_font_size=100,  # largest font size
    )

    # Build the layout from the frequency data (a dict-like object).
    wc.generate_from_frequencies(freq_dist)

    # Derive the word colors from the mask image.
    image_colors = ImageColorGenerator(background_image)

    plt.figure(figsize=(14, 9), dpi=100)
    plt.imshow(wc.recolor(color_func=image_colors))

    # Hide the axes and tick marks.
    plt.axis("off")
    plt.show()

    # Save the word cloud to disk.
    wc.to_file(output_path)


# Script entry point: download the page, count word frequencies, then draw
# and save the word cloud.
if __name__ == "__main__":
    # Page whose <p> text is analysed.
    url = 'http://www.gov.cn/premier/2017-03/16/content_5177940.htm'
    # Text of all paragraphs on the page.
    text = send_request(url)
    # Prints the top-ten ranking and returns the full frequency table.
    freq_dist = deal_page(text)
    # Displays the cloud and writes it to disk.
    word_could(freq_dist)