In [None]:
import pandas as pd

# Load the Excel workbook into a DataFrame and preview its first five rows.
data = pd.read_excel(io='../data/data.xlsx')
data.head(n=5)


In [None]:
import re

# Normalise the column first: missing cells become empty strings and any
# non-string cell is converted to text, so the string operations below cannot
# raise on NaN or numeric values.
comment_text = data['comment'].fillna('').astype(str)

# Drop the literal 'text：' marker wherever it occurs (plain substring, not a regex).
comment_text = comment_text.str.replace('text：', '', regex=False)

# Remove every run of characters that is neither in the range U+4E00-U+9FD5,
# nor a digit, nor one of the listed ASCII / full-width punctuation marks.
# The pattern is compiled once instead of being re-parsed for every row.
unwanted_chars = re.compile('[^\u4E00-\u9FD5,.?!，。！？、；;:：0-9]+')
data['comment'] = comment_text.map(lambda text: unwanted_chars.sub('', text))
data.head()

In [None]:
import matplotlib.pyplot as plt

def make_autopct(values):
    """Build an ``autopct`` callback that shows both the percentage and the count.

    The returned function is meant to be passed as ``autopct`` to ``plt.pie``:
    it receives a wedge's percentage and renders it together with the absolute
    value recovered from the overall total.

    Args:
        values: Iterable of the numeric values being plotted. It is summed
            exactly once, so a one-shot iterator works as well as a list.

    Returns:
        A function ``pct -> str`` producing text such as ``'25.00% (1)'``.
    """
    # The total does not change between wedges, so compute it once here
    # instead of on every callback invocation.
    total = sum(values)

    def my_autopct(pct):
        # Convert the percentage back to the (rounded) absolute count.
        val = int(round(pct * total / 100.0))
        return '{p:.2f}% ({v:d})'.format(p=pct, v=val)

    return my_autopct

# Turn each target code into a label (0 -> '积极', any other value -> '消极')
# and count how many rows carry each label.
num = (
    data['target']
    .apply(lambda code: '积极' if code == 0 else '消极')
    .value_counts()
)

plt.figure(figsize=(4,4))
# NOTE(review): presumably SimHei is selected so the Chinese labels and title
# render correctly - confirm the font is installed on the target machine.
plt.rcParams['font.sans-serif'] = 'Simhei'
# Each wedge is annotated with its percentage and absolute count.
plt.pie(num, labels=num.index, autopct=make_autopct(num))
plt.title('餐品积极/消极评论标签')
plt.show()


In [None]:
import jieba
import itertools

# Load the stop-word list: the file is read as UTF-8 and split on whitespace.
with open('../stopword/stopword-cn.txt','r', encoding = 'utf-8') as f:
    stopwords = f.read().split()

# str.split() never yields whitespace-only entries, so the space and newline
# tokens are added explicitly.
stopwords.append(' ')
stopwords.append('\n')

# A list membership test scans the whole list for every token; a set gives
# constant-time lookups. `stopwords` itself is kept as a list.
stopword_set = set(stopwords)


def cut_and_filter(comments, stop_tokens):
    """Segment each comment with ``jieba.lcut`` and drop stop tokens.

    Args:
        comments: pandas Series of comment strings.
        stop_tokens: Container of tokens to discard (a set is recommended).

    Returns:
        A Series with the same index whose values are lists of kept tokens.
    """
    return comments.apply(
        lambda text: [token for token in jieba.lcut(text) if token not in stop_tokens]
    )


# Split the rows by target code: 1 feeds the "neg" group, 0 the "pos" group.
data_neg = data[data['target'] == 1]
data_pos = data[data['target'] == 0]

data_neg_cut = cut_and_filter(data_neg['comment'], stopword_set)
print(data_neg_cut.head())

data_pos_cut = cut_and_filter(data_pos['comment'], stopword_set)
print(data_pos_cut.head())

In [None]:
from wordcloud import WordCloud
import numpy as np
from PIL import Image

def show(wc, fn=None):
    """Display a word cloud with matplotlib and optionally save it to disk.

    Args:
        wc: Object accepted by ``plt.imshow``; when ``fn`` is given it must
            also provide a ``to_file`` method.
        fn: Output path for the image, or ``None`` to skip saving.
    """
    plt.imshow(wc)
    plt.axis('off')  # hide the axes around the image
    plt.show()
    if fn is None:
        return
    wc.to_file(fn)

# Flatten the per-comment token lists of the positive group and count how
# often each token occurs.
freq = pd.Series(list(itertools.chain.from_iterable(data_pos_cut))).value_counts()

# The image is converted to an array and passed to WordCloud as its mask.
mask = np.array(Image.open('../stopword/China.jpg'))

# NOTE(review): font_path is an absolute Windows path - this cell will likely
# not run unchanged on other systems; confirm or make it configurable.
# NOTE(review): the wordcloud docs state width/height are ignored when a mask
# is supplied - confirm whether these two values are still needed.
wc = WordCloud(
    font_path='C:/Windows/Fonts/simkai.ttf',
    background_color='White',
    mask=mask,
    width=2500,
    height=3000,
    scale=4,
)
wc2 = wc.fit_words(freq)
show(wc2, fn='../stopword/wordcloud.png')

In [None]:
import copy

# Token frequencies for the positive and the negative comment groups.
freq_pos = pd.Series(list(itertools.chain.from_iterable(data_pos_cut))).value_counts()
freq_neg = pd.Series(list(itertools.chain.from_iterable(data_neg_cut))).value_counts()

# WordCloud.fit_words() stores the layout on the instance it is called on and
# returns that same instance. Fitting the shared `wc` twice would leave both
# result names bound to one object holding only the last layout, so each cloud
# is fitted on its own shallow copy of the configured `wc`.
wc2_pos = copy.copy(wc).fit_words(freq_pos)
show(wc2_pos, '../stopword/wordcloud_pos.png')

wc2_neg = copy.copy(wc).fit_words(freq_neg)
show(wc2_neg, '../stopword/wordcloud_neg.png')