In [None]:
import pandas as pd

# Load the labelled comment table from the Excel workbook. Later cells read its
# 'comment', 'target', 'timestamp' and 'sellerId' columns.
data = pd.read_excel('../data/data.xlsx')
# Preview the first rows.
data.head()


In [None]:
import re

# Normalise the raw 'comment' column before any tokenisation.
#
# A cell that pandas reads as missing (NaN) or as a non-string value (for
# example a purely numeric comment) has no usable `.replace`, so calling it
# directly raises AttributeError. Every value is therefore coerced to text
# first; missing values become the empty string.
comment_text = data['comment'].apply(lambda x: '' if pd.isna(x) else str(x))

# Drop the literal 'text：' marker.
comment_text = comment_text.apply(lambda x: x.replace('text：', ''))

# Remove every run of characters that is not a CJK ideograph in
# U+4E00..U+9FD5, an ASCII digit, or one of the listed punctuation marks.
data['comment'] = comment_text.apply(
    lambda x: re.sub('[^\u4E00-\u9FD5,.?!，。！？、；;:：0-9]+', '', x))
data.head()

In [None]:
import matplotlib.pyplot as plt

def make_autopct(values):
    """Build an ``autopct`` callback that labels a pie wedge with share and count.

    Args:
        values: Iterable of the numeric wedge sizes given to ``plt.pie``.

    Returns:
        A function that takes a percentage ``pct`` and returns a string such
        as ``'25.00% (1)'``: the percentage with two decimals, followed by the
        absolute count recovered from ``pct`` and the sum of ``values``.
    """
    def _wedge_label(pct):
        # Recover the absolute count behind this percentage.
        count = int(round(pct * sum(values) / 100.0))
        # Show both the share and the absolute value on the pie chart.
        return '{p:.2f}% ({v:d})'.format(p=pct, v=count)

    return _wedge_label

# Count comments per sentiment label: target == 0 is shown as '积极',
# every other value as '消极'.
sentiment_names = data['target'].map(lambda t: '积极' if t == 0 else '消极')
num = sentiment_names.value_counts()

plt.figure(figsize=(4, 4))
# Font setting so the Chinese labels and title can be rendered.
plt.rcParams['font.sans-serif'] = 'Simhei'
plt.pie(
    num,
    labels=num.index,
    autopct=make_autopct(num),
)
plt.title('餐品积极/消极评论标签')
plt.show()


In [None]:
import jieba
import itertools

# Read the stop-word file and split it on any whitespace into a list of tokens.
with open('../stopword/stopword-cn.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = stopword_file.read().split()

# A whitespace split can never yield ' ' or '\n' themselves, so they are added
# explicitly for the token filters used in later cells.
stopwords.extend([' ', '\n'])

# Split the comments by label: target == 1 feeds the negative subset,
# target == 0 the positive subset.
data_neg = data[data['target'] == 1]
data_pos = data[data['target'] == 0]

# `stopwords` is a list, so `token not in stopwords` scans it linearly for
# every token of every comment. A frozenset gives constant-time lookups and
# yields exactly the same filtered tokens.
stopword_lookup = frozenset(stopwords)

# Tokenise each negative comment with jieba and drop stop words.
data_neg_cut = data_neg['comment'].apply(
    lambda text: [tok for tok in jieba.lcut(text) if tok not in stopword_lookup])
print(data_neg_cut.head())

# Same treatment for the positive comments.
data_pos_cut = data_pos['comment'].apply(
    lambda text: [tok for tok in jieba.lcut(text) if tok not in stopword_lookup])
print(data_pos_cut.head())

In [None]:
from wordcloud import WordCloud
import numpy as np
from PIL import Image

def show(wc, fn=None):
    """Display a word cloud and optionally save it to an image file.

    Args:
        wc: Object accepted by ``plt.imshow`` that also provides ``to_file``
            (the callers in this notebook pass a ``WordCloud``).
        fn: Path handed to ``wc.to_file``. Nothing is written when ``None``.
    """
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

    # Saving is optional and happens only after the figure has been shown.
    if fn is None:
        return
    wc.to_file(fn)

# Token frequencies over all positive comments (flattened list of tokens).
freq = pd.Series(list(itertools.chain.from_iterable(data_pos_cut))).value_counts()

# Pixel array of the image that is passed to WordCloud as its mask.
mask = np.array(Image.open('../stopword/China.jpg'))

# NOTE(review): font_path is an absolute Windows location, so this cell will
# presumably fail on machines without that font file. Consider making it
# configurable.
# NOTE(review): check the wordcloud documentation on how width/height interact
# with `mask`; they may have no effect when a mask is supplied.
wc = WordCloud(
    scale=4,
    width=2500,
    height=3000,
    font_path='C:/Windows/Fonts/simkai.ttf',
    background_color='White',
    mask=mask,
)
wc2 = wc.fit_words(freq)
show(wc2, '../stopword/wordcloud.png')

In [None]:
# Token frequencies for the positive and the negative comments.
freq_pos = pd.Series(
    list(itertools.chain.from_iterable(data_pos_cut))).value_counts()
freq_neg = pd.Series(
    list(itertools.chain.from_iterable(data_neg_cut))).value_counts()

# Both clouds are produced by refitting the same `wc` object, so each one is
# displayed and saved before the next fit.
# NOTE(review): if fit_words returns the WordCloud itself, wc2_pos and wc2_neg
# refer to the same object after this cell. Confirm before reusing wc2_pos.
wc2_pos = wc.fit_words(freq_pos)
show(wc2_pos, '../stopword/wordcloud_pos.png')

wc2_neg = wc.fit_words(freq_neg)
show(wc2_neg, '../stopword/wordcloud_neg.png')

In [None]:
# First 11 entries of the positive-token frequency table (positions 0..10).
# NOTE(review): 11 rows, not 10. Confirm this is the intended preview size.
freq_pos.iloc[:11]

In [None]:
# Monthly comment counts per sentiment, keyed by 'YYYY-MM' strings.
# pd.to_datetime makes this cell accept the same 'timestamp' inputs as the
# hourly plot, which also converts the column before using it.
comm_day_pos = (
    pd.to_datetime(data_pos['timestamp'])
    .dt.strftime('%Y-%m')
    .value_counts()
    .sort_index()
)
comm_day_neg = (
    pd.to_datetime(data_neg['timestamp'])
    .dt.strftime('%Y-%m')
    .value_counts()
    .sort_index()
)

# Both lines share one set of x positions and one set of tick labels, so the
# two series must cover the same months in the same order. Without alignment,
# a month missing from either series shifts that line against the tick labels.
# Months absent from one series are filled with a count of 0.
all_months = comm_day_pos.index.union(comm_day_neg.index)
comm_day_pos = comm_day_pos.reindex(all_months, fill_value=0)
comm_day_neg = comm_day_neg.reindex(all_months, fill_value=0)
month_positions = range(len(all_months))

plt.figure(figsize=(8,5))
plt.plot(month_positions, comm_day_pos, label='积极情绪评价')
plt.plot(month_positions, comm_day_neg, label='消极情绪评价')
plt.xticks(month_positions, all_months, rotation=45)
plt.grid()
plt.title('积极/消极评价随日期变化图')
plt.xlabel('日期')
plt.ylabel('用户评价数量')
plt.legend()


In [None]:
# Comment counts per hour of day for each sentiment.
# The x axis is fixed to hours 0..23 below, so every hour gets an explicit
# value. An hour without any comment is stored as 0 rather than left out,
# otherwise the line would be drawn straight across the gap and suggest a
# non-zero count for that hour.
hours_of_day = list(range(24))

comm_hour_pos = (
    pd.to_datetime(data_pos['timestamp'])
    .dt.hour
    .value_counts()
    .reindex(hours_of_day, fill_value=0)
)

comm_hour_neg = (
    pd.to_datetime(data_neg['timestamp'])
    .dt.hour
    .value_counts()
    .reindex(hours_of_day, fill_value=0)
)

plt.plot(comm_hour_pos.index, comm_hour_pos, label='积极情绪评价')
plt.plot(comm_hour_neg.index, comm_hour_neg, label='消极情绪评价')
plt.title('积极/消极评价随时刻变化图')
plt.xticks(ticks=range(0,24))
# NOTE(review): the y ticks are hard-coded to 0..900 in steps of 100; counts
# above that range get no tick marks or grid lines.
plt.yticks(ticks=range(0,1000,100))
plt.xlabel('时刻')
plt.ylabel('用户评价数量')
plt.grid()
plt.legend()

In [None]:
def autolabel(rects):
    """Write each bar's height, truncated to an integer, just above the bar.

    Args:
        rects: Iterable of bar objects exposing ``get_height``, ``get_x`` and
            ``get_width`` (the callers pass the result of ``plt.bar``).
    """
    for bar in rects:
        bar_height = bar.get_height()
        # Horizontal anchor: the bar centre shifted 0.3 to the left.
        label_x = bar.get_x() + bar.get_width() / 2. - 0.3
        # Vertical anchor: 3% above the top of the bar.
        label_y = 1.03 * bar_height
        plt.text(label_x, label_y, '%s' % int(bar_height),
                 size=18, family="Consolas")

# The ten sellers with the most positive comments. nlargest(10) already limits
# the series to at most ten rows, so no further slicing is needed.
best_sellers = data_pos['sellerId'].value_counts().nlargest(10)
best_positions = range(len(best_sellers))

best_bars = plt.bar(best_positions, best_sellers, label='积极评论数量')
autolabel(best_bars)
plt.xticks(best_positions, best_sellers.index, rotation=45)
plt.title('积极评论最多商家')
plt.grid()
plt.legend()


In [None]:
# Positive comments of one seller.
# NOTE(review): seller id 1041 is hard-coded; presumably it was read off the
# bar chart of sellers with the most positive comments. Confirm it still
# matches the data, or derive it from that ranking.
best_seller_comments = data_pos[data_pos['sellerId'] == 1041]['comment']

# Constant-time stop-word lookups instead of scanning the `stopwords` list for
# every token; the filtered tokens are identical.
best_seller_stopwords = frozenset(stopwords)
best_seller_comments_cut = best_seller_comments.apply(
    lambda text: [tok for tok in jieba.lcut(text) if tok not in best_seller_stopwords])

# Token frequencies across all of this seller's positive comments.
best_seller_comments_freq = pd.Series(
    list(itertools.chain.from_iterable(best_seller_comments_cut))).value_counts()

# Refit the shared `wc` object, then display and save the result.
best_seller_comments_wc2 = wc.fit_words(best_seller_comments_freq)
show(best_seller_comments_wc2, '../stopword/wordcloud_best_seller.png')

In [None]:
# NOTE(review): this repeats the `autolabel` definition from an earlier cell
# and silently rebinds the name; keeping a single definition would be cleaner.
def autolabel(rects):
    """Write each bar's height, truncated to an integer, just above the bar.

    Args:
        rects: Iterable of bar objects exposing ``get_height``, ``get_x`` and
            ``get_width`` (the callers pass the result of ``plt.bar``).
    """
    for bar in rects:
        bar_height = bar.get_height()
        # Horizontal anchor: the bar centre shifted 0.3 to the left.
        label_x = bar.get_x() + bar.get_width() / 2. - 0.3
        # Vertical anchor: 3% above the top of the bar.
        label_y = 1.03 * bar_height
        plt.text(label_x, label_y, '%s' % int(bar_height),
                 size=18, family="Consolas")

# The ten sellers with the most negative comments. nlargest(10) already limits
# the series to at most ten rows, so no further slicing is needed.
worst_sellers = data_neg['sellerId'].value_counts().nlargest(10)
worst_positions = range(len(worst_sellers))

worst_bars = plt.bar(worst_positions, worst_sellers, label='消极评论数量')
autolabel(worst_bars)
plt.xticks(worst_positions, worst_sellers.index, rotation=45)
plt.title('消极评论最多商家')
plt.grid()
plt.legend()

In [None]:
# Negative comments of one seller.
# NOTE(review): seller id 971 is hard-coded; presumably it was read off the
# bar chart of sellers with the most negative comments. Confirm it still
# matches the data, or derive it from that ranking.
worst_seller_comments = data_neg[data_neg['sellerId'] == 971]['comment']

# Constant-time stop-word lookups instead of scanning the `stopwords` list for
# every token; the filtered tokens are identical.
worst_seller_stopwords = frozenset(stopwords)
worst_seller_comments_cut = worst_seller_comments.apply(
    lambda text: [tok for tok in jieba.lcut(text) if tok not in worst_seller_stopwords])

# Token frequencies across all of this seller's negative comments.
worst_seller_comments_freq = pd.Series(
    list(itertools.chain.from_iterable(worst_seller_comments_cut))).value_counts()

# Refit the shared `wc` object, then display and save the result.
worst_seller_comments_wc2 = wc.fit_words(worst_seller_comments_freq)
show(worst_seller_comments_wc2, '../stopword/wordcloud_worst_seller.png')

In [None]:
from sklearn.model_selection import train_test_split

# Build the modelling frame: comment text plus a numeric label
# (0 for rows taken from data_pos, 1 for rows taken from data_neg).
#
# The comments are already strings, so joining their characters with '' would
# return the same text; the column is used as is.
data_new_pos = pd.DataFrame(
    {'comment': data_pos['comment'], 'label': 0}).reset_index(drop=True)

data_new_neg = pd.DataFrame(
    {'comment': data_neg['comment'], 'label': 1}).reset_index(drop=True)

# ignore_index gives the stacked frame a unique 0..n-1 index. Without it both
# halves keep their own 0-based labels and every label up to the shorter
# half's length appears twice, which makes label-based access ambiguous.
data_new = pd.concat([data_new_pos, data_new_neg], axis=0, ignore_index=True)
data_new.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer's default token_pattern only keeps tokens of two or more word
# characters, which would silently discard every single-character word that
# jieba produces. In Chinese those include many sentiment-bearing words, so
# the pattern is widened to accept tokens of one or more word characters.
vect = CountVectorizer(token_pattern=r'(?u)\b\w+\b')

# Segment each comment with jieba and re-join the tokens with spaces so the
# vectoriser can split them again on word boundaries.
data_new['comment'] = data_new['comment'].apply(
    lambda text: ' '.join(jieba.lcut(text)))

In [None]:
# Fraction of rows held out for evaluation.
test_ratio = 0.1

# Labels and dense bag-of-words features fitted on the full comment column.
Y_label = data_new['label']
X_comments = vect.fit_transform(data_new['comment']).toarray()

# random_state=1 fixes the shuffle used for the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_comments,
    Y_label,
    test_size=test_ratio,
    random_state=1,
)


In [None]:
# Shapes of the held-out features and labels returned by train_test_split.
X_test.shape, Y_test.shape

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# random_state pins the classifier's random initialisation and shuffling so a
# re-run of the notebook reproduces the same model and scores. The value
# matches the one used for the train/test split.
# NOTE(review): max_iter=3 stops training after three iterations, presumably
# to keep the cell fast. Confirm this is intended for the reported results.
mlp = MLPClassifier(max_iter=3, random_state=1)
mlp.fit(X_train, Y_train)
Y_pred = mlp.predict(X_test)

# accuracy_score takes (y_true, y_pred). The value is the same either way for
# accuracy, but the documented order is used.
score = accuracy_score(Y_test, Y_pred)
score

In [None]:
from sklearn.metrics import accuracy_score
import math

# Integer copies of the predictions and of the true labels.
y_pred = Y_pred.astype(np.int_)
y_test = Y_test.astype(np.int_)

# Element-wise masks for each predicted / actual class.
pred_is_pos, pred_is_neg = (y_pred == 1), (y_pred == 0)
true_is_pos, true_is_neg = (y_test == 1), (y_test == 0)

# Confusion-matrix cells, with label 1 treated as the positive class.
# tp uses a bitwise AND of the integer labels, which is 1 only where both are 1.
tp = sum(y_pred & y_test)
fp = sum(pred_is_pos & true_is_neg)
tn = sum(pred_is_neg & true_is_neg)
fn = sum(pred_is_neg & true_is_pos)

confusion_cells = (tp, fp, tn, fn)
print('TP = %s, FP = %s, TN = %s, FN = %s' % confusion_cells)

In [None]:
# accuracy_score takes (y_true, y_pred). The value is the same either way for
# accuracy, but the documented order is used.
accuracy = accuracy_score(Y_test, Y_pred)

# Plain Python ints: exact arithmetic of arbitrary size for the MCC product,
# and one well-defined behaviour for the zero checks below.
tp_n, fp_n, tn_n, fn_n = (int(count) for count in (tp, fp, tn, fn))

# Each ratio is undefined when its denominator is 0, e.g. when the model never
# predicts class 1. Unguarded, that gives NaN or a ZeroDivisionError depending
# on the integer type of the counts. Report 0.0 for these cases instead.
predicted_pos = tp_n + fp_n
actual_pos = tp_n + fn_n
precision = tp_n / predicted_pos if predicted_pos else 0.0
recall = tp_n / actual_pos if actual_pos else 0.0

pr_sum = precision + recall
f1_score = 2 * (precision * recall) / pr_sum if pr_sum else 0.0

# Matthews correlation coefficient, defined as 0 when any marginal is empty.
mcc_denominator = math.sqrt(
    (tp_n + fp_n) * (tp_n + fn_n) * (tn_n + fp_n) * (tn_n + fn_n))
mcc = (tp_n * tn_n - fp_n * fn_n) / mcc_denominator if mcc_denominator else 0.0

print('Accuracy = %s, Precision = %s, Recall = %s, F1_score = %s, MCC = %s' %(accuracy, precision, recall, f1_score, mcc))

In [None]:
data_test = pd.read_excel('../data/test.xlsx')
# Keep the untouched text; it is written back next to the predictions later.
comment_origin = data_test['comment'].copy()

# Apply the same cleaning as for the training comments.
# A cell that pandas reads as missing (NaN) or as a non-string value has no
# usable `.replace`, so every value is coerced to text first. Missing values
# become the empty string, which keeps one row (and one prediction) per input.
test_text = data_test['comment'].apply(lambda x: '' if pd.isna(x) else str(x))
test_text = test_text.apply(lambda x: x.replace('text：', ''))
test_text = test_text.apply(
    lambda x: re.sub('[^\u4E00-\u9FD5,.?!，。！？、；;:：0-9]+', '', x))

# Segment with jieba and join the tokens with spaces for the vectoriser.
data_test['comment'] = test_text.apply(lambda x: ' '.join(jieba.lcut(x)))
data_test.head()

In [None]:
# Encode the test comments with the already-fitted vectoriser (transform only,
# no refit) and convert the result to a dense array.
X_comments_test = vect.transform(data_test['comment']).toarray()
# Predict a label for every test comment with the trained MLP.
Y_pred_test = mlp.predict(X_comments_test)

In [None]:
# Attach the predicted labels as the 'target' column.
data_test['target'] = Y_pred_test
# Restore the original, uncleaned comment text before exporting.
data_test['comment'] = comment_origin
# Write the result to Excel without the index column.
data_test.to_excel('../data/test_out.xlsx', index=False)