In [156]:
import pandas as pd
import jieba.posseg as pseg
import jieba
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import NMF
import requests
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
import math
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.metrics import roc_curve

import os

%matplotlib inline

In [157]:
!pip install opencc



In [158]:
#importlib.reload(sys)
#sys.setdefaultencoding('utf8')
#Since the default on Python 3 is UTF-8 already, there is no point in leaving those statements in.

In [159]:
stopwords = list()
for file in os.listdir("./stop_common/"):
    if ".DS_" not in file:
        with open("./stop_common/" + file, "r", encoding="utf-8") as f:
            for line in f.readlines():
                stopwords.append(line.strip())
print(len(stopwords))

# 对停用词表进行去重：求集合（去重），再转换为列表
stopwords = list(set(stopwords))
print(len(stopwords))

7935
2431


In [160]:
# 构建自定义词典
selfdict = list()
with open("./selfdefineddict.txt", "r", encoding = 'utf-8') as f:
    for line in f.readlines():
        selfdict.append(line.strip())
print(len(selfdict))

selfdict = list(set(selfdict))
print(len(selfdict))
with open("./selfdefineddict_new.txt", "w", encoding = "utf-8") as t:
    for word in selfdict:
        t.write(word + " n" + "\n")

# 引入自定义词表，需要注意：使用 jieba 自带的并行分词模式（enable_parallel）时，自定义词表会失效
# 如果要提升分词效率，请自己设计多线程
jieba.load_userdict("./selfdefineddict_new.txt")

435
406


In [161]:
# 定义分词函数
def tokenize(text):
    kept_words = list()
    for word, flag in pseg.cut(text):
        # 过滤停用词和词数小于1的词
        if (word not in stopwords) and \
        (len(word) > 1):
            kept_words.append(word)
    return " ".join(kept_words)

In [162]:
# 读取训练集并进行清洗
# 修改训练集标注方式
codesample = pd.read_csv('./mytrainingset_3240.csv')
try:
    codesample['parsed']
except:
    codesample['parsed'] = codesample.content.apply(tokenize)

codesample['attitude'] = codesample['attitude'].astype(str)
codesample.head()

Unnamed: 0.1,Unnamed: 0,recordid,content,attitude,parsed
0,1.0,221591.0,本来气候问题就是忽悠发展中国家去工业化的(╯3╰),2,本来 气候 忽悠 发展中国家 去工业化
1,2.0,221595.0,你们不觉得空气质量越来越差了吗……？,1,空气质量 越来越差
2,3.0,221603.0,这个法国总统就是垃圾。试目以待。现在看起来支持的人最多。但他走的路可并不是光明的。,4,法国 总统 垃圾 试目 以待 支持 光明
3,4.0,221605.0,不懂弹幕为啥嘲讽马克龙，要是勒庞那个极右上台，你以为黄种人对他们来说跟穆斯林有区别么，法国的...,4,弹幕 为啥 嘲讽 马克龙 勒庞 极右 上台 黄种人 穆斯林 区别 法国 亚裔 遭殃
4,5.0,221606.0,(-_-#)美国人的确需要环保了，人均资源消耗量是中国的数倍,3,美国 环保 人均 资源 消耗量 中国 数倍


In [163]:
# 将训练集文本向量化
cv = CountVectorizer()
codesample['parsed'] = codesample.parsed.fillna('')
X = cv.fit_transform(codesample.parsed)

In [164]:
# 读取全样本数据
full = pd.read_csv('comment_merge.csv')
full.rename(columns = {"评论内容": "comment_content", "回复内容": "reply_content"}, inplace = True)
full.head()

Unnamed: 0,recordid,<fullpath>,<realpath>,<pageno>,<createdate>,videoname,一级分类,二级分类,up主,播放数,...,视频发布时间,comment_content,评论作者,评论时间,评论点赞数,来自客户端,reply_content,回复时间,回复作者,点赞数
0,380465,https://www.bilibili.com/video/BV1cJ41157FP?fr...,https://www.bilibili.com/video/BV1cJ41157FP?fr...,0,2021-12-26 12:55:09.0,来了！澳大利亚最大军舰驶入火场,,,环球网\n \n,15.9万播放 ·,...,2020-01-07 15:23:34,派出30名消防员把我给整乐了加拿大,做一个受小姐姐欢迎的人,2020-01-07 15:57,2408.0,,塑料英联邦情,2020-01-07 19:48,土拨鼠暴打藏狐,140
1,380466,https://www.bilibili.com/video/BV1cJ41157FP?fr...,https://www.bilibili.com/video/BV1cJ41157FP?fr...,0,2021-12-26 12:55:09.0,来了！澳大利亚最大军舰驶入火场,,,环球网\n \n,15.9万播放 ·,...,2020-01-07 15:23:34,派出30名消防员把我给整乐了加拿大,做一个受小姐姐欢迎的人,2020-01-07 15:57,2408.0,,说不定是派了30名公款旅游的大爷来的。,2020-01-07 20:55,伪君子-真小人,129
2,380467,https://www.bilibili.com/video/BV1cJ41157FP?fr...,https://www.bilibili.com/video/BV1cJ41157FP?fr...,0,2021-12-26 12:55:09.0,来了！澳大利亚最大军舰驶入火场,,,环球网\n \n,15.9万播放 ·,...,2020-01-07 15:23:34,派出30名消防员把我给整乐了加拿大,做一个受小姐姐欢迎的人,2020-01-07 15:57,2408.0,,可能加拿大也就三十多个消防员呢,2020-01-07 18:52,二次元勋宗,117
3,380468,https://www.bilibili.com/video/BV1cJ41157FP?fr...,https://www.bilibili.com/video/BV1cJ41157FP?fr...,0,2021-12-26 12:55:09.0,来了！澳大利亚最大军舰驶入火场,,,环球网\n \n,15.9万播放 ·,...,2020-01-07 15:23:34,几小时后的新闻：澳大利亚最大军舰被山火吞没,珍珠奶茶李建勋,2020-01-07 15:24,2354.0,,你说我一海军舰艇怎么就被山火吞没了呢，咱也不知道，咱也不敢问,2020-01-07 20:25,正在走程序的阿伟,142
4,380469,https://www.bilibili.com/video/BV1cJ41157FP?fr...,https://www.bilibili.com/video/BV1cJ41157FP?fr...,0,2021-12-26 12:55:09.0,来了！澳大利亚最大军舰驶入火场,,,环球网\n \n,15.9万播放 ·,...,2020-01-07 15:23:34,几小时后的新闻：澳大利亚最大军舰被山火吞没,珍珠奶茶李建勋,2020-01-07 15:24,2354.0,,这是两栖登陆舰，所以上岸了,2020-01-07 21:11,老实芭蕉的陈先生,52


In [80]:
#import random

#df = pd.read_csv('comment_merge.csv')
#c = random.sample(range(1, 126851), 2000)
#c.sort()
#df.iloc[c]
#df.iloc[c].to_csv('./randomsample2.csv', sep = ',')

In [165]:
# 处理全样本中的缺失值
full['comment_content'] = full['comment_content'].fillna('')
full['reply_content'] = full['reply_content'].fillna('')

# 清洗全样本中的评论和回复文本
full['comment_content_parsed'] = full.comment_content.apply(tokenize)
full['reply_content_parsed'] = full.reply_content.apply(tokenize)

In [166]:
# 保存清洗和分词之后的全样本
full.to_csv('comment_merge_parsed.csv', index = None, encoding = 'utf-8')

In [134]:
full = pd.read_csv('comment_merge_parsed.csv')
full.head()

Unnamed: 0,recordid,<fullpath>,<realpath>,<pageno>,<createdate>,videoname,一级分类,二级分类,up主,播放数,...,评论作者,评论时间,评论点赞数,来自客户端,reply_content,回复时间,回复作者,点赞数,comment_content_parsed,reply_content_parsed
0,380465,https://www.bilibili.com/video/BV1cJ41157FP?fr...,https://www.bilibili.com/video/BV1cJ41157FP?fr...,0,2021-12-26 12:55:09.0,来了！澳大利亚最大军舰驶入火场,,,环球网\n \n,15.9万播放 ·,...,做一个受小姐姐欢迎的人,2020-01-07 15:57,2408.0,,塑料英联邦情,2020-01-07 19:48,土拨鼠暴打藏狐,140,派出 30 消防员 整乐 加拿大,塑料 英联邦
1,380466,https://www.bilibili.com/video/BV1cJ41157FP?fr...,https://www.bilibili.com/video/BV1cJ41157FP?fr...,0,2021-12-26 12:55:09.0,来了！澳大利亚最大军舰驶入火场,,,环球网\n \n,15.9万播放 ·,...,做一个受小姐姐欢迎的人,2020-01-07 15:57,2408.0,,说不定是派了30名公款旅游的大爷来的。,2020-01-07 20:55,伪君子-真小人,129,派出 30 消防员 整乐 加拿大,说不定 30 公款 旅游 大爷
2,380467,https://www.bilibili.com/video/BV1cJ41157FP?fr...,https://www.bilibili.com/video/BV1cJ41157FP?fr...,0,2021-12-26 12:55:09.0,来了！澳大利亚最大军舰驶入火场,,,环球网\n \n,15.9万播放 ·,...,做一个受小姐姐欢迎的人,2020-01-07 15:57,2408.0,,可能加拿大也就三十多个消防员呢,2020-01-07 18:52,二次元勋宗,117,派出 30 消防员 整乐 加拿大,加拿大 三十多个 消防员
3,380468,https://www.bilibili.com/video/BV1cJ41157FP?fr...,https://www.bilibili.com/video/BV1cJ41157FP?fr...,0,2021-12-26 12:55:09.0,来了！澳大利亚最大军舰驶入火场,,,环球网\n \n,15.9万播放 ·,...,珍珠奶茶李建勋,2020-01-07 15:24,2354.0,,你说我一海军舰艇怎么就被山火吞没了呢，咱也不知道，咱也不敢问,2020-01-07 20:25,正在走程序的阿伟,142,几小时 新闻 澳大利亚 军舰 山火 吞没,海军 舰艇 山火 吞没
4,380469,https://www.bilibili.com/video/BV1cJ41157FP?fr...,https://www.bilibili.com/video/BV1cJ41157FP?fr...,0,2021-12-26 12:55:09.0,来了！澳大利亚最大军舰驶入火场,,,环球网\n \n,15.9万播放 ·,...,珍珠奶茶李建勋,2020-01-07 15:24,2354.0,,这是两栖登陆舰，所以上岸了,2020-01-07 21:11,老实芭蕉的陈先生,52,几小时 新闻 澳大利亚 军舰 山火 吞没,两栖 登陆舰 上岸


In [167]:
# 处理清洗和分词之后的全样本文本缺失值并向量化
full['comment_content_parsed'] = full.comment_content_parsed.fillna('')
full['reply_content_parsed'] = full.reply_content_parsed.fillna('')

dtm_full_1 = cv.transform(full['comment_content_parsed'])
dtm_full_2 = cv.transform(full['reply_content_parsed'])

In [171]:
model = OneVsRestClassifier(LogisticRegression(), n_jobs=2)

# 学习并预测comment attitude
y = codesample.attitude
model.fit(X, y)
full['predict_comment_attitude'] = model.predict(dtm_full_1)
for model_ in [RandomForestClassifier(), LogisticRegression(), SVC(), BernoulliNB()]:
    print(cross_val_score(OneVsRestClassifier(model_), X, y))

[0.49691358 0.49691358 0.44444444 0.48611111 0.54095827]
[0.53395062 0.50154321 0.44907407 0.50617284 0.53323029]
[0.5        0.49074074 0.45679012 0.48302469 0.51777434]
[0.45833333 0.42746914 0.41820988 0.46296296 0.4374034 ]


In [172]:
model = OneVsRestClassifier(LogisticRegression(), n_jobs=2)

# 学习并预测comment attitude
y = codesample.attitude
model.fit(X, y)
full['predict_reply_attitude'] = model.predict(dtm_full_2)
for model_ in [RandomForestClassifier(), LogisticRegression(), SVC(), BernoulliNB()]:
    print(cross_val_score(OneVsRestClassifier(model_), X, y))

[0.49228395 0.4845679  0.45216049 0.47685185 0.5301391 ]
[0.53395062 0.50154321 0.44907407 0.50617284 0.53323029]
[0.5        0.49074074 0.45679012 0.48302469 0.51777434]
[0.45833333 0.42746914 0.41820988 0.46296296 0.4374034 ]


In [173]:
# 保存全样本预测结果
full.to_csv('comment_merge_predict9.csv', index = None, encoding = 'gb18030')

In [14]:
# 学习并预测态度a2:sceptics
y = codesample.a2
model.fit(X, y)
full['predict_comment_a2'] = model.predict(dtm_full_1)
for model_ in [RandomForestClassifier(), LogisticRegression(), SVC(), BernoulliNB()]:
    print(cross_val_score(OneVsRestClassifier(model_)), X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TypeError: cross_val_score() missing 1 required positional argument: 'X'

In [52]:
# 学习并预测态度a3:neutral
y = codesample.a3
model.fit(X, y)
full['predict_comment_a3'] = model.predict(dtm_full_1)
for model_ in [RandomForestClassifier(), LogisticRegression(), SVC(), BernoulliNB()]:
    print(cross_val_score(OneVsRestClassifier(model_), X, y))

[0.82833333 0.815      0.82166667 0.79833333 0.83      ]
[0.83666667 0.82       0.82       0.80666667 0.83666667]
[0.84166667 0.83833333 0.83666667 0.83666667 0.83833333]
[0.80333333 0.79166667 0.79833333 0.77166667 0.80333333]


In [53]:
# 学习并预测态度a4:none
y = codesample.a4
model.fit(X, y)
full['predict_comment_a4'] = model.predict(dtm_full_1)
for model_ in [RandomForestClassifier(), LogisticRegression(), SVC(), BernoulliNB()]:
    print(cross_val_score(OneVsRestClassifier(model_), X, y))

[0.78833333 0.74833333 0.73333333 0.73       0.725     ]
[0.79166667 0.77833333 0.72833333 0.735      0.745     ]
[0.69833333 0.71       0.69666667 0.69333333 0.70833333]
[0.73       0.725      0.71       0.715      0.71833333]


In [54]:
# 学习并预测reply态度a1:activist
y = codesample.a1
model.fit(X, y)
full['predict_reply_a1'] = model.predict(dtm_full_2)
for model_ in [RandomForestClassifier(), LogisticRegression(), SVC(), BernoulliNB()]:
    print(cross_val_score(OneVsRestClassifier(model_), X, y))

[0.76833333 0.68333333 0.75333333 0.69166667 0.75      ]
[0.77833333 0.73166667 0.76833333 0.7        0.78833333]
[0.74666667 0.72666667 0.73333333 0.72666667 0.74166667]
[0.735      0.68       0.72333333 0.69333333 0.73166667]


In [55]:
# 学习并预测reply态度a2:sceptic
y = codesample.a2
model.fit(X, y)
full['predict_reply_a2'] = model.predict(dtm_full_2)
for model_ in [RandomForestClassifier(), LogisticRegression(), SVC(), BernoulliNB()]:
    print(cross_val_score(OneVsRestClassifier(model_), X, y))



[0.74333333 0.795      0.73833333 0.75666667 0.78333333]




[0.755      0.78333333 0.765      0.755      0.78666667]




[0.76       0.76166667 0.755      0.755      0.76      ]
[0.70833333 0.71333333 0.72       0.70666667 0.72166667]




In [56]:
# 学习并预测reply态度a3:neutral
y = codesample.a3
model.fit(X, y)
full['predict_reply_a3'] = model.predict(dtm_full_2)
for model_ in [RandomForestClassifier(), LogisticRegression(), SVC(), BernoulliNB()]:
    print(cross_val_score(OneVsRestClassifier(model_), X, y))

[0.82666667 0.81833333 0.81333333 0.795      0.835     ]
[0.83666667 0.82       0.82       0.80666667 0.83666667]
[0.84166667 0.83833333 0.83666667 0.83666667 0.83833333]
[0.80333333 0.79166667 0.79833333 0.77166667 0.80333333]


In [57]:
# 学习并预测reply态度a4:none
y = codesample.a4
model.fit(X, y)
full['predict_reply_a4'] = model.predict(dtm_full_2)
for model_ in [RandomForestClassifier(), LogisticRegression(), SVC(), BernoulliNB()]:
    print(cross_val_score(OneVsRestClassifier(model_), X, y))

[0.795      0.75166667 0.73833333 0.73166667 0.72666667]
[0.79166667 0.77833333 0.72833333 0.735      0.745     ]
[0.69833333 0.71       0.69666667 0.69333333 0.70833333]
[0.73       0.725      0.71       0.715      0.71833333]


In [58]:
# 保存全样本预测结果
full.to_csv('comment_merge_predict2.csv', index = None, encoding = 'gb18030')