# 道德词典

In [1]:
import pandas as pd     # 数据表
import numpy as np     # 数组
import jieba     # 中文分词
import moralstrength     # 英文道德词典
from moralstrength import lexicon_use
from moralstrength.moralstrength import estimate_morals
import cmfd     # 中文道德词典

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/kk/ylyfvmrj6zv853wrvp3s_0180000gn/T/jieba.cache
Loading model cost 0.321 seconds.
Prefix dict has been built successfully.


## 上课案例

In [2]:
df_eng = pd.read_excel('text_analysis_twitter_sample.xlsx', index_col = 0)

In [7]:
df_eng.head()

Unnamed: 0,index,id,screen_name,time,link,text,source
0,49374,890587249372524544,auctnr1,2017-07-27T10:58:41-04:00,https://www.twitter.com/Reuters/statuses/89058...,"RT @Reuters MORE: Top U.S. general says, given...",Twitter for iPhone
1,83246,899354463055618048,SenatorTester,2017-08-20T15:36:27-04:00,https://www.twitter.com/SenatorTester/statuses...,T-minus 2 days until our first-ever Last Best ...,Twitter Web Client
2,100988,903272105738985472,KeithRothfus,2017-08-31T11:03:46-04:00,https://www.twitter.com/KeithRothfus/statuses/...,Please know that help is available. Visit http...,Twitter Web Client
3,193395,921001114409021440,HASCRepublicans,2017-10-19T09:12:31-04:00,https://www.twitter.com/HASCRepublicans/status...,Literally flying the wings off the A-10 in fig...,Twitter Web Client
4,12662,884911451449774080,SteveKnight25,2017-07-11T19:05:05-04:00,https://www.twitter.com/SteveKnight25/statuses...,Today the House unanimously passed my bill #HR...,Twitter Web Client


In [8]:
df_chn = pd.read_excel('text_analysis_ad.xlsx', index_col = 0)

In [9]:
df_chn.head()

Unnamed: 0,index,商品名称,商品类别,出版年,广告文本,广告标题
0,1596,五华牌香烟[May Blossom],烟草制品,1932,兰勃脱白脱勒公司 五华牌香烟 佛及尼埃香烟 二十枝装每包售国币大洋二角 五十枝装每罐售国币...,"五华牌香烟,""兰勃脱白脱勒公司 五华牌香烟 佛及尼埃香烟"""
1,5627,韦廉士红色清导丸[Dr.Willams' Pink Pills For Pale People],药品,1918,讲求卫生为人生本性天理固然也 人生首贵逐日大便通畅有序为天然所当如此也如若大便不利大肠阻塞则...,韦廉士红色清导丸:'讲求卫生为人生本性天理固然也'
2,13532,大炮台香烟[Three Castles Cigarettes],烟草制品,1935,"香味馥郁,不让名花 另有三炮台出售","大炮台香烟,""香味馥郁 不让名花"""
3,1133,婴孩自己药片[Baby's Own],药品,1930,差肩儿女 秀慧康强 闽有佳音讃羡婴孩自己药片 每年此际小儿患肠胃病者甚多而尤以南方各地天气翳...,"婴孩自己药片,""差肩儿女 秀慧康强 闽有佳音赞美婴孩自己药片"""
4,3146,亚士北罗药片[Aspro],药品,1933,何以亚士北罗是妇女们的腻友？各国妇女力证亚士北罗药片是她们最需要的药物！为什么？她们的经验知...,"亚士北罗药片,""妇女之腻友"""


### 英文语料

In [10]:
lexicon_use.select_version('latest')

In [11]:
df_eng_morals = estimate_morals(df_eng['text'].tolist(), process=True)
df_eng = pd.concat([df_eng, df_eng_morals], axis=1)



In [12]:
df_eng.head()

Unnamed: 0,index,id,screen_name,time,link,text,source,care,fairness,loyalty,authority,purity
0,49374,890587249372524544,auctnr1,2017-07-27T10:58:41-04:00,https://www.twitter.com/Reuters/statuses/89058...,"RT @Reuters MORE: Top U.S. general says, given...",Twitter for iPhone,,,,,
1,83246,899354463055618048,SenatorTester,2017-08-20T15:36:27-04:00,https://www.twitter.com/SenatorTester/statuses...,T-minus 2 days until our first-ever Last Best ...,Twitter Web Client,,,,,
2,100988,903272105738985472,KeithRothfus,2017-08-31T11:03:46-04:00,https://www.twitter.com/KeithRothfus/statuses/...,Please know that help is available. Visit http...,Twitter Web Client,,,,,
3,193395,921001114409021440,HASCRepublicans,2017-10-19T09:12:31-04:00,https://www.twitter.com/HASCRepublicans/status...,Literally flying the wings off the A-10 in fig...,Twitter Web Client,4.0,,,,
4,12662,884911451449774080,SteveKnight25,2017-07-11T19:05:05-04:00,https://www.twitter.com/SteveKnight25/statuses...,Today the House unanimously passed my bill #HR...,Twitter Web Client,,,,,


### 中文语料

In [13]:
chn_moral = pd.read_csv(r'https://raw.githubusercontent.com/CivicTechLab/CMFD/main/cmfd_civictech.csv')

In [14]:
chn_moral

Unnamed: 0,chinese,foundation
0,同情,care
1,一臂之力,care
2,一见倾心,care
3,三个代表,care
4,上阵杀敌,care
...,...,...
6133,随和,general
6134,雅正,general
6135,雷打不动,general
6136,马马虎虎,general


In [15]:
moral_dict = chn_moral.groupby('foundation')['chinese'].apply(list).to_dict()

In [16]:
def moral_quantity(text):
    
    if isinstance(text, str):
        moral_word_total = 0
        moral_word = {}
        moral_num = {}

        for key in moral_dict.keys():
            moral_word[key] = []
        for word in jieba.cut(text):
            for key in moral_dict.keys():
                if word in moral_dict[key]:
                    moral_word[key].append(word)

        for key in moral_word.keys():
            moral_word_total += len(moral_word[key])
        if moral_word_total == 0:
            return None

        for key in moral_word.keys():
            moral_num[key] = len(moral_word[key]) / moral_word_total

    return moral_num

In [17]:
text_test = """在一个古老的王国里，有一位年轻的王子，名叫艾伦。他爱上了王国中一个普通的女孩，名叫艾丽丝。然而，王子的父亲，即国王，却不同意这段感情，因为他认为艾丽丝的出身不够高贵，不适合成为王室的一员。

王子面临着艰难的选择，他可以违抗父王的意愿和艾丽丝在一起，但这意味着他要放弃王位和王国的责任。另一方面，他也可以遵从父王的意愿，选择一个地位高贵但他并不爱的女子，这样可以继续承担王国的责任。

在经过深思熟虑后，王子决定选择违抗父王的意愿，选择与艾丽丝在一起。他认为爱情比地位和财富更重要，而且他也相信只有在爱情的支持下，他才能成为一个真正的幸福的国王。

尽管他的决定引起了一些争议和反对，但王子坚持了自己的选择，并与艾丽丝结为夫妻。他们的爱情和幸福成为了王国中的典范，人们开始理解和尊重不同阶层之间的爱情，认识到爱情是超越地位和财富的。"""

In [18]:
moral_quantity(text_test)

{'altr': 0.0,
 'auth': 0.5161290322580645,
 'care': 0.06451612903225806,
 'dili': 0.0,
 'fair': 0.0,
 'general': 0.06451612903225806,
 'libe': 0.0,
 'loya': 0.12903225806451613,
 'mode': 0.03225806451612903,
 'resi': 0.03225806451612903,
 'sanc': 0.16129032258064516,
 'wast': 0.0}

In [19]:
df_chn['chn_moral'] = df_chn['广告文本'].apply(moral_quantity)
df_chn.head()

Unnamed: 0,index,商品名称,商品类别,出版年,广告文本,广告标题,chn_moral
0,1596,五华牌香烟[May Blossom],烟草制品,1932,兰勃脱白脱勒公司 五华牌香烟 佛及尼埃香烟 二十枝装每包售国币大洋二角 五十枝装每罐售国币...,"五华牌香烟,""兰勃脱白脱勒公司 五华牌香烟 佛及尼埃香烟""",
1,5627,韦廉士红色清导丸[Dr.Willams' Pink Pills For Pale People],药品,1918,讲求卫生为人生本性天理固然也 人生首贵逐日大便通畅有序为天然所当如此也如若大便不利大肠阻塞则...,韦廉士红色清导丸:'讲求卫生为人生本性天理固然也',"{'altr': 0.0, 'auth': 0.25, 'care': 0.25, 'dil..."
2,13532,大炮台香烟[Three Castles Cigarettes],烟草制品,1935,"香味馥郁,不让名花 另有三炮台出售","大炮台香烟,""香味馥郁 不让名花""",
3,1133,婴孩自己药片[Baby's Own],药品,1930,差肩儿女 秀慧康强 闽有佳音讃羡婴孩自己药片 每年此际小儿患肠胃病者甚多而尤以南方各地天气翳...,"婴孩自己药片,""差肩儿女 秀慧康强 闽有佳音赞美婴孩自己药片""","{'altr': 0.0, 'auth': 0.2, 'care': 0.2, 'dili'..."
4,3146,亚士北罗药片[Aspro],药品,1933,何以亚士北罗是妇女们的腻友？各国妇女力证亚士北罗药片是她们最需要的药物！为什么？她们的经验知...,"亚士北罗药片,""妇女之腻友""","{'altr': 0.0, 'auth': 0.16666666666666666, 'ca..."


In [20]:
chn_moral_df = pd.DataFrame(columns=['altr', 'auth', 'care', 'dili', 'fair', 'general', 'libe', 'loya', 'mode', 'resi', 'sanc', 'wast'])

for dc in df_chn.index:
    if df_chn['chn_moral'][dc] == None:
        chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
    else:
        chn_moral_df.loc[len(chn_moral_df.index)] = list(df_chn['chn_moral'][dc].values())
        
df_chn = pd.concat([df_chn, chn_moral_df], axis=1)
        
df_chn.head()

Unnamed: 0,index,商品名称,商品类别,出版年,广告文本,广告标题,chn_moral,altr,auth,care,dili,fair,general,libe,loya,mode,resi,sanc,wast
0,1596,五华牌香烟[May Blossom],烟草制品,1932,兰勃脱白脱勒公司 五华牌香烟 佛及尼埃香烟 二十枝装每包售国币大洋二角 五十枝装每罐售国币...,"五华牌香烟,""兰勃脱白脱勒公司 五华牌香烟 佛及尼埃香烟""",,,,,,,,,,,,,
1,5627,韦廉士红色清导丸[Dr.Willams' Pink Pills For Pale People],药品,1918,讲求卫生为人生本性天理固然也 人生首贵逐日大便通畅有序为天然所当如此也如若大便不利大肠阻塞则...,韦廉士红色清导丸:'讲求卫生为人生本性天理固然也',"{'altr': 0.0, 'auth': 0.25, 'care': 0.25, 'dil...",0.0,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
2,13532,大炮台香烟[Three Castles Cigarettes],烟草制品,1935,"香味馥郁,不让名花 另有三炮台出售","大炮台香烟,""香味馥郁 不让名花""",,,,,,,,,,,,,
3,1133,婴孩自己药片[Baby's Own],药品,1930,差肩儿女 秀慧康强 闽有佳音讃羡婴孩自己药片 每年此际小儿患肠胃病者甚多而尤以南方各地天气翳...,"婴孩自己药片,""差肩儿女 秀慧康强 闽有佳音赞美婴孩自己药片""","{'altr': 0.0, 'auth': 0.2, 'care': 0.2, 'dili'...",0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.2,0.0
4,3146,亚士北罗药片[Aspro],药品,1933,何以亚士北罗是妇女们的腻友？各国妇女力证亚士北罗药片是她们最需要的药物！为什么？她们的经验知...,"亚士北罗药片,""妇女之腻友""","{'altr': 0.0, 'auth': 0.16666666666666666, 'ca...",0.0,0.166667,0.166667,0.0,0.055556,0.0,0.0,0.388889,0.0,0.0,0.222222,0.0


In [21]:
df_chn.columns

Index(['index', '商品名称', '商品类别', '出版年', '广告文本', '广告标题', 'chn_moral', 'altr',
       'auth', 'care', 'dili', 'fair', 'general', 'libe', 'loya', 'mode',
       'resi', 'sanc', 'wast'],
      dtype='object')

## 双减教育部

In [22]:
df_1 = pd.read_excel('双减教育部.xlsx', index_col = 0)

In [23]:
df_1.head()

Unnamed: 0_level_0,类型,标题,全文内容,发布时间
序号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,新闻,[中国网]教育部回应“双减”传闻：从严审批培训机构 严控作业总量,中国网北京3月31日讯（记者 徐虹 刘佳）3月31日，国务院新闻办公室就贯彻“十四五”规划，...,2021-03-31
2,新闻,[北京青年报]教育部回应“双减”传闻：切实减轻学生校外培训负担和作业负担,3月31日，国务院新闻办公室举行新闻发布会，介绍深入贯彻“十四五”规划，加快建设高质量教育体...,2021-03-31
3,新闻,[中国教育在线]教育部回应“双减”问题：今年把校外培训机构治理列入重点工作任务,3月31日上午，国务院新闻办举行新闻发布会，介绍深入贯彻“十四五”规划，加快建设高质量教育体...,2021-03-31
4,机构,陈宝生调研北京教育工作,4月9日，中央教育工作领导小组秘书组组长、教育部党组书记、部长陈宝生赴北京市调研教育综合改革...,2021-04-11
5,新闻,孙春兰在安徽调研时强调 推动全国职业教育大会精神落地落实 加快构建现代职业教育体系,新华社合肥4月23日电 中共中央政治局委员、国务院副总理孙春兰22日至23日在安徽调研时强...,2021-04-23


In [24]:
df_1['chn_moral'] = df_1['全文内容'].apply(moral_quantity)
df_1.head()

Unnamed: 0_level_0,类型,标题,全文内容,发布时间,chn_moral
序号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,新闻,[中国网]教育部回应“双减”传闻：从严审批培训机构 严控作业总量,中国网北京3月31日讯（记者 徐虹 刘佳）3月31日，国务院新闻办公室就贯彻“十四五”规划，...,2021-03-31,"{'altr': 0.0, 'auth': 0.5675675675675675, 'car..."
2,新闻,[北京青年报]教育部回应“双减”传闻：切实减轻学生校外培训负担和作业负担,3月31日，国务院新闻办公室举行新闻发布会，介绍深入贯彻“十四五”规划，加快建设高质量教育体...,2021-03-31,"{'altr': 0.0, 'auth': 0.5833333333333334, 'car..."
3,新闻,[中国教育在线]教育部回应“双减”问题：今年把校外培训机构治理列入重点工作任务,3月31日上午，国务院新闻办举行新闻发布会，介绍深入贯彻“十四五”规划，加快建设高质量教育体...,2021-03-31,"{'altr': 0.0, 'auth': 0.5555555555555556, 'car..."
4,机构,陈宝生调研北京教育工作,4月9日，中央教育工作领导小组秘书组组长、教育部党组书记、部长陈宝生赴北京市调研教育综合改革...,2021-04-11,"{'altr': 0.0, 'auth': 0.6666666666666666, 'car..."
5,新闻,孙春兰在安徽调研时强调 推动全国职业教育大会精神落地落实 加快构建现代职业教育体系,新华社合肥4月23日电 中共中央政治局委员、国务院副总理孙春兰22日至23日在安徽调研时强...,2021-04-23,"{'altr': 0.0, 'auth': 0.4, 'care': 0.1, 'dili'..."


In [25]:
chn_moral_df = pd.DataFrame(columns=['altr', 'auth', 'care', 'dili', 'fair', 'general', 'libe', 'loya', 'mode', 'resi', 'sanc', 'wast'])

for dc in df_chn.index:
    if df_chn['chn_moral'][dc] == None:
        chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
    else:
        chn_moral_df.loc[len(chn_moral_df.index)] = list(df_chn['chn_moral'][dc].values())
        
df_1 = pd.concat([df_1, chn_moral_df], axis=1)
        
df_1.head()

Unnamed: 0,类型,标题,全文内容,发布时间,chn_moral,altr,auth,care,dili,fair,general,libe,loya,mode,resi,sanc,wast
1,新闻,[中国网]教育部回应“双减”传闻：从严审批培训机构 严控作业总量,中国网北京3月31日讯（记者 徐虹 刘佳）3月31日，国务院新闻办公室就贯彻“十四五”规划，...,2021-03-31,"{'altr': 0.0, 'auth': 0.5675675675675675, 'car...",0.0,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
2,新闻,[北京青年报]教育部回应“双减”传闻：切实减轻学生校外培训负担和作业负担,3月31日，国务院新闻办公室举行新闻发布会，介绍深入贯彻“十四五”规划，加快建设高质量教育体...,2021-03-31,"{'altr': 0.0, 'auth': 0.5833333333333334, 'car...",,,,,,,,,,,,
3,新闻,[中国教育在线]教育部回应“双减”问题：今年把校外培训机构治理列入重点工作任务,3月31日上午，国务院新闻办举行新闻发布会，介绍深入贯彻“十四五”规划，加快建设高质量教育体...,2021-03-31,"{'altr': 0.0, 'auth': 0.5555555555555556, 'car...",0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.2,0.0
4,机构,陈宝生调研北京教育工作,4月9日，中央教育工作领导小组秘书组组长、教育部党组书记、部长陈宝生赴北京市调研教育综合改革...,2021-04-11,"{'altr': 0.0, 'auth': 0.6666666666666666, 'car...",0.0,0.166667,0.166667,0.0,0.055556,0.0,0.0,0.388889,0.0,0.0,0.222222,0.0
5,新闻,孙春兰在安徽调研时强调 推动全国职业教育大会精神落地落实 加快构建现代职业教育体系,新华社合肥4月23日电 中共中央政治局委员、国务院副总理孙春兰22日至23日在安徽调研时强...,2021-04-23,"{'altr': 0.0, 'auth': 0.4, 'care': 0.1, 'dili'...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
df_1.columns

Index(['类型', '标题', '全文内容', '发布时间', 'chn_moral', 'altr', 'auth', 'care', 'dili',
       'fair', 'general', 'libe', 'loya', 'mode', 'resi', 'sanc', 'wast'],
      dtype='object')

## 双减热门微博

In [27]:
df_2 = pd.read_excel('双减热门微博.xlsx', index_col = 0)

In [28]:
df_2.head()

Unnamed: 0_level_0,博主昵称,微博内容,转发数,评论数,点赞数,发布时间
序号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,学**,#双减政策# 看到一项题为“Academic Stress and Mental healt...,38,3,31,2023年12月31日 12:00
2,金**,我看了一下，历史上新东方股价最高$199，最低$8，现在$71，前几周$85。无论股市还是房...,15,57,378,2023年12月27日 18:42
3,田**,看到了么？双减彻底，就是我们这样的，家长亲自下场，不然一个学期下来，全是窟窿。学校抓紧一点，...,4,51,81,2023年12月26日 12:23
4,兔**,如果一个人在版号限制那年从游戏转行干了教培，结婚买了恒大的期房，再遇到双减，努力大半年凭借好...,10,27,187,2023年12月22日 21:46
5,目**,看到有朋友说，从来没见过跌停的etf，今天游戏etf让人大开眼界。也有朋友把这个“网游新规征...,3,53,289,2023年12月22日 20:48


In [30]:
df_2['chn_moral'] = df_2['微博内容'].apply(moral_quantity)
df_2.head()

Unnamed: 0_level_0,博主昵称,微博内容,转发数,评论数,点赞数,发布时间,chn_moral
序号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,学**,#双减政策# 看到一项题为“Academic Stress and Mental healt...,38,3,31,2023年12月31日 12:00,"{'altr': 0.0, 'auth': 0.5, 'care': 0.5, 'dili'..."
2,金**,我看了一下，历史上新东方股价最高$199，最低$8，现在$71，前几周$85。无论股市还是房...,15,57,378,2023年12月27日 18:42,"{'altr': 0.0, 'auth': 0.0, 'care': 1.0, 'dili'..."
3,田**,看到了么？双减彻底，就是我们这样的，家长亲自下场，不然一个学期下来，全是窟窿。学校抓紧一点，...,4,51,81,2023年12月26日 12:23,"{'altr': 0.0, 'auth': 1.0, 'care': 0.0, 'dili'..."
4,兔**,如果一个人在版号限制那年从游戏转行干了教培，结婚买了恒大的期房，再遇到双减，努力大半年凭借好...,10,27,187,2023年12月22日 21:46,"{'altr': 0.0, 'auth': 0.0, 'care': 0.25, 'dili..."
5,目**,看到有朋友说，从来没见过跌停的etf，今天游戏etf让人大开眼界。也有朋友把这个“网游新规征...,3,53,289,2023年12月22日 20:48,"{'altr': 0.0, 'auth': 0.0, 'care': 0.0, 'dili'..."


In [31]:
chn_moral_df = pd.DataFrame(columns=['altr', 'auth', 'care', 'dili', 'fair', 'general', 'libe', 'loya', 'mode', 'resi', 'sanc', 'wast'])

for dc in df_chn.index:
    if df_chn['chn_moral'][dc] == None:
        chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
    else:
        chn_moral_df.loc[len(chn_moral_df.index)] = list(df_chn['chn_moral'][dc].values())
        
df_2 = pd.concat([df_2, chn_moral_df], axis=1)
        
df_2.head()

Unnamed: 0,博主昵称,微博内容,转发数,评论数,点赞数,发布时间,chn_moral,altr,auth,care,dili,fair,general,libe,loya,mode,resi,sanc,wast
1,学**,#双减政策# 看到一项题为“Academic Stress and Mental healt...,38.0,3.0,31.0,2023年12月31日 12:00,"{'altr': 0.0, 'auth': 0.5, 'care': 0.5, 'dili'...",0.0,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
2,金**,我看了一下，历史上新东方股价最高$199，最低$8，现在$71，前几周$85。无论股市还是房...,15.0,57.0,378.0,2023年12月27日 18:42,"{'altr': 0.0, 'auth': 0.0, 'care': 1.0, 'dili'...",,,,,,,,,,,,
3,田**,看到了么？双减彻底，就是我们这样的，家长亲自下场，不然一个学期下来，全是窟窿。学校抓紧一点，...,4.0,51.0,81.0,2023年12月26日 12:23,"{'altr': 0.0, 'auth': 1.0, 'care': 0.0, 'dili'...",0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.2,0.0
4,兔**,如果一个人在版号限制那年从游戏转行干了教培，结婚买了恒大的期房，再遇到双减，努力大半年凭借好...,10.0,27.0,187.0,2023年12月22日 21:46,"{'altr': 0.0, 'auth': 0.0, 'care': 0.25, 'dili...",0.0,0.166667,0.166667,0.0,0.055556,0.0,0.0,0.388889,0.0,0.0,0.222222,0.0
5,目**,看到有朋友说，从来没见过跌停的etf，今天游戏etf让人大开眼界。也有朋友把这个“网游新规征...,3.0,53.0,289.0,2023年12月22日 20:48,"{'altr': 0.0, 'auth': 0.0, 'care': 0.0, 'dili'...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
df_2.columns

Index(['博主昵称', '微博内容', '转发数', '评论数', '点赞数', '发布时间', 'chn_moral', 'altr',
       'auth', 'care', 'dili', 'fair', 'general', 'libe', 'loya', 'mode',
       'resi', 'sanc', 'wast'],
      dtype='object')

## 双减热门微博评论

In [33]:
df_3 = pd.read_excel('双减热门微博评论.xlsx', index_col = 0)

In [34]:
df_3.head()

Unnamed: 0_level_0,用户昵称,评论,发布时间,发布地区
序号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,姜姜是个小叮当,现在链接可以下载啦！,2024-05-20,山东
2,用户7406707968,老师，为什么买来您的结构化视频课程，看不了啊？ 客服也联系不上 怎么处理呢,2024-05-20,贵州
3,Jemimaa_,谢谢姜姜,2024-05-21,贵州
4,LLLLLLL_YAN_Q,过期了,2024-05-28,广东
5,桐桐李凤桐,过期啦~求补,2024-05-28,黑龙江


In [37]:
df_3['chn_moral'] = df_3['评论'].apply(moral_quantity)
df_3.head()

Unnamed: 0_level_0,用户昵称,评论,发布时间,发布地区,chn_moral
序号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,姜姜是个小叮当,现在链接可以下载啦！,2024-05-20,山东,
2,用户7406707968,老师，为什么买来您的结构化视频课程，看不了啊？ 客服也联系不上 怎么处理呢,2024-05-20,贵州,
3,Jemimaa_,谢谢姜姜,2024-05-21,贵州,
4,LLLLLLL_YAN_Q,过期了,2024-05-28,广东,
5,桐桐李凤桐,过期啦~求补,2024-05-28,黑龙江,


In [38]:
chn_moral_df = pd.DataFrame(columns=['altr', 'auth', 'care', 'dili', 'fair', 'general', 'libe', 'loya', 'mode', 'resi', 'sanc', 'wast'])

for dc in df_chn.index:
    if df_chn['chn_moral'][dc] == None:
        chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
    else:
        chn_moral_df.loc[len(chn_moral_df.index)] = list(df_chn['chn_moral'][dc].values())
        
df_3 = pd.concat([df_3, chn_moral_df], axis=1)
        
df_3.head()

Unnamed: 0,用户昵称,评论,发布时间,发布地区,chn_moral,altr,auth,care,dili,fair,general,libe,loya,mode,resi,sanc,wast
1,姜姜是个小叮当,现在链接可以下载啦！,2024-05-20,山东,,0.0,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
2,用户7406707968,老师，为什么买来您的结构化视频课程，看不了啊？ 客服也联系不上 怎么处理呢,2024-05-20,贵州,,,,,,,,,,,,,
3,Jemimaa_,谢谢姜姜,2024-05-21,贵州,,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.2,0.0
4,LLLLLLL_YAN_Q,过期了,2024-05-28,广东,,0.0,0.166667,0.166667,0.0,0.055556,0.0,0.0,0.388889,0.0,0.0,0.222222,0.0
5,桐桐李凤桐,过期啦~求补,2024-05-28,黑龙江,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
df_3.columns

Index(['用户昵称', '评论', '发布时间', '发布地区', 'chn_moral', 'altr', 'auth', 'care',
       'dili', 'fair', 'general', 'libe', 'loya', 'mode', 'resi', 'sanc',
       'wast'],
      dtype='object')

# 情感词典

In [2]:
import pandas as pd     # 数据表
import jieba     # 中文分词

## 上课案例

In [3]:
df_eng = pd.read_excel('text_analysis_twitter_sample.xlsx', index_col = 0)

In [4]:
df_eng.head()

Unnamed: 0,index,id,screen_name,time,link,text,source
0,49374,890587249372524544,auctnr1,2017-07-27T10:58:41-04:00,https://www.twitter.com/Reuters/statuses/89058...,"RT @Reuters MORE: Top U.S. general says, given...",Twitter for iPhone
1,83246,899354463055618048,SenatorTester,2017-08-20T15:36:27-04:00,https://www.twitter.com/SenatorTester/statuses...,T-minus 2 days until our first-ever Last Best ...,Twitter Web Client
2,100988,903272105738985472,KeithRothfus,2017-08-31T11:03:46-04:00,https://www.twitter.com/KeithRothfus/statuses/...,Please know that help is available. Visit http...,Twitter Web Client
3,193395,921001114409021440,HASCRepublicans,2017-10-19T09:12:31-04:00,https://www.twitter.com/HASCRepublicans/status...,Literally flying the wings off the A-10 in fig...,Twitter Web Client
4,12662,884911451449774080,SteveKnight25,2017-07-11T19:05:05-04:00,https://www.twitter.com/SteveKnight25/statuses...,Today the House unanimously passed my bill #HR...,Twitter Web Client


In [5]:
df_chn = pd.read_excel('text_analysis_ad.xlsx', index_col = 0)

In [6]:
df_chn.head()

Unnamed: 0,index,商品名称,商品类别,出版年,广告文本,广告标题
0,1596,五华牌香烟[May Blossom],烟草制品,1932,兰勃脱白脱勒公司 五华牌香烟 佛及尼埃香烟 二十枝装每包售国币大洋二角 五十枝装每罐售国币...,"五华牌香烟,""兰勃脱白脱勒公司 五华牌香烟 佛及尼埃香烟"""
1,5627,韦廉士红色清导丸[Dr.Willams' Pink Pills For Pale People],药品,1918,讲求卫生为人生本性天理固然也 人生首贵逐日大便通畅有序为天然所当如此也如若大便不利大肠阻塞则...,韦廉士红色清导丸:'讲求卫生为人生本性天理固然也'
2,13532,大炮台香烟[Three Castles Cigarettes],烟草制品,1935,"香味馥郁,不让名花 另有三炮台出售","大炮台香烟,""香味馥郁 不让名花"""
3,1133,婴孩自己药片[Baby's Own],药品,1930,差肩儿女 秀慧康强 闽有佳音讃羡婴孩自己药片 每年此际小儿患肠胃病者甚多而尤以南方各地天气翳...,"婴孩自己药片,""差肩儿女 秀慧康强 闽有佳音赞美婴孩自己药片"""
4,3146,亚士北罗药片[Aspro],药品,1933,何以亚士北罗是妇女们的腻友？各国妇女力证亚士北罗药片是她们最需要的药物！为什么？她们的经验知...,"亚士北罗药片,""妇女之腻友"""


### NRC情感词典（NRC Emotion Lexicon）

In [7]:
nrc = pd.read_excel('NRC-Emotion-Lexicon.xlsx', usecols='A, F, AP:AY')
nrc = nrc.rename(columns={'English Word':'Engword', 'Chinese (simplified) Translation (Google Translate)':'Chnword'})
nrc.head()

  for idx, row in parser.parse():


Unnamed: 0,Engword,Chnword,Positive,Negative,Anger,Anticipation,Disgust,Fear,Joy,Sadness,Surprise,Trust
0,aback,吓了一跳,0,0,0,0,0,0,0,0,0,0
1,abacus,算盘,0,0,0,0,0,0,0,0,0,1
2,abandon,放弃,0,1,0,0,0,1,0,1,0,0
3,abandoned,弃,0,1,1,0,0,1,0,1,0,0
4,abandonment,放弃,0,1,1,0,0,1,0,1,1,0


#### 英文语料

In [8]:
Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust = [], [], [], [], [], [], [], [], [], []

for idx, row in nrc.iterrows():
    if row['Positive'] == 1:
        Positive.append(row['Engword'])
    if row['Negative'] == 1:
        Negative.append(row['Engword'])
    if row['Anger'] == 1:
        Anger.append(row['Engword'])
    if row['Anticipation'] == 1:
        Anticipation.append(row['Engword'])
    if row['Disgust'] == 1:
        Disgust.append(row['Engword'])
    if row['Fear'] == 1:
        Fear.append(row['Engword'])
    if row['Joy'] == 1:
        Joy.append(row['Engword'])
    if row['Sadness'] == 1:
        Sadness.append(row['Engword'])
    if row['Surprise'] == 1:
        Surprise.append(row['Engword'])
    if row['Trust'] == 1:
        Trust.append(row['Engword'])

In [9]:
emo_nrc_eng = pd.DataFrame(columns=['length_nrc', 'positive_nrc', 'negative_nrc',
                                    'anger_nrc', 'anticipation_nrc', 'disgust_nrc', 'fear_nrc',
                                    'joy_nrc', 'sadness_nrc', 'surprise_nrc', 'trust_nrc'])

for de in df_eng.index:
    positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, trust = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    text = df_eng['text'][de].lower()
    wordlist = text.split()
    wordset = set(wordlist)
    wordfreq = []
    
    for word in wordset:
        freq = wordlist.count(word)
        if word in Positive:
            positive += freq
        if word in Negative:
            negative += freq
        if word in Anger:
            anger += freq
        if word in Anticipation:
            anticipation += freq
        if word in Disgust:
            disgust += freq
        if word in Fear:
            fear += freq
        if word in Joy:
            joy += freq
        if word in Sadness:
            sadness += freq
        if word in Surprise:
            surprise += freq
        if word in Trust:
            trust += freq
            
    emotion_info = {
        'length_nrc': len(wordlist),
        'positive_nrc': positive,
        'negative_nrc': negative,
        'anger_nrc': anger,
        'anticipation_nrc': anticipation,
        'disgust_nrc': disgust,
        'fear_nrc': fear,
        'joy_nrc': joy,
        'sadness_nrc': sadness,
        'surprise_nrc': surprise,
        'trust_nrc': trust
    }
    
    emo_info = pd.DataFrame([emotion_info])
    emo_nrc_eng = pd.concat([emo_nrc_eng, emo_info], ignore_index=True)
    
emo_nrc_eng.head()

Unnamed: 0,length_nrc,positive_nrc,negative_nrc,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,22,2,0,0,1,0,0,0,0,0,2
1,18,1,0,0,1,0,0,1,0,1,1
2,16,1,0,0,0,0,0,0,0,0,0
3,22,1,1,1,0,0,2,0,0,0,0
4,20,0,1,1,0,1,1,0,1,1,1


In [10]:
df_eng = pd.concat([df_eng, emo_nrc_eng], axis=1)
df_eng.head()

Unnamed: 0,index,id,screen_name,time,link,text,source,length_nrc,positive_nrc,negative_nrc,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,49374,890587249372524544,auctnr1,2017-07-27T10:58:41-04:00,https://www.twitter.com/Reuters/statuses/89058...,"RT @Reuters MORE: Top U.S. general says, given...",Twitter for iPhone,22,2,0,0,1,0,0,0,0,0,2
1,83246,899354463055618048,SenatorTester,2017-08-20T15:36:27-04:00,https://www.twitter.com/SenatorTester/statuses...,T-minus 2 days until our first-ever Last Best ...,Twitter Web Client,18,1,0,0,1,0,0,1,0,1,1
2,100988,903272105738985472,KeithRothfus,2017-08-31T11:03:46-04:00,https://www.twitter.com/KeithRothfus/statuses/...,Please know that help is available. Visit http...,Twitter Web Client,16,1,0,0,0,0,0,0,0,0,0
3,193395,921001114409021440,HASCRepublicans,2017-10-19T09:12:31-04:00,https://www.twitter.com/HASCRepublicans/status...,Literally flying the wings off the A-10 in fig...,Twitter Web Client,22,1,1,1,0,0,2,0,0,0,0
4,12662,884911451449774080,SteveKnight25,2017-07-11T19:05:05-04:00,https://www.twitter.com/SteveKnight25/statuses...,Today the House unanimously passed my bill #HR...,Twitter Web Client,20,0,1,1,0,1,1,0,1,1,1


#### 中文语料

In [11]:
Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust = [], [], [], [], [], [], [], [], [], []

for idx, row in nrc.iterrows():
    if row['Positive'] == 1:
        Positive.append(row['Chnword'])
    if row['Negative'] == 1:
        Negative.append(row['Chnword'])
    if row['Anger'] == 1:
        Anger.append(row['Chnword'])
    if row['Anticipation'] == 1:
        Anticipation.append(row['Chnword'])
    if row['Disgust'] == 1:
        Disgust.append(row['Chnword'])
    if row['Fear'] == 1:
        Fear.append(row['Chnword'])
    if row['Joy'] == 1:
        Joy.append(row['Chnword'])
    if row['Sadness'] == 1:
        Sadness.append(row['Chnword'])
    if row['Surprise'] == 1:
        Surprise.append(row['Chnword'])
    if row['Trust'] == 1:
        Trust.append(row['Chnword'])

In [12]:
emo_nrc_chn = pd.DataFrame(columns=['length_nrc', 'positive_nrc', 'negative_nrc',
                                    'anger_nrc', 'anticipation_nrc', 'disgust_nrc', 'fear_nrc',
                                    'joy_nrc', 'sadness_nrc', 'surprise_nrc', 'trust_nrc'])

for dc in df_chn.index:
    positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, trust = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    wordlist = list(jieba.cut(df_chn['广告文本'][dc]))
    wordset = set(wordlist)
    wordfreq = []
    
    for word in wordset:
        freq = wordlist.count(word)
        if word in Positive:
            positive += freq
        if word in Negative:
            negative += freq
        if word in Anger:
            anger += freq
        if word in Anticipation:
            anticipation += freq
        if word in Disgust:
            disgust += freq
        if word in Fear:
            fear += freq
        if word in Joy:
            joy += freq
        if word in Sadness:
            sadness += freq
        if word in Surprise:
            surprise += freq
        if word in Trust:
            trust += freq
            
    emotion_info = {
        'length_nrc': len(wordlist),
        'positive_nrc': positive,
        'negative_nrc': negative,
        'anger_nrc': anger,
        'anticipation_nrc': anticipation,
        'disgust_nrc': disgust,
        'fear_nrc': fear,
        'joy_nrc': joy,
        'sadness_nrc': sadness,
        'surprise_nrc': surprise,
        'trust_nrc': trust
    }
    
    emo_info = pd.DataFrame([emotion_info])
    emo_nrc_chn = pd.concat([emo_nrc_chn, emo_info], ignore_index=True)

emo_nrc_chn.head()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/kk/ylyfvmrj6zv853wrvp3s_0180000gn/T/jieba.cache
Loading model cost 0.286 seconds.
Prefix dict has been built successfully.


Unnamed: 0,length_nrc,positive_nrc,negative_nrc,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,30,1,2,0,0,0,0,0,0,0,1
1,129,7,11,1,0,8,2,0,3,0,4
2,11,0,0,0,0,0,0,0,0,0,0
3,155,7,7,2,3,4,3,2,1,0,4
4,433,19,14,4,9,4,4,9,8,4,13


In [13]:
df_chn = pd.concat([df_chn, emo_nrc_chn], axis=1)
df_chn.head()

Unnamed: 0,index,商品名称,商品类别,出版年,广告文本,广告标题,length_nrc,positive_nrc,negative_nrc,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,1596,五华牌香烟[May Blossom],烟草制品,1932,兰勃脱白脱勒公司 五华牌香烟 佛及尼埃香烟 二十枝装每包售国币大洋二角 五十枝装每罐售国币...,"五华牌香烟,""兰勃脱白脱勒公司 五华牌香烟 佛及尼埃香烟""",30,1,2,0,0,0,0,0,0,0,1
1,5627,韦廉士红色清导丸[Dr.Willams' Pink Pills For Pale People],药品,1918,讲求卫生为人生本性天理固然也 人生首贵逐日大便通畅有序为天然所当如此也如若大便不利大肠阻塞则...,韦廉士红色清导丸:'讲求卫生为人生本性天理固然也',129,7,11,1,0,8,2,0,3,0,4
2,13532,大炮台香烟[Three Castles Cigarettes],烟草制品,1935,"香味馥郁,不让名花 另有三炮台出售","大炮台香烟,""香味馥郁 不让名花""",11,0,0,0,0,0,0,0,0,0,0
3,1133,婴孩自己药片[Baby's Own],药品,1930,差肩儿女 秀慧康强 闽有佳音讃羡婴孩自己药片 每年此际小儿患肠胃病者甚多而尤以南方各地天气翳...,"婴孩自己药片,""差肩儿女 秀慧康强 闽有佳音赞美婴孩自己药片""",155,7,7,2,3,4,3,2,1,0,4
4,3146,亚士北罗药片[Aspro],药品,1933,何以亚士北罗是妇女们的腻友？各国妇女力证亚士北罗药片是她们最需要的药物！为什么？她们的经验知...,"亚士北罗药片,""妇女之腻友""",433,19,14,4,9,4,4,9,8,4,13


### 大连理工大学情感词典（DLUT Emotion Lexicon）

In [14]:
dlut = pd.read_excel('DLUT-Emotion-Lexicon.xlsx', usecols=['词语', '词性种类', '情感分类', '强度'])
dlut.head()

Unnamed: 0,词语,词性种类,情感分类,强度
0,脏乱,adj,NN,7
1,糟报,adj,NN,5
2,早衰,adj,NE,5
3,责备,verb,NN,5
4,贼眼,noun,NN,5


In [15]:
# 整理情感词典
Happy, Good, Surprise, Anger, Sad, Fear, Disgust  = [], [], [], [], [], [], []

for idx, row in dlut.iterrows():
    if row['情感分类'] in ['PA', 'PE']:
        Happy.append(row['词语'])
    if row['情感分类'] in ['PD', 'PH', 'PG', 'PB', 'PK']:
        Good.append(row['词语']) 
    if row['情感分类'] in ['PC']:
        Surprise.append(row['词语'])     
    if row['情感分类'] in ['NA']:
        Anger.append(row['词语'])    
    if row['情感分类'] in ['NB', 'NJ', 'NH', 'PF']:
        Sad.append(row['词语'])
    if row['情感分类'] in ['NI', 'NC', 'NG']:
        Fear.append(row['词语'])
    if row['情感分类'] in ['NE', 'ND', 'NN', 'NK', 'NL']:
        Disgust.append(row['词语'])
Positive = Happy + Good + Surprise
Negative = Anger + Sad + Fear + Disgust

In [16]:
emo_dlut = pd.DataFrame(columns=['length_dlut', 'positive_dlut', 'negative_dlut',
                                'anger_dlut', 'disgust_dlut', 'fear_dlut', 'good_dlut',
                                'sadness_dlut', 'surprise_dlut', 'happy_dlut'])

for dc in df_chn.index:
    positive, negative, anger, disgust, fear, sad, surprise, good, happy = 0, 0, 0, 0, 0, 0, 0, 0, 0
    wordlist = list(jieba.cut(df_chn['广告文本'][dc]))
    wordset = set(wordlist)
    wordfreq = []
    for word in wordset:
        freq = wordlist.count(word)
        if word in Positive:
            positive += freq
        if word in Negative:
            negative += freq
        if word in Anger:
            anger += freq
        if word in Disgust:
            disgust += freq
        if word in Fear:
            fear += freq
        if word in Sad:
            sad += freq
        if word in Surprise:
            surprise += freq
        if word in Good:
            good += freq
        if word in Happy:
            happy += freq
            
    emotion_info = {
        'length_dlut': len(wordlist),
        'positive_dlut': positive,
        'negative_dlut': negative,
        'anger_dlut': anger,
        'disgust_dlut': disgust,
        'fear_dlut': fear,
        'good_dlut': good,
        'sadness_dlut': sad,
        'surprise_dlut': surprise,
        'happy_dlut': happy
    }
    
    emo_info = pd.DataFrame([emotion_info])
    emo_dlut = pd.concat([emo_dlut, emo_info], ignore_index=True)
    
emo_dlut.head()

Unnamed: 0,length_dlut,positive_dlut,negative_dlut,anger_dlut,disgust_dlut,fear_dlut,good_dlut,sadness_dlut,surprise_dlut,happy_dlut
0,30,0,0,0,0,0,0,0,0,0
1,129,9,6,0,3,2,9,1,0,0
2,11,0,0,0,0,0,0,0,0,0
3,155,7,6,0,5,0,7,1,0,0
4,433,34,16,0,14,1,23,1,0,11


In [17]:
df_chn = pd.concat([df_chn, emo_dlut], axis=1)
df_chn.head()

Unnamed: 0,index,商品名称,商品类别,出版年,广告文本,广告标题,length_nrc,positive_nrc,negative_nrc,anger_nrc,...,length_dlut,positive_dlut,negative_dlut,anger_dlut,disgust_dlut,fear_dlut,good_dlut,sadness_dlut,surprise_dlut,happy_dlut
0,1596,五华牌香烟[May Blossom],烟草制品,1932,兰勃脱白脱勒公司 五华牌香烟 佛及尼埃香烟 二十枝装每包售国币大洋二角 五十枝装每罐售国币...,"五华牌香烟,""兰勃脱白脱勒公司 五华牌香烟 佛及尼埃香烟""",30,1,2,0,...,30,0,0,0,0,0,0,0,0,0
1,5627,韦廉士红色清导丸[Dr.Willams' Pink Pills For Pale People],药品,1918,讲求卫生为人生本性天理固然也 人生首贵逐日大便通畅有序为天然所当如此也如若大便不利大肠阻塞则...,韦廉士红色清导丸:'讲求卫生为人生本性天理固然也',129,7,11,1,...,129,9,6,0,3,2,9,1,0,0
2,13532,大炮台香烟[Three Castles Cigarettes],烟草制品,1935,"香味馥郁,不让名花 另有三炮台出售","大炮台香烟,""香味馥郁 不让名花""",11,0,0,0,...,11,0,0,0,0,0,0,0,0,0
3,1133,婴孩自己药片[Baby's Own],药品,1930,差肩儿女 秀慧康强 闽有佳音讃羡婴孩自己药片 每年此际小儿患肠胃病者甚多而尤以南方各地天气翳...,"婴孩自己药片,""差肩儿女 秀慧康强 闽有佳音赞美婴孩自己药片""",155,7,7,2,...,155,7,6,0,5,0,7,1,0,0
4,3146,亚士北罗药片[Aspro],药品,1933,何以亚士北罗是妇女们的腻友？各国妇女力证亚士北罗药片是她们最需要的药物！为什么？她们的经验知...,"亚士北罗药片,""妇女之腻友""",433,19,14,4,...,433,34,16,0,14,1,23,1,0,11


In [18]:
df_chn.columns

Index(['index', '商品名称', '商品类别', '出版年', '广告文本', '广告标题', 'length_nrc',
       'positive_nrc', 'negative_nrc', 'anger_nrc', 'anticipation_nrc',
       'disgust_nrc', 'fear_nrc', 'joy_nrc', 'sadness_nrc', 'surprise_nrc',
       'trust_nrc', 'length_dlut', 'positive_dlut', 'negative_dlut',
       'anger_dlut', 'disgust_dlut', 'fear_dlut', 'good_dlut', 'sadness_dlut',
       'surprise_dlut', 'happy_dlut'],
      dtype='object')

## 双减教育部

In [19]:
df_1 = pd.read_excel('双减教育部.xlsx').astype(str)

In [20]:
df_1.head()

Unnamed: 0,序号,类型,标题,全文内容,发布时间
0,1,新闻,[中国网]教育部回应“双减”传闻：从严审批培训机构 严控作业总量,中国网北京3月31日讯（记者 徐虹 刘佳）3月31日，国务院新闻办公室就贯彻“十四五”规划，...,2021-03-31
1,2,新闻,[北京青年报]教育部回应“双减”传闻：切实减轻学生校外培训负担和作业负担,3月31日，国务院新闻办公室举行新闻发布会，介绍深入贯彻“十四五”规划，加快建设高质量教育体...,2021-03-31
2,3,新闻,[中国教育在线]教育部回应“双减”问题：今年把校外培训机构治理列入重点工作任务,3月31日上午，国务院新闻办举行新闻发布会，介绍深入贯彻“十四五”规划，加快建设高质量教育体...,2021-03-31
3,4,机构,陈宝生调研北京教育工作,4月9日，中央教育工作领导小组秘书组组长、教育部党组书记、部长陈宝生赴北京市调研教育综合改革...,2021-04-11
4,5,新闻,孙春兰在安徽调研时强调 推动全国职业教育大会精神落地落实 加快构建现代职业教育体系,新华社合肥4月23日电 中共中央政治局委员、国务院副总理孙春兰22日至23日在安徽调研时强...,2021-04-23


### NRC情感词典（NRC Emotion Lexicon）

In [21]:
nrc = pd.read_excel('NRC-Emotion-Lexicon.xlsx', usecols='A, F, AP:AY')
nrc = nrc.rename(columns={'English Word':'Engword', 'Chinese (simplified) Translation (Google Translate)':'Chnword'})
nrc.head()

  for idx, row in parser.parse():


Unnamed: 0,Engword,Chnword,Positive,Negative,Anger,Anticipation,Disgust,Fear,Joy,Sadness,Surprise,Trust
0,aback,吓了一跳,0,0,0,0,0,0,0,0,0,0
1,abacus,算盘,0,0,0,0,0,0,0,0,0,1
2,abandon,放弃,0,1,0,0,0,1,0,1,0,0
3,abandoned,弃,0,1,1,0,0,1,0,1,0,0
4,abandonment,放弃,0,1,1,0,0,1,0,1,1,0


In [22]:
Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust = [], [], [], [], [], [], [], [], [], []

for idx, row in nrc.iterrows():
    if row['Positive'] == 1:
        Positive.append(row['Chnword'])
    if row['Negative'] == 1:
        Negative.append(row['Chnword'])
    if row['Anger'] == 1:
        Anger.append(row['Chnword'])
    if row['Anticipation'] == 1:
        Anticipation.append(row['Chnword'])
    if row['Disgust'] == 1:
        Disgust.append(row['Chnword'])
    if row['Fear'] == 1:
        Fear.append(row['Chnword'])
    if row['Joy'] == 1:
        Joy.append(row['Chnword'])
    if row['Sadness'] == 1:
        Sadness.append(row['Chnword'])
    if row['Surprise'] == 1:
        Surprise.append(row['Chnword'])
    if row['Trust'] == 1:
        Trust.append(row['Chnword'])

In [23]:
emo_nrc_1 = pd.DataFrame(columns=['length_nrc', 'positive_nrc', 'negative_nrc',
                                    'anger_nrc', 'anticipation_nrc', 'disgust_nrc', 'fear_nrc',
                                    'joy_nrc', 'sadness_nrc', 'surprise_nrc', 'trust_nrc'])

for dc in df_1.index:
    positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, trust = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    wordlist = list(jieba.cut(df_1['全文内容'][dc]))
    wordset = set(wordlist)
    wordfreq = []
    
    for word in wordset:
        freq = wordlist.count(word)
        if word in Positive:
            positive += freq
        if word in Negative:
            negative += freq
        if word in Anger:
            anger += freq
        if word in Anticipation:
            anticipation += freq
        if word in Disgust:
            disgust += freq
        if word in Fear:
            fear += freq
        if word in Joy:
            joy += freq
        if word in Sadness:
            sadness += freq
        if word in Surprise:
            surprise += freq
        if word in Trust:
            trust += freq
            
    emotion_info = {
        'length_nrc': len(wordlist),
        'positive_nrc': positive,
        'negative_nrc': negative,
        'anger_nrc': anger,
        'anticipation_nrc': anticipation,
        'disgust_nrc': disgust,
        'fear_nrc': fear,
        'joy_nrc': joy,
        'sadness_nrc': sadness,
        'surprise_nrc': surprise,
        'trust_nrc': trust
    }
    
    emo_info = pd.DataFrame([emotion_info])
    emo_nrc_1 = pd.concat([emo_nrc_1, emo_info], ignore_index=True)

emo_nrc_1.head()

Unnamed: 0,length_nrc,positive_nrc,negative_nrc,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,467,53,21,4,14,5,12,13,9,0,35
1,470,51,19,4,14,5,12,13,7,0,34
2,315,40,11,2,14,4,7,12,2,1,27
3,662,112,16,8,20,6,10,18,3,11,41
4,451,88,9,4,16,3,6,13,3,4,33


In [24]:
df_1 = pd.concat([df_1, emo_nrc_1], axis=1)
df_1.head()

Unnamed: 0,序号,类型,标题,全文内容,发布时间,length_nrc,positive_nrc,negative_nrc,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,1,新闻,[中国网]教育部回应“双减”传闻：从严审批培训机构 严控作业总量,中国网北京3月31日讯（记者 徐虹 刘佳）3月31日，国务院新闻办公室就贯彻“十四五”规划，...,2021-03-31,467,53,21,4,14,5,12,13,9,0,35
1,2,新闻,[北京青年报]教育部回应“双减”传闻：切实减轻学生校外培训负担和作业负担,3月31日，国务院新闻办公室举行新闻发布会，介绍深入贯彻“十四五”规划，加快建设高质量教育体...,2021-03-31,470,51,19,4,14,5,12,13,7,0,34
2,3,新闻,[中国教育在线]教育部回应“双减”问题：今年把校外培训机构治理列入重点工作任务,3月31日上午，国务院新闻办举行新闻发布会，介绍深入贯彻“十四五”规划，加快建设高质量教育体...,2021-03-31,315,40,11,2,14,4,7,12,2,1,27
3,4,机构,陈宝生调研北京教育工作,4月9日，中央教育工作领导小组秘书组组长、教育部党组书记、部长陈宝生赴北京市调研教育综合改革...,2021-04-11,662,112,16,8,20,6,10,18,3,11,41
4,5,新闻,孙春兰在安徽调研时强调 推动全国职业教育大会精神落地落实 加快构建现代职业教育体系,新华社合肥4月23日电 中共中央政治局委员、国务院副总理孙春兰22日至23日在安徽调研时强...,2021-04-23,451,88,9,4,16,3,6,13,3,4,33


### 大连理工大学情感词典（DLUT Emotion Lexicon）

In [25]:
dlut = pd.read_excel('DLUT-Emotion-Lexicon.xlsx', usecols=['词语', '词性种类', '情感分类', '强度'])
dlut.head()

Unnamed: 0,词语,词性种类,情感分类,强度
0,脏乱,adj,NN,7
1,糟报,adj,NN,5
2,早衰,adj,NE,5
3,责备,verb,NN,5
4,贼眼,noun,NN,5


In [26]:
# 整理情感词典
Happy, Good, Surprise, Anger, Sad, Fear, Disgust  = [], [], [], [], [], [], []

for idx, row in dlut.iterrows():
    if row['情感分类'] in ['PA', 'PE']:
        Happy.append(row['词语'])
    if row['情感分类'] in ['PD', 'PH', 'PG', 'PB', 'PK']:
        Good.append(row['词语']) 
    if row['情感分类'] in ['PC']:
        Surprise.append(row['词语'])     
    if row['情感分类'] in ['NA']:
        Anger.append(row['词语'])    
    if row['情感分类'] in ['NB', 'NJ', 'NH', 'PF']:
        Sad.append(row['词语'])
    if row['情感分类'] in ['NI', 'NC', 'NG']:
        Fear.append(row['词语'])
    if row['情感分类'] in ['NE', 'ND', 'NN', 'NK', 'NL']:
        Disgust.append(row['词语'])
Positive = Happy + Good + Surprise
Negative = Anger + Sad + Fear + Disgust

In [27]:
emo_dlut_1 = pd.DataFrame(columns=['length_dlut', 'positive_dlut', 'negative_dlut',
                                'anger_dlut', 'disgust_dlut', 'fear_dlut', 'good_dlut',
                                'sadness_dlut', 'surprise_dlut', 'happy_dlut'])

for dc in df_1.index:
    positive, negative, anger, disgust, fear, sad, surprise, good, happy = 0, 0, 0, 0, 0, 0, 0, 0, 0
    wordlist = list(jieba.cut(df_1['全文内容'][dc]))
    wordset = set(wordlist)
    wordfreq = []
    for word in wordset:
        freq = wordlist.count(word)
        if word in Positive:
            positive += freq
        if word in Negative:
            negative += freq
        if word in Anger:
            anger += freq
        if word in Disgust:
            disgust += freq
        if word in Fear:
            fear += freq
        if word in Sad:
            sad += freq
        if word in Surprise:
            surprise += freq
        if word in Good:
            good += freq
        if word in Happy:
            happy += freq
            
    emotion_info = {
        'length_dlut': len(wordlist),
        'positive_dlut': positive,
        'negative_dlut': negative,
        'anger_dlut': anger,
        'disgust_dlut': disgust,
        'fear_dlut': fear,
        'good_dlut': good,
        'sadness_dlut': sad,
        'surprise_dlut': surprise,
        'happy_dlut': happy
    }
    
    emo_info = pd.DataFrame([emotion_info])
    emo_dlut_1 = pd.concat([emo_dlut_1, emo_info], ignore_index=True)
    
emo_dlut_1.head()

Unnamed: 0,length_dlut,positive_dlut,negative_dlut,anger_dlut,disgust_dlut,fear_dlut,good_dlut,sadness_dlut,surprise_dlut,happy_dlut
0,467,31,6,0,2,1,26,3,0,5
1,470,31,5,0,2,1,25,2,0,6
2,315,22,3,0,1,1,22,1,0,0
3,662,82,3,0,2,1,74,0,0,8
4,451,58,2,0,1,1,52,0,0,6


In [29]:
df_1 = pd.concat([df_1, emo_dlut], axis=1)
df_1.head()

Unnamed: 0,序号,类型,标题,全文内容,发布时间,length_nrc,positive_nrc,negative_nrc,anger_nrc,anticipation_nrc,...,length_dlut,positive_dlut,negative_dlut,anger_dlut,disgust_dlut,fear_dlut,good_dlut,sadness_dlut,surprise_dlut,happy_dlut
0,1,新闻,[中国网]教育部回应“双减”传闻：从严审批培训机构 严控作业总量,中国网北京3月31日讯（记者 徐虹 刘佳）3月31日，国务院新闻办公室就贯彻“十四五”规划，...,2021-03-31,467,53,21,4,14,...,30,0,0,0,0,0,0,0,0,0
1,2,新闻,[北京青年报]教育部回应“双减”传闻：切实减轻学生校外培训负担和作业负担,3月31日，国务院新闻办公室举行新闻发布会，介绍深入贯彻“十四五”规划，加快建设高质量教育体...,2021-03-31,470,51,19,4,14,...,129,9,6,0,3,2,9,1,0,0
2,3,新闻,[中国教育在线]教育部回应“双减”问题：今年把校外培训机构治理列入重点工作任务,3月31日上午，国务院新闻办举行新闻发布会，介绍深入贯彻“十四五”规划，加快建设高质量教育体...,2021-03-31,315,40,11,2,14,...,11,0,0,0,0,0,0,0,0,0
3,4,机构,陈宝生调研北京教育工作,4月9日，中央教育工作领导小组秘书组组长、教育部党组书记、部长陈宝生赴北京市调研教育综合改革...,2021-04-11,662,112,16,8,20,...,155,7,6,0,5,0,7,1,0,0
4,5,新闻,孙春兰在安徽调研时强调 推动全国职业教育大会精神落地落实 加快构建现代职业教育体系,新华社合肥4月23日电 中共中央政治局委员、国务院副总理孙春兰22日至23日在安徽调研时强...,2021-04-23,451,88,9,4,16,...,433,34,16,0,14,1,23,1,0,11


In [30]:
df_1.columns

Index(['序号', '类型', '标题', '全文内容', '发布时间', 'length_nrc', 'positive_nrc',
       'negative_nrc', 'anger_nrc', 'anticipation_nrc', 'disgust_nrc',
       'fear_nrc', 'joy_nrc', 'sadness_nrc', 'surprise_nrc', 'trust_nrc',
       'length_dlut', 'positive_dlut', 'negative_dlut', 'anger_dlut',
       'disgust_dlut', 'fear_dlut', 'good_dlut', 'sadness_dlut',
       'surprise_dlut', 'happy_dlut'],
      dtype='object')

## 双减热门微博

In [31]:
df_2 = pd.read_excel('双减热门微博.xlsx').astype(str)

In [32]:
df_2.head()

Unnamed: 0,序号,博主昵称,微博内容,转发数,评论数,点赞数,发布时间
0,1,学**,#双减政策# 看到一项题为“Academic Stress and Mental healt...,38,3,31,2023年12月31日 12:00
1,2,金**,我看了一下，历史上新东方股价最高$199，最低$8，现在$71，前几周$85。无论股市还是房...,15,57,378,2023年12月27日 18:42
2,3,田**,看到了么？双减彻底，就是我们这样的，家长亲自下场，不然一个学期下来，全是窟窿。学校抓紧一点，...,4,51,81,2023年12月26日 12:23
3,4,兔**,如果一个人在版号限制那年从游戏转行干了教培，结婚买了恒大的期房，再遇到双减，努力大半年凭借好...,10,27,187,2023年12月22日 21:46
4,5,目**,看到有朋友说，从来没见过跌停的etf，今天游戏etf让人大开眼界。也有朋友把这个“网游新规征...,3,53,289,2023年12月22日 20:48


### NRC情感词典（NRC Emotion Lexicon）

In [34]:
emo_nrc_2 = pd.DataFrame(columns=['length_nrc', 'positive_nrc', 'negative_nrc',
                                    'anger_nrc', 'anticipation_nrc', 'disgust_nrc', 'fear_nrc',
                                    'joy_nrc', 'sadness_nrc', 'surprise_nrc', 'trust_nrc'])

for dc in df_2.index:
    positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, trust = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    wordlist = list(jieba.cut(df_2['微博内容'][dc]))
    wordset = set(wordlist)
    wordfreq = []
    
    for word in wordset:
        freq = wordlist.count(word)
        if word in Positive:
            positive += freq
        if word in Negative:
            negative += freq
        if word in Anger:
            anger += freq
        if word in Anticipation:
            anticipation += freq
        if word in Disgust:
            disgust += freq
        if word in Fear:
            fear += freq
        if word in Joy:
            joy += freq
        if word in Sadness:
            sadness += freq
        if word in Surprise:
            surprise += freq
        if word in Trust:
            trust += freq
            
    emotion_info = {
        'length_nrc': len(wordlist),
        'positive_nrc': positive,
        'negative_nrc': negative,
        'anger_nrc': anger,
        'anticipation_nrc': anticipation,
        'disgust_nrc': disgust,
        'fear_nrc': fear,
        'joy_nrc': joy,
        'sadness_nrc': sadness,
        'surprise_nrc': surprise,
        'trust_nrc': trust
    }
    
    emo_info = pd.DataFrame([emotion_info])
    emo_nrc_2 = pd.concat([emo_nrc_2, emo_info], ignore_index=True)

emo_nrc_2.head()

Unnamed: 0,length_nrc,positive_nrc,negative_nrc,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,262,6,1,0,7,1,0,7,3,0,14
1,123,3,7,0,2,2,3,3,3,0,5
2,57,1,5,0,1,4,1,0,0,0,2
3,74,2,1,0,4,0,0,2,2,0,2
4,205,18,2,0,1,1,0,3,4,0,3


In [35]:
df_2 = pd.concat([df_2, emo_nrc_2], axis=1)
df_2.head()

Unnamed: 0,序号,博主昵称,微博内容,转发数,评论数,点赞数,发布时间,length_nrc,positive_nrc,negative_nrc,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,1,学**,#双减政策# 看到一项题为“Academic Stress and Mental healt...,38,3,31,2023年12月31日 12:00,262,6,1,0,7,1,0,7,3,0,14
1,2,金**,我看了一下，历史上新东方股价最高$199，最低$8，现在$71，前几周$85。无论股市还是房...,15,57,378,2023年12月27日 18:42,123,3,7,0,2,2,3,3,3,0,5
2,3,田**,看到了么？双减彻底，就是我们这样的，家长亲自下场，不然一个学期下来，全是窟窿。学校抓紧一点，...,4,51,81,2023年12月26日 12:23,57,1,5,0,1,4,1,0,0,0,2
3,4,兔**,如果一个人在版号限制那年从游戏转行干了教培，结婚买了恒大的期房，再遇到双减，努力大半年凭借好...,10,27,187,2023年12月22日 21:46,74,2,1,0,4,0,0,2,2,0,2
4,5,目**,看到有朋友说，从来没见过跌停的etf，今天游戏etf让人大开眼界。也有朋友把这个“网游新规征...,3,53,289,2023年12月22日 20:48,205,18,2,0,1,1,0,3,4,0,3


### 大连理工大学情感词典（DLUT Emotion Lexicon）

In [37]:
dlut = pd.read_excel('DLUT-Emotion-Lexicon.xlsx', usecols=['词语', '词性种类', '情感分类', '强度'])
dlut.head()

Unnamed: 0,词语,词性种类,情感分类,强度
0,脏乱,adj,NN,7
1,糟报,adj,NN,5
2,早衰,adj,NE,5
3,责备,verb,NN,5
4,贼眼,noun,NN,5


In [38]:
# 整理情感词典
Happy, Good, Surprise, Anger, Sad, Fear, Disgust  = [], [], [], [], [], [], []

for idx, row in dlut.iterrows():
    if row['情感分类'] in ['PA', 'PE']:
        Happy.append(row['词语'])
    if row['情感分类'] in ['PD', 'PH', 'PG', 'PB', 'PK']:
        Good.append(row['词语']) 
    if row['情感分类'] in ['PC']:
        Surprise.append(row['词语'])     
    if row['情感分类'] in ['NA']:
        Anger.append(row['词语'])    
    if row['情感分类'] in ['NB', 'NJ', 'NH', 'PF']:
        Sad.append(row['词语'])
    if row['情感分类'] in ['NI', 'NC', 'NG']:
        Fear.append(row['词语'])
    if row['情感分类'] in ['NE', 'ND', 'NN', 'NK', 'NL']:
        Disgust.append(row['词语'])
Positive = Happy + Good + Surprise
Negative = Anger + Sad + Fear + Disgust

In [39]:
emo_dlut_2 = pd.DataFrame(columns=['length_dlut', 'positive_dlut', 'negative_dlut',
                                'anger_dlut', 'disgust_dlut', 'fear_dlut', 'good_dlut',
                                'sadness_dlut', 'surprise_dlut', 'happy_dlut'])

for dc in df_2.index:
    positive, negative, anger, disgust, fear, sad, surprise, good, happy = 0, 0, 0, 0, 0, 0, 0, 0, 0
    wordlist = list(jieba.cut(df_2['微博内容'][dc]))
    wordset = set(wordlist)
    wordfreq = []
    for word in wordset:
        freq = wordlist.count(word)
        if word in Positive:
            positive += freq
        if word in Negative:
            negative += freq
        if word in Anger:
            anger += freq
        if word in Disgust:
            disgust += freq
        if word in Fear:
            fear += freq
        if word in Sad:
            sad += freq
        if word in Surprise:
            surprise += freq
        if word in Good:
            good += freq
        if word in Happy:
            happy += freq
            
    emotion_info = {
        'length_dlut': len(wordlist),
        'positive_dlut': positive,
        'negative_dlut': negative,
        'anger_dlut': anger,
        'disgust_dlut': disgust,
        'fear_dlut': fear,
        'good_dlut': good,
        'sadness_dlut': sad,
        'surprise_dlut': surprise,
        'happy_dlut': happy
    }
    
    emo_info = pd.DataFrame([emotion_info])
    emo_dlut_2 = pd.concat([emo_dlut_2, emo_info], ignore_index=True)
    
emo_dlut_2.head()

Unnamed: 0,length_dlut,positive_dlut,negative_dlut,anger_dlut,disgust_dlut,fear_dlut,good_dlut,sadness_dlut,surprise_dlut,happy_dlut
0,262,6,1,0,1,0,2,0,0,4
1,123,3,7,0,2,3,3,2,0,0
2,57,1,5,0,4,1,1,0,0,0
3,74,2,1,0,0,0,1,1,0,1
4,205,18,2,0,1,0,15,1,0,3


In [40]:
df_2 = pd.concat([df_2, emo_dlut_2], axis=1)
df_2.head()

Unnamed: 0,序号,博主昵称,微博内容,转发数,评论数,点赞数,发布时间,length_nrc,positive_nrc,negative_nrc,...,length_dlut,positive_dlut,negative_dlut,anger_dlut,disgust_dlut,fear_dlut,good_dlut,sadness_dlut,surprise_dlut,happy_dlut
0,1,学**,#双减政策# 看到一项题为“Academic Stress and Mental healt...,38,3,31,2023年12月31日 12:00,262,6,1,...,262,6,1,0,1,0,2,0,0,4
1,2,金**,我看了一下，历史上新东方股价最高$199，最低$8，现在$71，前几周$85。无论股市还是房...,15,57,378,2023年12月27日 18:42,123,3,7,...,123,3,7,0,2,3,3,2,0,0
2,3,田**,看到了么？双减彻底，就是我们这样的，家长亲自下场，不然一个学期下来，全是窟窿。学校抓紧一点，...,4,51,81,2023年12月26日 12:23,57,1,5,...,57,1,5,0,4,1,1,0,0,0
3,4,兔**,如果一个人在版号限制那年从游戏转行干了教培，结婚买了恒大的期房，再遇到双减，努力大半年凭借好...,10,27,187,2023年12月22日 21:46,74,2,1,...,74,2,1,0,0,0,1,1,0,1
4,5,目**,看到有朋友说，从来没见过跌停的etf，今天游戏etf让人大开眼界。也有朋友把这个“网游新规征...,3,53,289,2023年12月22日 20:48,205,18,2,...,205,18,2,0,1,0,15,1,0,3


In [41]:
df_2.columns

Index(['序号', '博主昵称', '微博内容', '转发数', '评论数', '点赞数', '发布时间', 'length_nrc',
       'positive_nrc', 'negative_nrc', 'anger_nrc', 'anticipation_nrc',
       'disgust_nrc', 'fear_nrc', 'joy_nrc', 'sadness_nrc', 'surprise_nrc',
       'trust_nrc', 'length_dlut', 'positive_dlut', 'negative_dlut',
       'anger_dlut', 'disgust_dlut', 'fear_dlut', 'good_dlut', 'sadness_dlut',
       'surprise_dlut', 'happy_dlut'],
      dtype='object')

## 双减热门微博评论

In [42]:
df_3 = pd.read_excel('双减热门微博评论.xlsx').astype(str)

In [43]:
df_3.head()

Unnamed: 0,序号,用户昵称,评论,发布时间,发布地区
0,1,姜姜是个小叮当,现在链接可以下载啦！,2024-05-20,山东
1,2,用户7406707968,老师，为什么买来您的结构化视频课程，看不了啊？ 客服也联系不上 怎么处理呢,2024-05-20,贵州
2,3,Jemimaa_,谢谢姜姜,2024-05-21,贵州
3,4,LLLLLLL_YAN_Q,过期了,2024-05-28,广东
4,5,桐桐李凤桐,过期啦~求补,2024-05-28,黑龙江


### NRC情感词典（NRC Emotion Lexicon）

In [44]:
emo_nrc_3 = pd.DataFrame(columns=['length_nrc', 'positive_nrc', 'negative_nrc',
                                    'anger_nrc', 'anticipation_nrc', 'disgust_nrc', 'fear_nrc',
                                    'joy_nrc', 'sadness_nrc', 'surprise_nrc', 'trust_nrc'])

for dc in df_3.index:
    positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, trust = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    wordlist = list(jieba.cut(df_3['评论'][dc]))
    wordset = set(wordlist)
    wordfreq = []
    
    for word in wordset:
        freq = wordlist.count(word)
        if word in Positive:
            positive += freq
        if word in Negative:
            negative += freq
        if word in Anger:
            anger += freq
        if word in Anticipation:
            anticipation += freq
        if word in Disgust:
            disgust += freq
        if word in Fear:
            fear += freq
        if word in Joy:
            joy += freq
        if word in Sadness:
            sadness += freq
        if word in Surprise:
            surprise += freq
        if word in Trust:
            trust += freq
            
    emotion_info = {
        'length_nrc': len(wordlist),
        'positive_nrc': positive,
        'negative_nrc': negative,
        'anger_nrc': anger,
        'anticipation_nrc': anticipation,
        'disgust_nrc': disgust,
        'fear_nrc': fear,
        'joy_nrc': joy,
        'sadness_nrc': sadness,
        'surprise_nrc': surprise,
        'trust_nrc': trust
    }
    
    emo_info = pd.DataFrame([emotion_info])
    emo_nrc_3 = pd.concat([emo_nrc_3, emo_info], ignore_index=True)

emo_nrc_3.head()

Unnamed: 0,length_nrc,positive_nrc,negative_nrc,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,6,0,0,0,0,0,0,0,0,0,0
1,23,0,1,0,1,1,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,0


In [45]:
df_3 = pd.concat([df_3, emo_nrc_3], axis=1)
df_3.head()

Unnamed: 0,序号,用户昵称,评论,发布时间,发布地区,length_nrc,positive_nrc,negative_nrc,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,1,姜姜是个小叮当,现在链接可以下载啦！,2024-05-20,山东,6,0,0,0,0,0,0,0,0,0,0
1,2,用户7406707968,老师，为什么买来您的结构化视频课程，看不了啊？ 客服也联系不上 怎么处理呢,2024-05-20,贵州,23,0,1,0,1,1,0,0,0,0,0
2,3,Jemimaa_,谢谢姜姜,2024-05-21,贵州,2,0,0,0,0,0,0,0,0,0,0
3,4,LLLLLLL_YAN_Q,过期了,2024-05-28,广东,2,0,0,0,0,0,0,0,0,0,0
4,5,桐桐李凤桐,过期啦~求补,2024-05-28,黑龙江,4,0,0,0,0,0,0,0,0,0,0


### 大连理工大学情感词典（DLUT Emotion Lexicon）

In [46]:
emo_dlut_3 = pd.DataFrame(columns=['length_dlut', 'positive_dlut', 'negative_dlut',
                                'anger_dlut', 'disgust_dlut', 'fear_dlut', 'good_dlut',
                                'sadness_dlut', 'surprise_dlut', 'happy_dlut'])

for dc in df_3.index:
    positive, negative, anger, disgust, fear, sad, surprise, good, happy = 0, 0, 0, 0, 0, 0, 0, 0, 0
    wordlist = list(jieba.cut(df_3['评论'][dc]))
    wordset = set(wordlist)
    wordfreq = []
    for word in wordset:
        freq = wordlist.count(word)
        if word in Positive:
            positive += freq
        if word in Negative:
            negative += freq
        if word in Anger:
            anger += freq
        if word in Disgust:
            disgust += freq
        if word in Fear:
            fear += freq
        if word in Sad:
            sad += freq
        if word in Surprise:
            surprise += freq
        if word in Good:
            good += freq
        if word in Happy:
            happy += freq
            
    emotion_info = {
        'length_dlut': len(wordlist),
        'positive_dlut': positive,
        'negative_dlut': negative,
        'anger_dlut': anger,
        'disgust_dlut': disgust,
        'fear_dlut': fear,
        'good_dlut': good,
        'sadness_dlut': sad,
        'surprise_dlut': surprise,
        'happy_dlut': happy
    }
    
    emo_info = pd.DataFrame([emotion_info])
    emo_dlut_3 = pd.concat([emo_dlut_3, emo_info], ignore_index=True)
    
emo_dlut_3.head()

Unnamed: 0,length_dlut,positive_dlut,negative_dlut,anger_dlut,disgust_dlut,fear_dlut,good_dlut,sadness_dlut,surprise_dlut,happy_dlut
0,6,0,0,0,0,0,0,0,0,0
1,23,0,1,0,1,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0


In [47]:
df_3 = pd.concat([df_3, emo_dlut_3], axis=1)
df_3.head()

Unnamed: 0,序号,用户昵称,评论,发布时间,发布地区,length_nrc,positive_nrc,negative_nrc,anger_nrc,anticipation_nrc,...,length_dlut,positive_dlut,negative_dlut,anger_dlut,disgust_dlut,fear_dlut,good_dlut,sadness_dlut,surprise_dlut,happy_dlut
0,1,姜姜是个小叮当,现在链接可以下载啦！,2024-05-20,山东,6,0,0,0,0,...,6,0,0,0,0,0,0,0,0,0
1,2,用户7406707968,老师，为什么买来您的结构化视频课程，看不了啊？ 客服也联系不上 怎么处理呢,2024-05-20,贵州,23,0,1,0,1,...,23,0,1,0,1,0,0,0,0,0
2,3,Jemimaa_,谢谢姜姜,2024-05-21,贵州,2,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
3,4,LLLLLLL_YAN_Q,过期了,2024-05-28,广东,2,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
4,5,桐桐李凤桐,过期啦~求补,2024-05-28,黑龙江,4,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0


In [48]:
df_3.columns

Index(['序号', '用户昵称', '评论', '发布时间', '发布地区', 'length_nrc', 'positive_nrc',
       'negative_nrc', 'anger_nrc', 'anticipation_nrc', 'disgust_nrc',
       'fear_nrc', 'joy_nrc', 'sadness_nrc', 'surprise_nrc', 'trust_nrc',
       'length_dlut', 'positive_dlut', 'negative_dlut', 'anger_dlut',
       'disgust_dlut', 'fear_dlut', 'good_dlut', 'sadness_dlut',
       'surprise_dlut', 'happy_dlut'],
      dtype='object')