# News Preprocessing Using Ckiplab NLP Package

In [1]:
import pandas as pd
import numpy
from collections import Counter
from snownlp import SnowNLP
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

# Load the raw articles; the input file is pipe-delimited.
df = pd.read_csv('now_category_news.csv', sep='|')

# Build the three CKIP drivers (segmenter, POS tagger, NER chunker), all with level=2.
ws = CkipWordSegmenter(level=2)
pos = CkipPosTagger(level=2)
ner = CkipNerChunker(level=2)

# Segment every article body in the `content` column.
tokens = ws(df.content)

# Tag the segmented output. The result is zipped with `tokens` below, so it is
# presumably one tag sequence per article, aligned with the tokens — TODO confirm.
tokens_pos = pos(tokens)

# Pair each token with its tag, article by article: one list of (word, tag) tuples per article.
word_pos_pair = [list(zip(w, p)) for w, p in zip(tokens, tokens_pos)]

# Entity recognition is run on the raw text, not on the segmented tokens.
entity_list = ner(df.content)

allowPOS = ['Na', 'Nb', 'Nc', 'VA', 'VAC', 'VB', 'VC']

# Filtered token list per article: keep a word only when its tag is in
# allowPOS and the word is at least two characters long.
tokens_v2 = [
    [word for word, tag in pairs if tag in allowPOS and len(word) >= 2]
    for pairs in word_pos_pair
]

# Attach the per-article results to the frame, one column per result list.
for column, values in (
    ('tokens', tokens),
    ('tokens_v2', tokens_v2),
    ('entities', entity_list),
    ('token_pos', word_pos_pair),
):
    df[column] = values

allowPOS = ['Na', 'Nb', 'Nc', 'VC']


def word_frequency(wp_pair, allowed_pos=None, top_n=200, min_len=2):
    """Count the most frequent words of one article, filtered by POS tag.

    Parameters
    ----------
    wp_pair : iterable of (word, tag) tuples
        The token/tag pairs of a single article.
    allowed_pos : iterable of str, optional
        Tags to keep. When omitted, the module-level ``allowPOS`` is read at
        call time, which is what the original one-argument call did.
    top_n : int, optional
        Maximum number of (word, count) pairs to return (default 200).
    min_len : int, optional
        Minimum word length to keep (default 2).

    Returns
    -------
    list of (word, count) tuples, most frequent first; ties keep the order
    in which the words were first seen.
    """
    if allowed_pos is None:
        allowed_pos = allowPOS
    allowed = set(allowed_pos)

    # `tag` rather than `pos`, so the loop variable cannot be confused with
    # the module-level POS tagger named `pos`. Logical `and` replaces the
    # bitwise `&` that was applied to two booleans.
    counter = Counter(
        word
        for word, tag in wp_pair
        if tag in allowed and len(word) >= min_len
    )
    return counter.most_common(top_n)


# Per-article keyword counts: one word_frequency() result for each article's (word, tag) list.
keyfreqs = [word_frequency(pairs) for pairs in word_pos_pair]

df['top_key_freq'] = keyfreqs

# One SnowNLP object per article supplies both the summary and the
# sentiment value (rounded to two decimals).
summary, sentiment = [], []
for article in df.content:
    analysis = SnowNLP(article)
    summary.append(analysis.summary())
    sentiment.append(round(analysis.sentiments, 2))

df['summary'], df['sentiment'] = summary, sentiment

# Keep only the columns to be exported, in their final order.
df = df.loc[:, [
    'item_id', 'date', 'category', 'title', 'content',
    'sentiment', 'summary', 'top_key_freq',
    'tokens', 'tokens_v2', 'entities', 'token_pos',
    'link', 'photo_link',
]]

# Pipe-delimited output without the index column; later cells read it back with sep='|'.
df.to_csv('now_news_preprocessed.csv', sep='|', index=False)

print("process OK!")

Tokenization: 100%|██████████| 3671/3671 [00:03<00:00, 963.46it/s] 
Inference: 100%|██████████| 29/29 [58:32<00:00, 121.11s/it]
Tokenization: 100%|██████████| 3671/3671 [00:02<00:00, 1229.41it/s]
Inference: 100%|██████████| 749/749 [15:59:19<00:00, 76.85s/it]  
Tokenization: 100%|██████████| 3671/3671 [00:04<00:00, 787.93it/s] 
Inference: 100%|██████████| 29/29 [1:01:58<00:00, 128.22s/it]


process OK!


In [2]:
df = pd.read_csv('now_news_preprocessed.csv', sep='|')
df.head(1)

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,column/_20220505_1001,2022-05-05,焦點,配發率32%！富邦金每股配4元股利、現金股利歷史新高,富邦金控今（5）日公告，董事會決議通過擬配發普通股每股現金股利3.5元及股票股利0.5元，合...,0.0,"['富邦金這次同時配發股票股利每股0.5元', '包含富邦人壽上繳155.78億元、台北富邦...","[('股利', 18), ('富邦', 12), ('現金', 9), ('股票', 5),...","['富邦', '金控', '今', '（5', '）', '日', '公告', '，', '...","['富邦', '金控', '董事會', '通過', '普通股', '現金', '股利', '...","[NerToken(word='富邦金控', ner='ORG', idx=(0, 4)),...","[('富邦', 'Nb'), ('金控', 'Na'), ('今', 'Nd'), ('（5...",https://www.nownews.com/news/5796259,https://media.nownews.com/nn_media/thumbnail/2...


## Count the 200 most frequent words in each category

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('now_news_preprocessed.csv',sep='|')

In [5]:
from collections import Counter

In [6]:
# Path prefixes, one per category. The list has the same length as news_categories,
# so it is presumably a parallel list in the same order — TODO confirm.
# NOTE(review): news_links is not referenced anywhere else in the visible notebook.
news_links = ['column/', 'news-summary/', 'breaking/', 'entertainment/', 'novelty/', 'life/', 'finance/', 'st/', 'news-global/', 'sport/']
# Category labels; get_top_words() compares them against the `category` column.
news_categories = ['焦點', '要聞', '即時', '娛樂', '新奇', '生活', '財經', '專題', '全球', '運動']

In [7]:
# POS tags counted by get_top_words(). Note the name and contents differ from
# `allowPOS` in the preprocessing cell, which also allows verb tags.
allowedPOS=['Na', 'Nb', 'Nc']

In [8]:
def get_top_words(data=None, categories=None, allowed_pos=None, top_n=200):
    """Return the most frequent words per news category, plus an overall list.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with a ``category`` column and a ``token_pos`` column. Each
        ``token_pos`` value is either a list of (word, tag) tuples or the
        string repr of such a list (which is what the column becomes after a
        CSV round trip). Defaults to the module-level ``df``.
    categories : iterable of str, optional
        Category labels to report, in output order. Defaults to the
        module-level ``news_categories``.
    allowed_pos : iterable of str, optional
        POS tags to keep. Defaults to the module-level ``allowedPOS``.
    top_n : int, optional
        Maximum number of words per list (default 200).

    Returns
    -------
    list of (category, [(word, count), ...]) tuples, one per category in
    order, followed by a final ``'全部'`` entry aggregated over all the
    requested categories.

    Raises
    ------
    ValueError, SyntaxError
        If a ``token_pos`` string is not a valid Python literal.
    """
    # Local import so the function does not depend on an import cell.
    from ast import literal_eval

    # Globals are resolved at call time, so the zero-argument call behaves
    # exactly as before.
    if data is None:
        data = df
    if categories is None:
        categories = news_categories
    if allowed_pos is None:
        allowed_pos = allowedPOS
    allowed = set(allowed_pos)

    top_cate_words = {}
    counter_all = Counter()
    for category in categories:
        counter = Counter()
        for row in data.loc[data.category == category, 'token_pos']:
            # The stored value is a plain literal (list of string tuples), so
            # literal_eval parses it without executing arbitrary code the way
            # eval() would.
            pairs = literal_eval(row) if isinstance(row, str) else row
            counter.update(
                word
                for word, tag in pairs
                if len(word) >= 2 and tag in allowed
            )
        counter_all += counter
        top_cate_words[category] = counter.most_common(top_n)
    top_cate_words['全部'] = counter_all.most_common(top_n)
    return list(top_cate_words.items())

In [9]:
# NOTE(review): `result` is not read anywhere in the visible notebook and the next
# cell repeats the same call — likely a leftover; confirm before removing.
result = get_top_words()

In [10]:
top_group_words = get_top_words()

In [11]:
df_top_group_words = pd.DataFrame(top_group_words, columns = ['category','top_keys'])

In [12]:
df_top_group_words

Unnamed: 0,category,top_keys
0,焦點,"[(台灣, 555), (美國, 387), (疫情, 369), (中國, 341), (..."
1,要聞,"[(台灣, 576), (疫情, 554), (中央, 309), (政府, 285), (..."
2,即時,"[(台灣, 532), (疫情, 392), (網友, 357), (公司, 223), (..."
3,娛樂,"[(網友, 332), (粉絲, 292), (汪小菲, 239), (台灣, 208), ..."
4,新奇,"[(網友, 645), (編輯, 289), (台灣, 240), (貼文, 236), (..."
5,生活,"[(台灣, 301), (網友, 254), (疫情, 239), (民眾, 231), (..."
6,財經,"[(台灣, 435), (公司, 303), (能源, 293), (市場, 279), (..."
7,專題,"[(女優, 2082), (男優, 1890), (台灣, 1272), (產業, 981)..."
8,全球,"[(中國, 732), (美國, 638), (烏克蘭, 422), (國家, 378), ..."
9,運動,"[(球員, 411), (球隊, 365), (比賽, 339), (勇士, 338), (..."


In [13]:
# Written with the default comma separator, unlike now_news_preprocessed.csv,
# which is pipe-delimited; readers of this file must use the matching separator.
df_top_group_words.to_csv('now_news_topkeys_with_category_via_token_pos.csv', index=False)

## Count top 200 persons via NER

In [14]:
df = pd.read_csv('now_news_preprocessed.csv',sep='|')

In [15]:
df.head(1)

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,column/_20220505_1001,2022-05-05,焦點,配發率32%！富邦金每股配4元股利、現金股利歷史新高,富邦金控今（5）日公告，董事會決議通過擬配發普通股每股現金股利3.5元及股票股利0.5元，合...,0.0,"['富邦金這次同時配發股票股利每股0.5元', '包含富邦人壽上繳155.78億元、台北富邦...","[('股利', 18), ('富邦', 12), ('現金', 9), ('股票', 5),...","['富邦', '金控', '今', '（5', '）', '日', '公告', '，', '...","['富邦', '金控', '董事會', '通過', '普通股', '現金', '股利', '...","[NerToken(word='富邦金控', ner='ORG', idx=(0, 4)),...","[('富邦', 'Nb'), ('金控', 'Na'), ('今', 'Nd'), ('（5...",https://www.nownews.com/news/5796259,https://media.nownews.com/nn_media/thumbnail/2...


In [16]:
df.entities[0]

"[NerToken(word='富邦金控', ner='ORG', idx=(0, 4)), NerToken(word='3.5元', ner='MONEY', idx=(31, 35)), NerToken(word='0.5元', ner='MONEY', idx=(40, 44)), NerToken(word='4.0元', ner='MONEY', idx=(52, 56)), NerToken(word='32%', ner='PERCENT', idx=(77, 80)), NerToken(word='富邦金', ner='ORG', idx=(86, 89)), NerToken(word='2021年', ner='DATE', idx=(116, 121)), NerToken(word='1,445.59億元', ner='MONEY', idx=(132, 142)), NerToken(word='富邦金', ner='ORG', idx=(170, 173)), NerToken(word='去年', ner='DATE', idx=(182, 184)), NerToken(word='新台幣3元', ner='MONEY', idx=(187, 192)), NerToken(word='3.5元', ner='MONEY', idx=(200, 204)), NerToken(word='俄', ner='GPE', idx=(262, 263)), NerToken(word='富邦金控', ner='ORG', idx=(336, 340)), NerToken(word='富邦人壽', ner='ORG', idx=(349, 353)), NerToken(word='155.78億元', ner='MONEY', idx=(355, 363)), NerToken(word='台北富邦銀行', ner='ORG', idx=(364, 370)), NerToken(word='富邦', ner='ORG', idx=(380, 382)), NerToken(word='47.99億元', ner='MONEY', idx=(386, 393)), NerToken(word='富邦', ner='ORG', id

In [17]:
def NerToken(word, ner, idx):
    """Stand-in used when eval()-ing the stored entity strings.

    The ``entities`` column holds text such as
    ``NerToken(word='富邦金控', ner='ORG', idx=(0, 4))``. Evaluating that text
    calls this function, which reduces each entity to a ``(ner, word)`` tuple.
    ``idx`` is accepted only so the stored call signature matches; it is
    discarded.
    """
    label_and_text = (ner, word)
    return label_and_text


In [18]:
# Each stored value is the repr of a list of NerToken(...) items; eval() resolves
# those calls through the NerToken() shim defined above, giving (label, text) tuples.
# NOTE(review): eval() executes whatever the CSV cell contains — only use it on
# files this notebook wrote itself.
# The loop targets are deliberately not named `ner`: a top-level loop target stays
# bound after the loop and would overwrite the CkipNerChunker that the
# preprocessing cell binds to `ner`.
for label, text in eval(df.entities[0]):
    print(label, text)

ORG 富邦金控
MONEY 3.5元
MONEY 0.5元
MONEY 4.0元
PERCENT 32%
ORG 富邦金
DATE 2021年
MONEY 1,445.59億元
ORG 富邦金
DATE 去年
MONEY 新台幣3元
MONEY 3.5元
GPE 俄
ORG 富邦金控
ORG 富邦人壽
MONEY 155.78億元
ORG 台北富邦銀行
ORG 富邦
MONEY 47.99億元
ORG 富邦
MONEY 1.43億元
ORG 富邦人壽
ORG 富邦人壽
PERCENT 5.2%
ORG 富邦
ORG 台北富邦銀行
ORG 北富銀
DATE 2021年
PERCENT 38.5%
ORG 富邦
ORG 富邦
ORG 富邦金
MONEY 0.5元
ORG 富邦金
ORG 金管會
CARDINAL 6
CARDINAL 500億


In [19]:
from collections import Counter

In [20]:
# Entity labels kept by ne_word_frequency(), which reads this module-level name.
# A later cell rebinds it inside a loop to select other labels.
allowedNE = ['PERSON']

# Same category labels as defined earlier in the notebook, repeated here.
news_categories = ['焦點', '要聞', '即時', '娛樂', '新奇', '生活', '財經', '專題', '全球', '運動']

In [21]:
def ne_word_frequency(a_news_ne, allowed_ne=None, top_n=200):
    """Count the most frequent entity texts among the allowed entity labels.

    Parameters
    ----------
    a_news_ne : iterable of (ner, word) tuples
        Entities as produced by the ``NerToken`` shim: label first, text second.
    allowed_ne : iterable of str, optional
        Entity labels to keep. When omitted, the module-level ``allowedNE`` is
        read at call time, so callers that rebind that global before calling
        keep working unchanged.
    top_n : int, optional
        Maximum number of (word, count) pairs to return (default 200).

    Returns
    -------
    list of (word, count) tuples, most frequent first. Entity texts shorter
    than two characters are ignored.
    """
    if allowed_ne is None:
        allowed_ne = allowedNE
    allowed = set(allowed_ne)

    # Logical `and` replaces the bitwise `&` that was applied to two booleans.
    counter = Counter(
        word
        for ner, word in a_news_ne
        if len(word) >= 2 and ner in allowed
    )
    return counter.most_common(top_n)

In [22]:
def get_top_ner_words():
    """Return the top entity texts per news category, plus an overall list.

    Reads the module-level ``df`` and ``news_categories``. For each category,
    the ``entities`` strings of its rows are evaluated and merged, then passed
    to ``ne_word_frequency()``, which applies the label filter. A final
    ``'全部'`` entry is computed over the entities of all listed categories.

    Returns
    -------
    list of (category, top_words) tuples in ``news_categories`` order,
    followed by the ``'全部'`` entry.
    """
    per_category = {}
    everything = []
    for category in news_categories:
        merged = []
        for row in df.loc[df.category == category, 'entities']:
            # NOTE(review): eval() executes the stored text and relies on the
            # NerToken() shim being defined; only safe on files this notebook
            # produced itself.
            merged.extend(eval(row))

        everything.extend(merged)
        per_category[category] = ne_word_frequency(merged)

    per_category['全部'] = ne_word_frequency(everything)
    return list(per_category.items())

In [23]:
%%time
hotPersons = get_top_ner_words()

CPU times: total: 1.36 s
Wall time: 1.4 s


In [24]:
hotPersons

[('焦點',
  [('陳時中', 113),
   ('柯文哲', 83),
   ('蔡英文', 81),
   ('汪小菲', 64),
   ('張善政', 63),
   ('馬可仕', 62),
   ('拜登', 53),
   ('蒲亭', 49),
   ('伊美黛', 48),
   ('葛斯齊', 41),
   ('黃珊珊', 38),
   ('于北辰', 34),
   ('王婉諭', 32),
   ('蔣萬安', 31),
   ('朱立倫', 31),
   ('林佳龍', 31),
   ('林智堅', 29),
   ('習近平', 28),
   ('鄭文燦', 25),
   ('侯友宜', 24),
   ('卡蜜兒', 24),
   ('羅智強', 23),
   ('沈慧虹', 23),
   ('蘇紫雲', 23),
   ('鄒幸彤', 23),
   ('金正恩', 22),
   ('蘇貞昌', 22),
   ('柯拉蓉', 22),
   ('孫唯容', 21),
   ('澤倫斯基', 21),
   ('陳苾暐', 21),
   ('戴普', 21),
   ('郭彥均', 20),
   ('邱志偉', 19),
   ('馬克宏', 18),
   ('馬英九', 18),
   ('高嘉瑜', 17),
   ('黃亭偉', 17),
   ('奈特', 17),
   ('艾班尼斯', 16),
   ('艾爾段', 16),
   ('岸田文雄', 15),
   ('顏若芳', 14),
   ('羅一鈞', 14),
   ('安珀', 14),
   ('陳偉華', 13),
   ('余天', 13),
   ('莊人祥', 13),
   ('易俊宏', 12),
   ('鄭運鵬', 12),
   ('艾奎諾', 12),
   ('艾奎諾二世', 12),
   ('莫里森', 11),
   ('邱奕勝', 11),
   ('陳有忠', 11),
   ('何博文', 10),
   ('張麗善', 10),
   ('張穎穎', 10),
   ('吳佩芸', 10),
   ('陳其邁', 9),
   ('林義傑', 9),
   ('徐巧芯', 9),
   

In [25]:
df_hotPersons = pd.DataFrame(hotPersons, columns = ['category','top_keys'])

In [26]:
df_hotPersons

Unnamed: 0,category,top_keys
0,焦點,"[(陳時中, 113), (柯文哲, 83), (蔡英文, 81), (汪小菲, 64), ..."
1,要聞,"[(柯文哲, 216), (侯友宜, 150), (陳時中, 126), (蔡英文, 119..."
2,即時,"[(陳時中, 90), (汪小菲, 74), (馬可仕, 67), (葛斯齊, 44), (..."
3,娛樂,"[(汪小菲, 235), (葛斯齊, 100), (張穎穎, 90), (納豆, 56), ..."
4,新奇,"[(鍾怡婷, 42), (張志浩, 39), (詹鎰睿, 34), (高敏敏, 33), (..."
5,生活,"[(柯文哲, 116), (陳時中, 85), (黃珊珊, 53), (鄭文燦, 48), ..."
6,財經,"[(王雲怡, 55), (林伯修, 45), (鄧振中, 43), (王守誠, 30), (..."
7,專題,"[(Jacky, 564), (吳夢夢, 417), (鄧佳華, 276), (孟若羽, 2..."
8,全球,"[(拜登, 142), (馬可仕, 85), (蒲亭, 74), (杜特蒂, 53), (王..."
9,運動,"[(塞爾提克, 70), (大谷, 45), (桑尼, 39), (大谷翔平, 37), (..."


In [27]:
df_hotPersons.to_csv('now_news_top_person_by_category_via_ner.csv', sep='|', index=False)

# Count the top 200 keywords for each NER entity type

In [28]:
import pandas as pd

In [29]:
df = pd.read_csv('now_news_preprocessed.csv',sep='|')

In [30]:
df.head(1)

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,column/_20220505_1001,2022-05-05,焦點,配發率32%！富邦金每股配4元股利、現金股利歷史新高,富邦金控今（5）日公告，董事會決議通過擬配發普通股每股現金股利3.5元及股票股利0.5元，合...,0.0,"['富邦金這次同時配發股票股利每股0.5元', '包含富邦人壽上繳155.78億元、台北富邦...","[('股利', 18), ('富邦', 12), ('現金', 9), ('股票', 5),...","['富邦', '金控', '今', '（5', '）', '日', '公告', '，', '...","['富邦', '金控', '董事會', '通過', '普通股', '現金', '股利', '...","[NerToken(word='富邦金控', ner='ORG', idx=(0, 4)),...","[('富邦', 'Nb'), ('金控', 'Na'), ('今', 'Nd'), ('（5...",https://www.nownews.com/news/5796259,https://media.nownews.com/nn_media/thumbnail/2...


In [31]:
df.entities[0]

"[NerToken(word='富邦金控', ner='ORG', idx=(0, 4)), NerToken(word='3.5元', ner='MONEY', idx=(31, 35)), NerToken(word='0.5元', ner='MONEY', idx=(40, 44)), NerToken(word='4.0元', ner='MONEY', idx=(52, 56)), NerToken(word='32%', ner='PERCENT', idx=(77, 80)), NerToken(word='富邦金', ner='ORG', idx=(86, 89)), NerToken(word='2021年', ner='DATE', idx=(116, 121)), NerToken(word='1,445.59億元', ner='MONEY', idx=(132, 142)), NerToken(word='富邦金', ner='ORG', idx=(170, 173)), NerToken(word='去年', ner='DATE', idx=(182, 184)), NerToken(word='新台幣3元', ner='MONEY', idx=(187, 192)), NerToken(word='3.5元', ner='MONEY', idx=(200, 204)), NerToken(word='俄', ner='GPE', idx=(262, 263)), NerToken(word='富邦金控', ner='ORG', idx=(336, 340)), NerToken(word='富邦人壽', ner='ORG', idx=(349, 353)), NerToken(word='155.78億元', ner='MONEY', idx=(355, 363)), NerToken(word='台北富邦銀行', ner='ORG', idx=(364, 370)), NerToken(word='富邦', ner='ORG', idx=(380, 382)), NerToken(word='47.99億元', ner='MONEY', idx=(386, 393)), NerToken(word='富邦', ner='ORG', id

In [32]:
def NerToken(word, ner, idx):
    """Shim that lets eval() rebuild the stored ``NerToken(...)`` reprs.

    Returns the entity as a ``(ner, word)`` tuple: label first, text second.
    ``idx`` is part of the stored call signature but is not used.
    """
    return (ner, word)

In [33]:
NerToken(word='烏克蘭', ner='GPE', idx=(4, 7))

('GPE', '烏克蘭')

In [34]:
eval(df.entities[0])

[('ORG', '富邦金控'),
 ('MONEY', '3.5元'),
 ('MONEY', '0.5元'),
 ('MONEY', '4.0元'),
 ('PERCENT', '32%'),
 ('ORG', '富邦金'),
 ('DATE', '2021年'),
 ('MONEY', '1,445.59億元'),
 ('ORG', '富邦金'),
 ('DATE', '去年'),
 ('MONEY', '新台幣3元'),
 ('MONEY', '3.5元'),
 ('GPE', '俄'),
 ('ORG', '富邦金控'),
 ('ORG', '富邦人壽'),
 ('MONEY', '155.78億元'),
 ('ORG', '台北富邦銀行'),
 ('ORG', '富邦'),
 ('MONEY', '47.99億元'),
 ('ORG', '富邦'),
 ('MONEY', '1.43億元'),
 ('ORG', '富邦人壽'),
 ('ORG', '富邦人壽'),
 ('PERCENT', '5.2%'),
 ('ORG', '富邦'),
 ('ORG', '台北富邦銀行'),
 ('ORG', '北富銀'),
 ('DATE', '2021年'),
 ('PERCENT', '38.5%'),
 ('ORG', '富邦'),
 ('ORG', '富邦'),
 ('ORG', '富邦金'),
 ('MONEY', '0.5元'),
 ('ORG', '富邦金'),
 ('ORG', '金管會'),
 ('CARDINAL', '6'),
 ('CARDINAL', '500億')]

In [35]:
# eval() turns the stored repr back into (label, text) tuples via the NerToken() shim.
# NOTE(review): eval() executes whatever the CSV cell contains — only use it on
# files this notebook wrote itself.
# The loop targets avoid the name `ner`: a top-level loop target stays bound after
# the loop and would overwrite the CkipNerChunker that the preprocessing cell
# binds to `ner`.
for label, text in eval(df.entities[0]):
    print(label, text)

ORG 富邦金控
MONEY 3.5元
MONEY 0.5元
MONEY 4.0元
PERCENT 32%
ORG 富邦金
DATE 2021年
MONEY 1,445.59億元
ORG 富邦金
DATE 去年
MONEY 新台幣3元
MONEY 3.5元
GPE 俄
ORG 富邦金控
ORG 富邦人壽
MONEY 155.78億元
ORG 台北富邦銀行
ORG 富邦
MONEY 47.99億元
ORG 富邦
MONEY 1.43億元
ORG 富邦人壽
ORG 富邦人壽
PERCENT 5.2%
ORG 富邦
ORG 台北富邦銀行
ORG 北富銀
DATE 2021年
PERCENT 38.5%
ORG 富邦
ORG 富邦
ORG 富邦金
MONEY 0.5元
ORG 富邦金
ORG 金管會
CARDINAL 6
CARDINAL 500億


In [36]:
from collections import Counter

CARDINAL	數字
DATE	日期
EVENT	事件
FAC	設施
GPE	行政區
LANGUAGE	語言
LAW	法律
LOC	地理區
MONEY	金錢
NORP	民族、宗教、政治團體
ORDINAL	序數
ORG	組織
PERCENT	百分比率
PERSON	人物
PRODUCT	產品
QUANTITY	數量
TIME	時間
WORK_OF_ART	作品

In [37]:
# The 18 entity labels to report on, in the same order as the legend above.
NE_Name=['CARDINAL','DATE','EVENT','FAC','GPE','LANGUAGE','LAW','LOC','MONEY','NORP','ORDINAL','ORG','PERCENT','PERSON','PRODUCT','QUANTITY','TIME','WORK_OF_ART']

In [38]:
%%time
# Slow cell: get_top_ner_words() runs once per entity label and re-evaluates
# every row's entity string on each pass.
# ne_word_frequency() reads the module-level allowedNE, so rebinding it here is
# what selects the label for each pass.
# NOTE(review): allowedNE stays bound to the last label after the loop, so
# re-running the PERSON cell afterwards without resetting it gives different results.
top_word_NE=[]
for ne_name in NE_Name:
    allowedNE= [ne_name]
    topwords = get_top_ner_words()
    top_word_NE.append([ne_name, topwords])

CPU times: total: 23.9 s
Wall time: 23.9 s


In [39]:
top_word_NE

[['CARDINAL',
  [('焦點',
    [('之一', 30),
     ('19', 27),
     ('14', 24),
     ('12', 21),
     ('18', 20),
     ('10', 18),
     ('11', 17),
     ('15', 16),
     ('30', 14),
     ('21', 13),
     ('119', 13),
     ('17', 11),
     ('50', 9),
     ('16', 9),
     ('三千', 9),
     ('13', 8),
     ('200', 8),
     ('20萬', 8),
     ('80', 8),
     ('百萬', 8),
     ('60', 7),
     ('1萬', 7),
     ('2000', 7),
     ('42', 7),
     ('120', 7),
     ('36', 7),
     ('100', 7),
     ('40', 7),
     ('27', 7),
     ('數百', 7),
     ('6/10', 7),
     ('25', 6),
     ('1億', 6),
     ('1000', 6),
     ('70', 6),
     ('10萬', 6),
     ('3/4', 6),
     ('77', 6),
     ('3萬', 5),
     ('29', 5),
     ('23', 5),
     ('500', 5),
     ('54', 5),
     ('1.9兆', 5),
     ('20', 5),
     ('200萬', 5),
     ('100萬', 5),
     ('3000', 5),
     ('38', 5),
     ('5.2萬', 5),
     ('300', 4),
     ('7+7', 4),
     ('一半', 4),
     ('192', 4),
     ('800', 4),
     ('400萬', 4),
     ('2019', 4),
     ('58', 4),
    

In [40]:
df_top_word_NE = pd.DataFrame(top_word_NE, columns = ['ne_name','top_keys'])

In [41]:
df_top_word_NE.to_csv('now_news_topkey_by_ner_and_category.csv', sep=',', index=False)