# Stanza

In [32]:
import stanza
from stanza.utils.resources import DEFAULT_MODEL_DIR
# Print stanza's DEFAULT_MODEL_DIR so the reader can see which local directory it points to
print(DEFAULT_MODEL_DIR)

/Users/yuni/stanza_resources


In [33]:
# Download the English model (uncomment for the first run)
# stanza.download('en')
# Download the Traditional Chinese model (uncomment for the first run)
# stanza.download('zh-hant')

In [34]:
# Sample texts: one English sentence and one Traditional Chinese passage
# NOTE(review): the English sentence contains a duplicated "that that"; it is kept
# as-is because the recorded annotation output tokenizes both occurrences.
en_doc = 'On April 3, the Central Epidemic Command Center (CECC) reported that that 827 additional cases related to coronavirus disease 2019 (COVID-19) were reported on April 2.'
zh_doc = '中央流行疫情指揮中心今(3)日表示，昨(2)日國內新增827例新型冠狀病毒肺炎相關通報， 截至目前累計34,557例(含30,530例排除)，其中348例確診(今日新增案340至348)，分別為 298例境外移入及50例本土病例。'

In [35]:
# Build the English Stanza pipeline with the tokenize, lemma and pos processors (GPU disabled)
en_nlp = stanza.Pipeline(lang='en', processors='tokenize,lemma,pos', use_gpu=False)
# Build the Traditional Chinese ('zh-hant') Stanza pipeline with the same processors (GPU disabled)
zh_nlp = stanza.Pipeline(lang='zh-hant', processors='tokenize,lemma,pos', use_gpu=False)

2020-04-12 20:53:45 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2020-04-12 20:53:45 INFO: Use device: cpu
2020-04-12 20:53:45 INFO: Loading: tokenize
2020-04-12 20:53:45 INFO: Loading: pos
2020-04-12 20:53:46 INFO: Loading: lemma
2020-04-12 20:53:46 INFO: Done loading processors!
2020-04-12 20:53:46 INFO: Loading these models for language: zh-hant (Traditional_Chinese):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2020-04-12 20:53:46 INFO: Use device: cpu
2020-04-12 20:53:46 INFO: Loading: tokenize
2020-04-12 20:53:46 INFO: Loading: pos
2020-04-12 20:53:48 INFO: Loading: lemma
2020-04-12 20:53:48 INFO: Done loading processors!


In [36]:
# Run the English pipeline over the English sample text
en_annotated_doc = en_nlp(en_doc)
# Run the Traditional Chinese pipeline over the Chinese sample text
zh_annotated_doc = zh_nlp(zh_doc)

In [53]:
# Flatten the Stanza-annotated Chinese document into a single list of token texts
fin = [
    format(token.text)
    for sentence in zh_annotated_doc.sentences
    for token in sentence.words
]
print(fin)

['中央', '流行', '疫情', '指揮', '中心', '今(3)', '日', '表示', '，', '昨(', '2)', '日國內', '新增', '827', '例', '新型', '冠狀', '病毒', '肺炎', '相關', '通報', '，', '截至', '目前', '累計', '34,557', '例(含30,530', '例排除', ')', '，', '其中', '348', '例確診(', '今', '日新', '增案', '340', '至', '348)', '，', '分別', '為', '298', '例', '境外', '移入', '及', '50', '例', '本土', '病例', '。']


In [42]:
def show(doc):
    """Print a tab-separated INDEX/TEXT/LEMMA/POS table for every sentence in ``doc``.

    Args:
        doc: An annotated document exposing ``sentences``; each sentence exposes
            ``words`` whose items carry ``text``, ``lemma`` and ``pos`` attributes.

    Returns:
        None. The tables are written to standard output; sentences are numbered
        from 1 and words are indexed from 0 within each sentence.
    """
    for sent_no, sent in enumerate(doc.sentences, start=1):
        # Fix: the header must be formatted with str.format. The original passed
        # format(i+1) as a second print() argument, which printed a literal
        # "Sentence {}" and appended the number after "POS".
        print("Sentence {}\nINDEX\tTEXT\tLEMMA\tPOS".format(sent_no))
        for j, word in enumerate(sent.words):
            print("{}\t{}\t{}\t{}".format(j, word.text, word.lemma, word.pos))
        
# Print the per-word tables for the English document, then for the Chinese document
print('English annotation')
show(en_annotated_doc)
print('Chinese annotation')
show(zh_annotated_doc)

English annotation
Sentence {}
INDEX	TEXT	LEMMA	POS 1
0	On	on	ADP
1	April	April	PROPN
2	3	3	NUM
3	,	,	PUNCT
4	the	the	DET
5	Central	Central	PROPN
6	Epidemic	Epidemic	PROPN
7	Command	Command	PROPN
8	Center	Center	PROPN
9	(	(	PUNCT
10	CECC	CECC	PROPN
11	)	)	PUNCT
12	reported	report	VERB
13	that	that	SCONJ
14	that	that	SCONJ
15	827	827	NUM
16	additional	additional	ADJ
17	cases	case	NOUN
18	related	related	ADJ
19	to	to	ADP
20	coronavirus	coronavirus	NOUN
21	disease	disease	NOUN
22	2019	2019	NUM
23	(	(	PUNCT
24	COVID	COVID	PROPN
25	-	-	SYM
26	19	19	NUM
27	)	)	PUNCT
28	were	be	AUX
29	reported	report	VERB
30	on	on	ADP
31	April	April	PROPN
32	2	2	NUM
33	.	.	PUNCT
Chinese annotation
Sentence {}
INDEX	TEXT	LEMMA	POS 1
0	中央	中央	NOUN
1	流行	流行	ADJ
2	疫情	疫情	NOUN
3	指揮	指揮	NOUN
4	中心	中心	NOUN
5	今(3)	今(3)	X
6	日	日	NOUN
7	表示	表示	VERB
8	，	，	PUNCT
9	昨(	昨(	NOUN
10	2)	2)	NUM
11	日國內	日國內	NOUN
12	新增	新增	VERB
13	827	827	NUM
14	例	例	NOUN
15	新型	新型	NOUN
16	冠狀	冠狀	NOUN
17	病毒	病毒	NOUN
18	肺炎	肺炎	NOUN
19	相關	相關	ADJ
20	通報	通報	NOUN
21

# Ckip Tagger

In [2]:
import os
from pathlib import Path
from ckiptagger import data_utils
# path = os.path.join(str(Path.home()), 'ckip/')
# if not os.path.exists(path): os.mkdir(path)
# data_utils.download_data_gdown(path)

In [4]:
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER
from pathlib import Path
import os

In [5]:
import tensorflow as tf

In [6]:
# Download the CKIP traditional Chinese model data (about 2 GB, from Google Drive) on first use.
# The guard checks for the "data" folder that the WS/POS loaders read from, not just the
# parent directory, so an interrupted or failed earlier download is retried instead of
# being silently skipped.
path = str(Path.home())+'/ckip'
if not os.path.exists(os.path.join(path, 'data')):
    # makedirs(exist_ok=True) tolerates a directory left behind by a previous attempt
    os.makedirs(path, exist_ok=True)
    data_utils.download_data_gdown(path) # gdrive-ckip 2GB

In [7]:
# Sample Traditional Chinese passage to segment and tag with CKIP
zh_doc = '中央流行疫情指揮中心今(3)日表示，昨(2)日國內新增827例新型冠狀病毒肺炎相關通報， 截至目前累計34,557例(含30,530例排除)，其中348例確診(今日新增案340至348)，分別為 298例境外移入及50例本土病例。'

In [8]:
# Load the CKIP WS and POS models from the "data" folder under the download path
# (the recorded outputs below show WS producing the tokens and POS producing their tags)
zh_ws = WS(path+"/data")
zh_pos = POS(path+"/data")

In [9]:
# Segment the Chinese text (passed as a one-element list), then tag the segmented tokens
zh_annotated_ws = zh_ws([zh_doc])
zh_annotated_pos = zh_pos(zh_annotated_ws)

In [29]:
# Print one INDEX/TEXT/POS table per sentence, pairing each CKIP token with its tag
print('Chinese annotation')
for sent_idx, (tokens, tags) in enumerate(zip(zh_annotated_ws, zh_annotated_pos)):
    print('sentence {}:'.format(sent_idx))
    print('INDEX\tTEXT\tPOS')
    for tok_idx, (token, tag) in enumerate(zip(tokens, tags)):
        print('{}\t{}\t{}'.format(tok_idx, token, tag))

Chinese annotation
sentence 0:
INDEX	TEXT	POS
0	中央	Nc
1	流行	VH
2	疫情	Na
3	指揮	VC
4	中心	Nc
5	今	Nd
6	(3)	Neu
7	日	Nd
8	表示	VE
9	，	COMMACATEGORY
10	昨	Nd
11	(2)	Neu
12	日	Nd
13	國內	Nc
14	新增	VJ
15	827	Neu
16	例	Na
17	新型	Na
18	冠狀	Na
19	病毒	Na
20	肺炎	Na
21	相關	VH
22	通報	VE
23	，	COMMACATEGORY
24	 	WHITESPACE
25	截至	P
26	目前	Nd
27	累計	VJ
28	34,557	Neu
29	例	Na
30	(	PARENTHESISCATEGORY
31	含	VJ
32	30,530	Neu
33	例	Na
34	排除	VC
35	)	PARENTHESISCATEGORY
36	，	COMMACATEGORY
37	其中	Nep
38	348	Neu
39	例	Na
40	確診	VA
41	(	PARENTHESISCATEGORY
42	今日	Nd
43	新增案	Na
44	340	Neu
45	至	Caa
46	348	Neu
47	)	PARENTHESISCATEGORY
48	，	COMMACATEGORY
49	分別	D
50	為	VG
51	 298	FW
52	例	Na
53	境	Na
54	外	Ncd
55	移入	VC
56	及	Caa
57	50	Neu
58	例	Na
59	本土	Nc
60	病例	Na
61	。	PERIODCATEGORY


In [27]:
# Flatten the CKIP segmentation result into a single list of tokens and print it once.
# The original printed inside the sentence loop while `res` kept accumulating, so with
# several sentences it would print growing cumulative lists; printing once after
# collecting matches the equivalent Stanza cell.
print('Chinese annotation')
res = [token for sentence_tokens in zh_annotated_ws for token in sentence_tokens]
print(res)

Chinese annotation
['中央', '流行', '疫情', '指揮', '中心', '今', '(3)', '日', '表示', '，', '昨', '(2)', '日', '國內', '新增', '827', '例', '新型', '冠狀', '病毒', '肺炎', '相關', '通報', '，', ' ', '截至', '目前', '累計', '34,557', '例', '(', '含', '30,530', '例', '排除', ')', '，', '其中', '348', '例', '確診', '(', '今日', '新增案', '340', '至', '348', ')', '，', '分別', '為', ' 298', '例', '境', '外', '移入', '及', '50', '例', '本土', '病例', '。']


In [None]:
import json

In [None]:
# First attempt: parse the whole news file as a single JSON document
with open("/Users/yuni/cts_0301_0403_news.json", 'r', encoding='utf-8') as f:
    print(type(f))
    res = json.load(f)
    # Error when reading the large JSON data
    # NOTE(review): presumably the file holds one JSON object per line (a later cell
    # parses it line by line), which json.load cannot read in one call — confirm.

In [None]:
# Second attempt: read only the first line of the file and parse it as one JSON document
with open("/Users/yuni/cts_0301_0403_news.json", 'r', encoding='utf-8') as f:
    line = f.readline()
    print(type(line))
    res = json.loads(line)

In [None]:
# Parse the news file line by line: each non-blank line is one JSON object.
# The `with` block closes the handle (the original never closed it).
papers = []
with open("/Users/yuni/cts_0301_0403_news.json", 'r', encoding='utf-8') as file:
    for line in file:
        if line.strip():  # skip blank lines, which json.loads would reject
            papers.append(json.loads(line))

# Collect the title followed by the content of every record, in file order
data_list = []
for paper in papers:
    data_list.append(paper["title"])
    data_list.append(paper["content"])

# Concatenate everything into one string; a single join avoids the quadratic
# cost of repeated string concatenation in a loop
data = ''.join(str(item) for item in data_list)

In [None]:
# Number of records parsed from the news file
print(len(papers))

In [None]:
import jieba
import jieba.posseg as pseg
import os
import urllib

In [None]:
# Download the traditional Chinese dictionary for jieba into the working directory,
# skipping the download when the file is already present.
# `import urllib` alone does not load the `request` submodule, so import it explicitly;
# otherwise urllib.request can raise AttributeError on a fresh kernel.
import urllib.request

URL = 'https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big'
if not os.path.exists(os.path.join(os.getcwd(), 'dict.txt.big')):
    urllib.request.urlretrieve(URL, 'dict.txt.big')
# Optional jieba configuration, left disabled:
# jieba.set_dictionary('dict.txt.big')
# jieba.load_userdict('userdict.txt')
# jieba.add_word('國內')

In [None]:
# Segment the Chinese text with jieba.posseg and keep the result as a list
# (the next cell unpacks each item into a word and its POS tag)
# NOTE(review): this reuses the name zh_annotated_pos from the CKIP section and replaces that result
zh_annotated_pos = list(pseg.cut(zh_doc))

In [None]:
# Print a tab-separated INDEX/TEXT/POS table for the jieba result.
# Fix: the header literal was garbled ('INDEXItTEXTItPOS'); it now uses real tab
# escapes so it lines up with the rows and matches the CKIP table header.
print('INDEX\tTEXT\tPOS')
for i,(word,pos) in enumerate(zh_annotated_pos):
    print('{}\t{}\t{}'.format(i, word, pos))

In [14]:
# Sample Traditional Chinese passage to segment with jieba
zh_doc = '中央流行疫情指揮中心今(3)日表示，昨(2)日國內新增827例新型冠狀病毒肺炎相關通報， 截至目前累計34,557例(含30,530例排除)，其中348例確診(今日新增案340至348)，分別為 298例境外移入及50例本土病例。'

In [15]:
# Segment the Chinese text with jieba using cut_all=False and keep the tokens as a list
# NOTE(review): the NameError recorded below means `jieba` was not defined in this kernel
# session; the cell that imports jieba shows no execution count, so it was presumably not run.
zh_annotated = list(jieba.cut(zh_doc, cut_all=False))

NameError: name 'jieba' is not defined

In [28]:
# Print a tab-separated INDEX/TEXT table for the jieba segmentation.
# Fix: the header literal was garbled ('INDEXItTEXT'); it now uses a real tab
# escape so it lines up with the rows printed below it.
print('Chinese annotation')
print('INDEX\tTEXT')
for j,word in enumerate(zh_annotated):
    print('{}\t{}'.format(j, word))

Chinese annotation
INDEXItTEXT


NameError: name 'zh_annotated' is not defined