In [1]:
import sys
sys.version_info

sys.version_info(major=3, minor=8, micro=13, releaselevel='final', serial=0)

In [2]:
import re
import os
import nltk
import string
import wget
from collections import Counter  #計算字的次數，用來作詞頻分析

## Common Text Preprocessing


In [4]:
PUNCT_TO_REMOVE = string.punctuation   #內建會移除的標點符號
print("PUNCT_TO_REMOVE:",PUNCT_TO_REMOVE)

text = "Artificial Intelligence (AI), sometimes called machine intelligence, we learning it"
remove_punct_text = text.translate(str.maketrans("","",PUNCT_TO_REMOVE))  #將標點符號用空白取代
print("remove_punct_text:",remove_punct_text)

PUNCT_TO_REMOVE: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
remove_punct_text: Artificial Intelligence AI sometimes called machine intelligence we learning it


## Remove of Stopwords (代詞、代名詞那些會被移除)

In [12]:
##下載 stoppwords
nltk.download('stopwords')

from nltk.corpus import stopwords
STOPWORDS = stopwords.words('english')
print("STOPWORDS:",", ".join(STOPWORDS))

text = "Artificial Intelligence (AI), sometimes called machine intelligence, we learning it"
remove_stopword_text = [word for word in text.split() if word not in STOPWORDS]  #先將句子切割成單詞， 如果不是stopwords就取出來
print("remove_stopword_text:", " ".join(remove_stopword_text))

STOPWORDS: i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yifun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Remove of Frequent words (移除高頻字)

In [15]:
from collections import Counter
num = 3
text = ["Artificial Intelligence (AI), sometimes called machine intelligence".split(),
        "Artificial Intelligence was founded as an academic displine in 1995".split()]  #雙層的list

cnt = Counter([word for sent in text for word in sent])  #從兩層的list中先取出每一句話，再從每一句話中取出每個單詞，再利用Counter做累積次數
print("Counter:",cnt)

FREQWORDS = set([w for (w,wc) in cnt.most_common(num)]) #利用most_common()去讀出出現次數最高的前幾名的詞，(w,wc)代表該單詞與單詞出現的次數
print("FREQWORDS:",FREQWORDS)

remove_fre_text = [[word for word in sent if word not in FREQWORDS] for sent in text]   #從右往左看回去，先取出每個句子，再取出句子中的每個詞，如果不是高頻詞才會抓出來
print("remove_fre_text:",remove_fre_text)

Counter: Counter({'Artificial': 2, 'Intelligence': 2, '(AI),': 1, 'sometimes': 1, 'called': 1, 'machine': 1, 'intelligence': 1, 'was': 1, 'founded': 1, 'as': 1, 'an': 1, 'academic': 1, 'displine': 1, 'in': 1, '1995': 1})
FREQWORDS: {'(AI),', 'Intelligence', 'Artificial'}
remove_fre_text: [['sometimes', 'called', 'machine', 'intelligence'], ['was', 'founded', 'as', 'an', 'academic', 'displine', 'in', '1995']]


## Remove of Rare words

In [18]:
num = 5
text = ["Artificial Intelligence (AI), sometimes called machine intelligence".split(),
        "Artificial Intelligence was founded as an academic displine in 1995".split()]  #雙層的list

cnt = Counter([word for sent in text for word in sent])

RARWORDS = [w for (w, wc) in cnt.most_common()[-num-1:-1]]
print("RARWORDS:",RARWORDS)

remove_rare_text = [[word for word in sent if word not in RARWORDS]for sent in text]
print("remove_rare_text:",remove_rare_text)

RARWORDS: ['as', 'an', 'academic', 'displine', 'in']
remove_rare_text: [['Artificial', 'Intelligence', '(AI),', 'sometimes', 'called', 'machine', 'intelligence'], ['Artificial', 'Intelligence', 'was', 'founded', '1995']]


## Stemming 字根還原

In [20]:
from nltk.stem .porter import PorterStemmer
stemmer = PorterStemmer()  #使用Porter Algo 就可以去創造物件
print("stemmer: ", stemmer)

text = "Artificial Intelligence (AI), sometimes called machine intelligence, we learning it"
stem_text = " ".join([stemmer.stem(word) for word in text.split()])   #針對一句話的每個字做還原字根
print("srem_text: ", stem_text)

stemmer:  <PorterStemmer>
srem_text:  artifici intellig (ai), sometim call machin intelligence, we learn it


## Lemmatization 詞形還原

In [22]:
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer =  WordNetLemmatizer()

text = "Artificial Intelligence was founded as an academic displine in 1995"
lemma_text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
print("lemma_text:",lemma_text)

lemma_text: Artificial Intelligence wa founded a an academic displine in 1995


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yifun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yifun\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Remove of URLs

In [24]:
url_pattern = re.compile(r"https?://\S+|www\.\S+")

text = "Artificial Intelligence was founded as an academic displine in 1995 at https://zh.wikipedia.org/zh-tw/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD"

remove_url_text = url_pattern.sub(r'',text)
print("Remove_url_text:",remove_url_text)

Remove_url_text: Artificial Intelligence was founded as an academic displine in 1995 at 


## NLP Tool

In [30]:
zh_text = ['人工智慧亦稱智械、機器智慧,指由人製造出來的機器所表現出来的智慧。',
'通常人工智慧是指透過普通電腦程式來现人题智慧的技術。',
'中華郵政未来智慧物流服務,將取之大眾智慧,中華郵政帶給民眾更好的便利生活。']

en_text = ['In 1951, a board of directors was created and premises were rented on Hankou Street in downtown Taipe; to set up the so-called Soochow Preparatory Schoo.',
'The school became the first private university in Taiwan. ']

## Jieba - Chinese text segmentation

使用說明: https://github.com/fxsjy/jieba

In [26]:
import jieba
import jieba.posseg as pseg  #用來做詞性標註的
import wget

In [28]:
#採用平行運算
#僅支援 jieba.dt和 jieba.pogseg.dt
# Windows不支援
# 例如使用4個核心執行
#i.e.
#jieba.enable parallel(4)

#下載繁體中文字典檔和自定義詞典檔
# 下載繁體中文字典檔和自定義詞典檔
wget.download('https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big', 'dict.txt.big')
wget.download('https://raw.githubusercontent.com/fxsjy/jieba/master/test/userdict.txt', 'userdict.txt')

# 載入指定主詞典路徑
jieba.set_dictionary('dict.txt.big')

# 載入指定自定義詞典路徑
jieba.load_userdict('userdict.txt')

# 如果不是在 Windows 平台，且想要使用平行運算，可以取消下一行的註解
# jieba.enable_parallel(4)

Building prefix dict from c:\Users\yifun\Desktop\python_code\dict.txt.big ...
Dumping model to file cache C:\Users\yifun\AppData\Local\Temp\jieba.uea161c6b71afd61d5734afab1a200174.cache
Loading model cost 1.811 seconds.
Prefix dict has been built successfully.


In [37]:
print(" [Paddle 模式]")
J_sents_annotated_ws = [jieba.lcut(sent, use_paddle = True) for sent in zh_text]  #使用深度學習去斷詞
print(J_sents_annotated_ws)       

print("\n [全模式]")
J_sents_annotated_ws = [jieba.lcut(sent, cut_all = True) for sent in zh_text]  
print(J_sents_annotated_ws)     

print("\n [精確模式]")
J_sents_annotated_ws = [jieba.lcut(sent, cut_all = False) for sent in zh_text]  
print(J_sents_annotated_ws)     

print("\n [搜尋引擎模式]")   #切出可能可以用來搜尋的單詞
J_sents_annotated_ws = [jieba.lcut_for_search(sent) for sent in zh_text]  
print(J_sents_annotated_ws)     


 [Paddle 模式]
[['人工智慧', '亦', '稱智械', '、', '機器', '智慧', ',', '指由人', '製造', '出來', '的', '機器', '所', '表現', '出来', '的', '智慧', '。'], ['通常', '人工智慧', '是', '指', '透過', '普通', '電腦程式', '來现', '人题', '智慧', '的', '技術', '。'], ['中華', '郵政', '未来', '智慧', '物流', '服務', ',', '將取', '之大眾', '智慧', ',', '中華', '郵政', '帶給', '民眾', '更好', '的', '便利', '生活', '。']]

 [全模式]
[['人工', '人工智慧', '智慧', '亦', '稱', '智', '械', '、', '機器', '智慧', ',', '指', '由', '人', '製造', '造出', '造出來', '出來', '的', '機器', '所', '表現', '現出', '出来', '的', '智慧', '。'], ['通常', '常人', '人工', '人工智慧', '智慧', '是', '指', '透過', '普通', '通電', '電腦', '電腦程式', '程式', '來', '现', '人', '题', '智慧', '的', '技術', '。'], ['中華', '中華郵', '郵政', '未来', '智慧', '物流', '服務', ',', '將', '取', '之', '大', '眾', '智慧', ',', '中華', '中華郵', '郵政', '帶給', '民', '眾', '更好', '的', '便利', '利生', '生活', '。']]

 [精確模式]
[['人工智慧', '亦', '稱智械', '、', '機器', '智慧', ',', '指由人', '製造', '出來', '的', '機器', '所', '表現', '出来', '的', '智慧', '。'], ['通常', '人工智慧', '是', '指', '透過', '普通', '電腦程式', '來现', '人题', '智慧', '的', '技術', '。'], ['中華', '郵政', '未来', '智慧', '物流', '服務', ',',

In [38]:
#調整字典
jieba.add_word("中華郵政",freq=None, tag=None)

print("\n [精確模式]")
J_sents_annotated_ws = [jieba.lcut(sent, cut_all = False) for sent in zh_text]  
print(J_sents_annotated_ws) 

# 若之後不想要，可以使用del_word
# jieba.del_word('中華郵政')


 [精確模式]
[['人工智慧', '亦', '稱智械', '、', '機器', '智慧', ',', '指由人', '製造', '出來', '的', '機器', '所', '表現', '出来', '的', '智慧', '。'], ['通常', '人工智慧', '是', '指', '透過', '普通', '電腦程式', '來现', '人题', '智慧', '的', '技術', '。'], ['中華郵政', '未来', '智慧', '物流', '服務', ',', '將取', '之大眾', '智慧', ',', '中華郵政', '帶給', '民眾', '更好', '的', '便利', '生活', '。']]


#### 詞性標註

In [40]:
print('[paddle 模式]')
J_sents_annotated_pos = [pseg.lcut(sent, use_paddle=True) for sent in zh_text]
J_sents_annotated_pos

[paddle 模式]


[[pair('人工智慧', 'l'),
  pair('亦', 'd'),
  pair('稱', 'v'),
  pair('智械', 'n'),
  pair('、', 'x'),
  pair('機器', 'x'),
  pair('智慧', 'nr'),
  pair(',', 'x'),
  pair('指由人', 'n'),
  pair('製造', 'x'),
  pair('出來', 'x'),
  pair('的', 'uj'),
  pair('機器', 'x'),
  pair('所', 'c'),
  pair('表現', 'x'),
  pair('出来', 'v'),
  pair('的', 'uj'),
  pair('智慧', 'nr'),
  pair('。', 'x')],
 [pair('通常', 'd'),
  pair('人工智慧', 'l'),
  pair('是', 'v'),
  pair('指', 'n'),
  pair('透過', 'x'),
  pair('普通', 'nz'),
  pair('電腦程式', 'x'),
  pair('來现', 'v'),
  pair('人题', 'n'),
  pair('智慧', 'nr'),
  pair('的', 'uj'),
  pair('技術', 'x'),
  pair('。', 'x')],
 [pair('中華郵政', 'x'),
  pair('未来', 't'),
  pair('智慧', 'nr'),
  pair('物流', 'n'),
  pair('服務', 'x'),
  pair(',', 'x'),
  pair('將', 'd'),
  pair('取', 'v'),
  pair('之', 'u'),
  pair('大眾', 'n'),
  pair('智慧', 'nr'),
  pair(',', 'x'),
  pair('中華郵政', 'x'),
  pair('帶給', 'x'),
  pair('民眾', 'n'),
  pair('更好', 'd'),
  pair('的', 'uj'),
  pair('便利', 'a'),
  pair('生活', 'vn'),
  pair('。', 'x')]]

In [41]:
# 重新排版

print('INDEX\tTEXT\tPOS')
for sent in J_sents_annotated_pos:
    for i,(word,pos) in enumerate(sent):
        print("{}\t{}\t{}".format(i,word,pos))
    print("")

INDEX	TEXT	POS
0	人工智慧	l
1	亦	d
2	稱	v
3	智械	n
4	、	x
5	機器	x
6	智慧	nr
7	,	x
8	指由人	n
9	製造	x
10	出來	x
11	的	uj
12	機器	x
13	所	c
14	表現	x
15	出来	v
16	的	uj
17	智慧	nr
18	。	x

0	通常	d
1	人工智慧	l
2	是	v
3	指	n
4	透過	x
5	普通	nz
6	電腦程式	x
7	來现	v
8	人题	n
9	智慧	nr
10	的	uj
11	技術	x
12	。	x

0	中華郵政	x
1	未来	t
2	智慧	nr
3	物流	n
4	服務	x
5	,	x
6	將	d
7	取	v
8	之	u
9	大眾	n
10	智慧	nr
11	,	x
12	中華郵政	x
13	帶給	x
14	民眾	n
15	更好	d
16	的	uj
17	便利	a
18	生活	vn
19	。	x



## 使用Stanza 套件

In [42]:
import stanza
stanza.download('en')   # download English model
stanza.download('zh-hant')  # download Chinese model

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-10-19 13:52:09 INFO: Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/default.zip:   0%|          | 0…

2023-10-19 13:53:53 INFO: Finished downloading models and saved to C:\Users\yifun\stanza_resources.


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-10-19 13:53:54 INFO: Downloading default packages for language: zh-hant (Traditional_Chinese) ...


Downloading https://huggingface.co/stanfordnlp/stanza-zh-hant/resolve/v1.6.0/models/default.zip:   0%|        …

2023-10-19 13:54:26 INFO: Finished downloading models and saved to C:\Users\yifun\stanza_resources.


In [43]:
#Initialize Chinese annotator
stanza_zh_annotator = stanza.Pipeline(lang='zh-hant',processors='tokenize,lemma,pos,depparse',use_gpu=True)
                                            # 這邊要告訴標註器要做: 斷詞、還原字根、詞性標註、結構樹     #用GPU來加速

#Initialize English annotator
stanza_en_annotator = stanza.Pipeline(lang='en',processors='tokenize,lemma,pos,depparse',use_gpu=True)

2023-10-19 13:55:32 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-10-19 13:55:32 INFO: Loading these models for language: zh-hant (Traditional_Chinese):
| Processor | Package      |
----------------------------
| tokenize  | gsd          |
| pos       | gsd_nocharlm |
| lemma     | gsd_nocharlm |
| depparse  | gsd_nocharlm |

2023-10-19 13:55:32 INFO: Using device: cpu
2023-10-19 13:55:32 INFO: Loading: tokenize
2023-10-19 13:55:33 INFO: Loading: pos
2023-10-19 13:55:33 INFO: Loading: lemma
2023-10-19 13:55:33 INFO: Loading: depparse
2023-10-19 13:55:33 INFO: Done loading processors!
2023-10-19 13:55:33 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-10-19 13:55:34 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2023-10-19 13:55:34 INFO: Using device: cpu
2023-10-19 13:55:34 INFO: Loading: tokenize
2023-10-19 13:55:34 INFO: Loading: pos
2023-10-19 13:55:34 INFO: Loading: lemma
2023-10-19 13:55:34 INFO: Loading: depparse
2023-10-19 13:55:34 INFO: Done loading processors!


### 斷詞、詞性標註、句法分析

In [47]:
def show(doc):
    for i, sent in enumerate(doc.sentences):  # 讀出每一句話
        print("Sentence {}\nINDEX\tTEXT\tLEMMA\tPOS\tDepRel\tHeadId".format(i+1))
        for j, word in enumerate(sent.words):  # 將每句話的每個字，依序顯示出來
            print("{}\t{}\t{}\t{}\t{}\t{}".format(j,word.text,word.lemma,word.pos,word.deprel,word.head))
        print("")

In [48]:
# 利用中文斷詞器進行分析
print("S_sents_annotated_ws_pos_deprel_zh:")
S_sents_annotated_ws_pos_deprel_zh = [stanza_zh_annotator(sent) for sent in zh_text]
for sent in S_sents_annotated_ws_pos_deprel_zh:
    show(sent)

S_sents_annotated_ws_pos_deprel_zh:
Sentence 1
INDEX	TEXT	LEMMA	POS	DepRel	HeadId
0	人工	人工	NOUN	nmod	2
1	智慧	智慧	NOUN	nsubj	4
2	亦	亦	ADV	mark	4
3	稱	稱	VERB	root	0
4	智械	智械	NOUN	obj	4
5	、	、	PUNCT	punct	8
6	機器	機器	NOUN	nmod	8
7	智慧	智慧	NOUN	conj	5
8	,	,	ADV	advmod	10
9	指	指	VERB	ccomp	4
10	由	由	ADP	case	12
11	人	人	NOUN	obl	13
12	製造	製造	VERB	acl:relcl	16
13	出來	出來	VERB	mark	13
14	的	的	PART	mark:rel	13
15	機器	機器	NOUN	nsubj	18
16	所	所	ADV	mark	18
17	表現	表現	VERB	acl:relcl	21
18	出来	出来	VERB	acl:relcl	21
19	的	的	PART	mark:rel	19
20	智慧	智慧	NOUN	obj	10
21	。	。	PUNCT	punct	10

Sentence 1
INDEX	TEXT	LEMMA	POS	DepRel	HeadId
0	通常	通常	ADJ	amod	3
1	人工	人工	NOUN	nmod	3
2	智慧	智慧	NOUN	nsubj	4
3	是	是	VERB	root	0
4	指	指	VERB	xcomp	4
5	透過	透過	ADP	case	9
6	普通	普通	ADJ	amod	9
7	電腦	電腦	NOUN	nmod	9
8	程式	程式	NOUN	obl	11
9	來	來	ADV	mark	11
10	现	现	VERB	xcomp	5
11	人	人	NOUN	nsubj	13
12	题	题	VERB	acl:relcl	16
13	智慧	智慧	NOUN	obj	13
14	的	的	PART	mark:rel	13
15	技術	技術	NOUN	obj	5
16	。	。	PUNCT	punct	4

Sentence 1
INDEX	TEXT	LEMMA	POS	DepRel	HeadId
0	中華	中華	PRO

In [49]:
# 利用英文斷詞器進行分析
print("S_sents_annotated_ws_pos_deprel_en:")
S_sents_annotated_ws_pos_deprel_en = [stanza_en_annotator(sent) for sent in en_text]
for sent in S_sents_annotated_ws_pos_deprel_en:
    show(sent)

S_sents_annotated_ws_pos_deprel_en:
Sentence 1
INDEX	TEXT	LEMMA	POS	DepRel	HeadId
0	In	in	ADP	case	2
1	1951	1951	NUM	obl	9
2	,	,	PUNCT	punct	9
3	a	a	DET	det	5
4	board	board	NOUN	nsubj:pass	9
5	of	of	ADP	case	7
6	directors	director	NOUN	nmod	5
7	was	be	AUX	aux:pass	9
8	created	create	VERB	root	0
9	and	and	CCONJ	cc	13
10	premises	premise	NOUN	nsubj:pass	13
11	were	be	AUX	aux:pass	13
12	rented	rent	VERB	conj	9
13	on	on	ADP	case	16
14	Hankou	Hankou	PROPN	compound	16
15	Street	Street	PROPN	obl	13
16	in	in	ADP	case	19
17	downtown	downtown	ADJ	amod	19
18	Taipe	Taipe	PROPN	nmod	16
19	;	;	PUNCT	punct	22
20	to	to	PART	mark	22
21	set	set	VERB	advcl	13
22	up	up	ADP	compound:prt	22
23	the	the	DET	det	30
24	so	so	ADV	advmod	27
25	-	-	PUNCT	punct	27
26	called	call	VERB	amod	30
27	Soochow	Soochow	PROPN	compound	30
28	Preparatory	Preparatory	ADJ	amod	30
29	Schoo	Schoo	PROPN	obj	22
30	.	.	PUNCT	punct	9

Sentence 1
INDEX	TEXT	LEMMA	POS	DepRel	HeadId
0	The	the	DET	det	2
1	school	school	NOUN	nsubj	3
2	beca

## NER 命名實體辨識

In [50]:
# Initialize English annotator for NER
stanza_en_annotator = stanza.Pipeline(lang='en',processors="tokenize,ner",use_gpu=True)

2023-10-19 14:20:55 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-10-19 14:20:56 INFO: Loading these models for language: en (English):
| Processor | Package          |
--------------------------------
| tokenize  | combined         |
| ner       | ontonotes_charlm |

2023-10-19 14:20:56 INFO: Using device: cpu
2023-10-19 14:20:56 INFO: Loading: tokenize
2023-10-19 14:20:56 INFO: Loading: ner
2023-10-19 14:20:57 INFO: Done loading processors!


In [51]:
def show_ner(doc):
    for i, sent in enumerate(doc.sentences):  # 讀出每一句話
        print("Sentence {}\nID\tTEXT\tNER".format(i+1))
        for j, token in enumerate(sent.tokens):  # 將每句話的每個字，依序顯示出來
            print("{}\t{}\t{}".format(j,token.text,token.ner))
        print("")

In [52]:
# 利用英文斷詞器進行分析
print("S_sents_annotated_NER_en:")
S_sents_annotated_NER_en = [stanza_en_annotator(sent) for sent in en_text]
for sent in S_sents_annotated_NER_en:
    show_ner(sent)

S_sents_annotated_NER_en:
Sentence 1
ID	TEXT	NER
0	In	O
1	1951	S-DATE
2	,	O
3	a	O
4	board	O
5	of	O
6	directors	O
7	was	O
8	created	O
9	and	O
10	premises	O
11	were	O
12	rented	O
13	on	O
14	Hankou	B-FAC
15	Street	E-FAC
16	in	O
17	downtown	O
18	Taipe	S-GPE
19	;	O
20	to	O
21	set	O
22	up	O
23	the	O
24	so	O
25	-	O
26	called	O
27	Soochow	B-ORG
28	Preparatory	I-ORG
29	Schoo	E-ORG
30	.	O

Sentence 1
ID	TEXT	NER
0	The	O
1	school	O
2	became	O
3	the	O
4	first	S-ORDINAL
5	private	O
6	university	O
7	in	O
8	Taiwan	S-GPE
9	.	O

