In [1]:
import os
import subprocess

# Build the Stanford NER command line.
# The JVM expects classpath entries to be joined with the platform's
# path-list separator (";" on Windows, ":" on POSIX systems), so the
# separator comes from os.pathsep instead of being hard-coded to ";".
classpath = os.pathsep.join(["stanford-ner.jar", "lib/*"])
command = [
    "java", "-mx1000m", "-cp", classpath,
    "edu.stanford.nlp.ie.crf.CRFClassifier",
    "-loadClassifier", "classifiers/english.all.3class.distsim.crf.ser.gz",
    "-textFile", "sample.txt"
]

# Run the command, capturing stdout and stderr as text
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

# stderr is captured rather than shown, so without this check a failed run
# would go unnoticed and the cell would just print an empty string.
if result.returncode != 0:
    raise RuntimeError(
        f"Stanford NER exited with status {result.returncode}:\n{result.stderr}"
    )

# Print the tagged text
print(result.stdout)


The/O fate/O of/O Lehman/ORGANIZATION Brothers/ORGANIZATION ,/O the/O beleaguered/O investment/O bank/O ,/O hung/O in/O the/O balance/O on/O Sunday/O as/O Federal/ORGANIZATION Reserve/ORGANIZATION officials/O and/O the/O leaders/O of/O major/O financial/O institutions/O continued/O to/O gather/O in/O emergency/O meetings/O trying/O to/O complete/O a/O plan/O to/O rescue/O the/O stricken/O bank/O ./O 
Several/O possible/O plans/O emerged/O from/O the/O talks/O ,/O held/O at/O the/O Federal/ORGANIZATION Reserve/ORGANIZATION Bank/ORGANIZATION of/ORGANIZATION New/ORGANIZATION York/ORGANIZATION and/O led/O by/O Timothy/PERSON R./PERSON Geithner/PERSON ,/O the/O president/O of/O the/O New/ORGANIZATION York/ORGANIZATION Fed/ORGANIZATION ,/O and/O Treasury/ORGANIZATION Secretary/O Henry/PERSON M./PERSON Paulson/PERSON Jr./PERSON ./O 



In [2]:
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Paths to the Stanford NER classifier model and jar (relative to the working directory)
model = 'classifiers/english.all.3class.distsim.crf.ser.gz'
jar = 'stanford-ner.jar'

# Create the NLTK wrapper around the Stanford NER tagger
stanford_ner_tagger = StanfordNERTagger(model, jar, encoding='utf-8')

# Read the whole of sample.txt into memory
with open("sample.txt", encoding="utf-8") as file:
    text = file.read()

# Split the text into tokens, then tag every token with its entity label
words = word_tokenize(text)
tagged_words = stanford_ner_tagger.tag(words)

# Show one "token: label" pair per line
for word, tag in tagged_words:
    print(word, tag, sep=": ")


The: O
fate: O
of: O
Lehman: ORGANIZATION
Brothers: ORGANIZATION
,: O
the: O
beleaguered: O
investment: O
bank: O
,: O
hung: O
in: O
the: O
balance: O
on: O
Sunday: O
as: O
Federal: ORGANIZATION
Reserve: ORGANIZATION
officials: O
and: O
the: O
leaders: O
of: O
major: O
financial: O
institutions: O
continued: O
to: O
gather: O
in: O
emergency: O
meetings: O
trying: O
to: O
complete: O
a: O
plan: O
to: O
rescue: O
the: O
stricken: O
bank: O
.: O
Several: O
possible: O
plans: O
emerged: O
from: O
the: O
talks: O
,: O
held: O
at: O
the: O
Federal: ORGANIZATION
Reserve: ORGANIZATION
Bank: ORGANIZATION
of: ORGANIZATION
New: ORGANIZATION
York: ORGANIZATION
and: O
led: O
by: O
Timothy: PERSON
R.: PERSON
Geithner: PERSON
,: O
the: O
president: O
of: O
the: O
New: ORGANIZATION
York: ORGANIZATION
Fed: ORGANIZATION
,: O
and: O
Treasury: ORGANIZATION
Secretary: O
Henry: PERSON
M.: PERSON
Paulson: PERSON
Jr: PERSON
.: O


In [3]:
import pandas as pd

# Load the (word, entity) pairs into a two-column DataFrame
tagged_df = pd.DataFrame(data=tagged_words, columns=["Word", "Entity"])

# Display the DataFrame
tagged_df


Unnamed: 0,Word,Entity
0,The,O
1,fate,O
2,of,O
3,Lehman,ORGANIZATION
4,Brothers,ORGANIZATION
...,...,...
80,Henry,PERSON
81,M.,PERSON
82,Paulson,PERSON
83,Jr,PERSON


In [4]:
# Keep only the rows whose entity label is something other than "O"
is_entity = tagged_df["Entity"].ne("O")
entities_df = tagged_df.loc[is_entity]

# Display the rows that carry an entity label
entities_df


Unnamed: 0,Word,Entity
3,Lehman,ORGANIZATION
4,Brothers,ORGANIZATION
18,Federal,ORGANIZATION
19,Reserve,ORGANIZATION
56,Federal,ORGANIZATION
57,Reserve,ORGANIZATION
58,Bank,ORGANIZATION
59,of,ORGANIZATION
60,New,ORGANIZATION
61,York,ORGANIZATION


In [5]:
# Write the full tagged DataFrame to a CSV file, leaving out the index column
tagged_df.to_csv(path_or_buf="ner_tagged_output.csv", index=False)
