In [1]:
!pip install -U spacy[cuda110,transformers,lookups]==3.0.3
!pip install -U spacy-lookups-data==1.0.0
!pip install cupy-cuda110==8.5.0
!python -m spacy download en_core_web_trf

2025-03-04 21:30:14.838804: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
Collecting en-core-web-trf==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.0.0/en_core_web_trf-3.0.0-py3-none-any.whl (459.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m459.7/459.7 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [2]:
# Import spaCy and load the English transformer pipeline
# (the package itself is downloaded by the install cell, not here)
import spacy

SPACY_MODEL = "en_core_web_trf"
nlp = spacy.load(SPACY_MODEL)

2025-03-04 21:31:05.857855: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [3]:
# Tokenization: run only the pipeline's tokenizer on a sample sentence
sentence = nlp.tokenizer("We live in Paris.")

# Report how many tokens the sentence was split into
token_count = len(sentence)
print("The number of tokens: ", token_count)

# List the individual tokens, one per line
print("The tokens: ")
print(*sentence, sep="\n")

The number of tokens:  5
The tokens: 
We
live
in
Paris
.


In [4]:
!pip install kagglehub



## NLP Tasks
1. Tokenization (토큰화)\
    문장을 단어, 구두점, 기호 등으로 분할\
    토큰화 완료 후 토큰에 메타데이터(품사 태깅) 할당하여 다른 요소들과의 관계를 고려하여 각 토큰을 어떻게 처리할지 결정\

2. Part-of-Speech Tagging (품사 태깅)
    각 토큰에 대해 명사, 대명사, 동사, 부사, … 등의 품사를 할당 \
    메타데이터란 바로 이 품사를 말하는것. 단어들간의 관계를 파악\

3. Dependency Parsing (의존구문분석)
    문장→ 토큰 간 관계 집합 으로 변환. 문장의 문법적 구조를 결정\
    We live in Paris\
    We : 대명사, live의 주어\
    live : 동사, 현재형\
    in Paris : 전치사구\
    in : 전치사\
    Paris : 전치사의 목적어, 단수 고유 명사\

4. Chunk (덩어리)
    서로 관련된 토큰을 하나로 묶어 명사구, 동사구 등의 덩어리를 생성 (ex. New York city)\
    토큰화 → 품사태깅 → 의존구문분석 → 청킹\
    청킹을 통해 분석해야할 토큰 수가 줄어들어 더 빠르고 정확하게 NLP 작업 가능\

5. Lemmatization (표제어 추출)
    단어를 기본형(lemma)으로 변환\
    horses → horse, slept → sleep, …\
    동일한 의미의 단어의 다양한 변형을 하나로 통합 가능. 더 적은 수의 단어로 언어 이해 가능\

6. Stemming (어간 추출)
    표제어 추출과 비슷. rule-based algorithm으로 단어를 기본형태로 변환\
    sleeping → sleep, slept → 불규칙동사 처리 불가\
    그러나 running → runn 같이 비정상적인 변환 발생 가능성 있음\
    어간추출보다 표제어 추출이 더 정확하지만 품사 정보를 고려하므로 계산 비용이 든다.\

7. Named Entity Recognition (개체명 인식)
    텍스트에서 사람, 조직, 장소, 날짜, 화폐 단위 등의 고유명사 식별, 태깅하는 작업\
    Paris → 지명(location)\

8. Entity Linking (개체 연결)
    텍스트 내 개체를 외부 DB와 연결 & 중복, 혼동될 수 있는 개체를 올바르게 식별\
    가령 Bush 라고 했을때 아버지 부시인지 아들 부시인지 개체를 정확하게 연결해야함\

1~6 (토큰화~어간추출)은 NLP 시스템이 텍스트를 이해하기 위한 기본 단계\
7~8 (NER, 개체 연결)은 NLP 시스템이 정보를 검색, 활용할 수 있는 고급 기술


In [5]:
import kagglehub

# Kaggle dataset handle in "<owner>/<dataset>" form
DATASET_HANDLE = "tunguz/200000-jeopardy-questions"

# Download the latest version and remember where the files were placed
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /home/ajh1729/.cache/kagglehub/datasets/tunguz/200000-jeopardy-questions/versions/1


In [16]:
import pandas as pd
import os

# Number of questions to keep when reducing the size of the data
N_QUESTIONS = 1000

# Import Jeopardy Questions.
# The CSV location is derived from `path` (returned by
# kagglehub.dataset_download in the download cell) instead of a hardcoded,
# user-specific cache directory, so the cell works on any machine.
csv_path = os.path.join(path, "JEOPARDY_CSV.csv")
data = pd.read_csv(csv_path)

# Lowercase and strip whitespace from the column names
data.columns = [name.lower().strip() for name in data.columns]

# Reduce size of data. An explicit copy is taken so that adding a column below
# writes to an independent frame rather than to a slice of the full one
# (which triggers pandas' SettingWithCopyWarning).
data = data.iloc[:N_QUESTIONS].copy()

# Run the spaCy pipeline on every question; each cell holds the parsed result
data["question_tokens"] = data["question"].apply(nlp)

In [20]:
# Grab the raw text and the parsed tokens stored under row label 0
example_question = data["question"].loc[0]
example_question_tokens = data["question_tokens"].loc[0]

# Show the raw text of that question under a heading line
print("The first questions is:", example_question, sep="\n")

The first questions is:
For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory


In [21]:
# List every token of the first question, one per line
print("The tokens from the first question are:")
print(*example_question_tokens, sep="\n")

The tokens from the first question are:
For
the
last
8
years
of
his
life
,
Galileo
was
under
house
arrest
for
espousing
this
man
's
theory


In [22]:
# For each token of the first question, show its text, its part-of-speech tag
# and the description spacy.explain gives for that tag
print("Here are the Part-of-speech tags for each token in the first question:")
for tok in example_question_tokens:
    pos_tag = tok.pos_
    print(f"{tok.text} {pos_tag} {spacy.explain(pos_tag)}")

Here are the Part-of-speech tags for each token in the first question:
For ADP adposition
the DET determiner
last ADJ adjective
8 NUM numeral
years NOUN noun
of ADP adposition
his PRON pronoun
life NOUN noun
, PUNCT punctuation
Galileo PROPN proper noun
was AUX auxiliary
under ADP adposition
house NOUN noun
arrest NOUN noun
for ADP adposition
espousing VERB verb
this DET determiner
man NOUN noun
's PART particle
theory NOUN noun


In [23]:
# For each token of the first question, show its text, its dependency label
# and the description spacy.explain gives for that label (None if it has none)
for tok in example_question_tokens:
    dep_label = tok.dep_
    print(f"{tok.text} {dep_label} {spacy.explain(dep_label)}")

For prep prepositional modifier
the det determiner
last amod adjectival modifier
8 nummod numeric modifier
years pobj object of preposition
of prep prepositional modifier
his poss possession modifier
life pobj object of preposition
, punct punctuation
Galileo nsubj nominal subject
was ROOT None
under prep prepositional modifier
house compound compound
arrest pobj object of preposition
for prep prepositional modifier
espousing pcomp complement of preposition
this det determiner
man poss possession modifier
's case case marking
theory dobj direct object


# spaCy, fast.ai, and Hugging face
 오픈 소스 소프트웨어 라이브러리

1. spaCy\
    2015 출시. 2001년 출시된 NLTK(Natural Language Toolkit)이후 대규모 데이터 처리를 염두에 둔 라이브러리\
    트랜스포머 기반 모델 지원, 데이터 라벨링 및 주석 위한 Prodigy라는 도구 제공
    
2. fast.ai\
    Pytorch 기반 오픈 소스 라이브러리\
    사용하기 쉽고 학습에 적합하지만 spaCy, Hugging Face보다는 덜 성숙(mature)한 라이브러리
    * mature: 오랫동안 널리 사용되어 안정적이고 대중 친화적인 상태를 의미
    
3. Hugging face\
    NLP 특화 기업\
    transformer, Pytorch, Tensorflow 기반 라이브러리 지원

In [24]:
# Draw the dependency parse of the first question inline in the notebook
from spacy import displacy

dep_options = {'distance': 120}
displacy.render(
    example_question_tokens,
    style='dep',
    jupyter=True,
    options=dep_options,
)

In [25]:
# Token-by-token view of the example sentence (no chunking applied)
unchunked_doc = nlp("My parents live in New York City.")
for tok in unchunked_doc:
    print(tok.text)

My
parents
live
in
New
York
City
.


In [26]:
# Same example sentence, but iterate over its noun chunks instead of tokens
chunked_doc = nlp("My parents live in New York City.")
for noun_phrase in chunked_doc.noun_chunks:
    print(noun_phrase.text)

My parents
New York City


In [27]:
# Show the lemma of every token in the first question.
# The table is built from a list of (text, lemma) records in a single
# constructor call instead of growing an empty DataFrame row by row with
# .loc and a manual counter.
lemmatization = pd.DataFrame(
    [(token.text, token.lemma_) for token in example_question_tokens],
    columns=["original", "lemmatized"],
)

lemmatization

Unnamed: 0,original,lemmatized
0,For,for
1,the,the
2,last,last
3,8,8
4,years,year
5,of,of
6,his,his
7,life,life
8,",",","
9,Galileo,Galileo


In [28]:
# Run NER on an example sentence and list the entities that were found
example_sentence = (
    "George Washington was an American political leader, "
    "military general, statesman, and Founding Father who served as the "
    "first president of the United States from 1789 to 1797.\n"
)

print(example_sentence)

# One row per entity: text, start/end character offsets, entity label
print("Text Start End Label")
doc = nlp(example_sentence)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

George Washington was an American political leader, military general, statesman, and Founding Father who served as the first president of the United States from 1789 to 1797.

Text Start End Label
George Washington 0 17 PERSON
American 25 33 NORP
first 119 124 ORDINAL
the United States 138 155 GPE
from 1789 to 1797 156 173 DATE


In [29]:
# Highlight the recognized entities of `doc` inline in the notebook
# NOTE(review): 'distance' is documented as a dependency-visualizer option;
# it presumably has no effect with style='ent' — confirm before removing.
ent_options = {'distance': 120}
displacy.render(doc, style='ent', jupyter=True, options=ent_options)

In [31]:
# Import libraries
import requests

# Define Google Knowledge Graph API Result function
def returnGraphResult(query, key, entityType):
    """Look up a PERSON entity with the Google Knowledge Graph Search API.

    Args:
        query: Entity text to search for.
        key: API key sent as the ``key`` request parameter.
        entityType: Entity label; a request is made only for ``"PERSON"``.

    Returns:
        A ``(url, description)`` tuple read from the first result's
        ``detailedDescription`` (``url`` and ``articleBody``).
        ``("no_match", "no_match")`` is returned when ``entityType`` is not
        ``"PERSON"``, when the response contains no results, or when the
        first result lacks a detailed description.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    no_match = ("no_match", "no_match")
    if entityType != "PERSON":
        return no_match

    # Pass the query string via `params` so requests builds and percent-encodes
    # it. The previous backslash-continued f-string embedded the continuation
    # line's leading spaces in the URL and left `query` unescaped.
    resp = requests.get(
        "https://kgsearch.googleapis.com/v1/entities:search",
        params={"query": query, "key": key},
        timeout=10,  # do not hang the notebook on an unresponsive endpoint
    )
    resp.raise_for_status()

    # Parse the body once and tolerate empty or partial results instead of
    # raising IndexError/KeyError.
    items = resp.json().get("itemListElement") or []
    if not items:
        return no_match

    detail = items[0].get("result", {}).get("detailedDescription", {})
    url = detail.get("url")
    description = detail.get("articleBody")
    if url is None or description is None:
        return no_match
    return url, description

In [35]:
import os
from getpass import getpass

# `key` is not defined in any visible cell of this notebook, so the loop below
# raised a NameError on a fresh kernel. Read the Google API key from the
# GOOGLE_API_KEY environment variable and fall back to an interactive prompt;
# the key must never be hardcoded in the notebook.
key = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")

# Print the Knowledge Graph URL and description for each recognized entity
for ent in doc.ents:
    url, description = returnGraphResult(ent.text, key, ent.label_)
    print(ent.text, ent.label_, url, description)

(George Washington, American, first, the United States, from 1789 to 1797)