In [1]:
from konlpy.tag import Okt

In [2]:
# Create the single Okt tagger instance that the later cells reuse via
# okt.morphs() to split Korean text into morphemes.
okt = Okt()

```python

# Smoke test: run Okt morpheme analysis on one Korean sentence and print the result.
print(okt.morphs("도커에서 한국어 형태소 분석이 될까요?"))
```

In [3]:
import nltk

# Fetch the NLTK data packages used by this notebook. Each call logs its
# progress; when a package is already present it only reports "up-to-date".
# Presumably punkt / punkt_tab back the tokenize.* calls and the
# averaged_perceptron_tagger* packages back tag.pos_tag used below —
# TODO confirm against the installed NLTK version.
# The return value of the final call is what the cell displays as its output.
nltk.download("punkt")
nltk.download('punkt_tab')
nltk.download("averaged_perceptron_tagger")
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /home/developer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/developer/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/developer/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/developer/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [4]:
from nltk import tokenize

영문 토큰화

In [5]:

# Split one English sentence into word-level and sentence-level tokens with NLTK.
sentence = "Those who can imagine anything, can create the impossible."

word_tokens = tokenize.word_tokenize(sentence)
sent_tokens = tokenize.sent_tokenize(sentence)

# One token list per output line: words first, then sentences.
print(word_tokens, sent_tokens, sep="\n")

['Those', 'who', 'can', 'imagine', 'anything', ',', 'can', 'create', 'the', 'impossible', '.']
['Those who can imagine anything, can create the impossible.']


영문 품사 태깅

In [6]:
from nltk import tag
from nltk import tokenize


# Sample sentence to tag (same text as in the tokenization cell).
sentence = "Those who can imagine anything, can create the impossible."

# Tokenize into words first; pos_tag is given the token list, not the raw string.
word_tokens = tokenize.word_tokenize(sentence)
# Result is a list of (token, tag) pairs, as shown in the printed output.
pos = tag.pos_tag(word_tokens)

print(pos)

[('Those', 'DT'), ('who', 'WP'), ('can', 'MD'), ('imagine', 'VB'), ('anything', 'NN'), (',', ','), ('can', 'MD'), ('create', 'VB'), ('the', 'DT'), ('impossible', 'JJ'), ('.', '.')]


In [7]:
import spacy

# Load the small English spaCy pipeline; `nlp` is reused by later cells.
nlp = spacy.load("en_core_web_sm")
sentence = "Those who can imagine anything, can create the impossible."
doc = nlp(sentence)

# One aligned row per token: coarse tag (pos_), fine-grained tag (tag_), then the text.
tagged_lines = [
    f"[{token.pos_:5} - {token.tag_:3}] : {token.text}"
    for token in doc
]
print("\n".join(tagged_lines))

[PRON  - DT ] : Those
[PRON  - WP ] : who
[AUX   - MD ] : can
[VERB  - VB ] : imagine
[PRON  - NN ] : anything
[PUNCT - ,  ] : ,
[AUX   - MD ] : can
[VERB  - VB ] : create
[DET   - DT ] : the
[ADJ   - JJ ] : impossible
[PUNCT - .  ] : .


In [8]:
# English/Korean sentence pairs used as a tiny parallel corpus.
# NOTE: several English entries use the typographic apostrophe (’) while
# "You're welcome." uses the ASCII one ('). The strings are kept exactly as
# authored, so contraction tokens built from them differ in spelling.
_EN_KO_PAIRS = (
    ("Hello.", "안녕하세요."),
    ("How are you?", "잘 지내세요?"),
    ("I am a student.", "저는 학생입니다."),
    ("What is your name?", "당신의 이름은 무엇인가요?"),
    ("My name is John.", "제 이름은 존입니다."),
    ("Nice to meet you.", "만나서 반갑습니다."),
    ("Good morning!", "좋은 아침이에요!"),
    ("Good night.", "안녕히 주무세요."),
    ("Where are you going?", "어디 가세요?"),
    ("I’m going home.", "집에 가는 중이에요."),
    ("See you later.", "나중에 봐요."),
    ("Thank you very much.", "정말 감사합니다."),
    ("You're welcome.", "천만에요."),
    ("Excuse me.", "실례합니다."),
    ("I’m sorry.", "죄송합니다."),
    ("No problem.", "괜찮아요."),
    ("Do you speak English?", "영어 할 줄 아세요?"),
    ("I speak a little Korean.", "한국어 조금 할 줄 알아요."),
    ("How much is this?", "이거 얼마예요?"),
    ("Where is the restroom?", "화장실 어디예요?"),
    ("Can you help me?", "도와주실 수 있나요?"),
    ("I don’t understand.", "이해하지 못했어요."),
    ("Please speak slowly.", "천천히 말씀해 주세요."),
    ("I’m hungry.", "배고파요."),
    ("I’m tired.", "피곤해요."),
    ("What time is it?", "지금 몇 시예요?"),
    ("Today is Monday.", "오늘은 월요일이에요."),
    ("It’s raining.", "비가 오고 있어요."),
    ("I like music.", "저는 음악을 좋아해요."),
    ("Let’s go together.", "같이 가요."),
    ("Have a nice day!", "좋은 하루 되세요!"),
    ("Goodbye!", "안녕히 가세요!"),
)

# Expand each (en, ko) tuple into the {"en": ..., "ko": ...} record shape.
parallel_corpus = [
    {"en": en_text, "ko": ko_text} for en_text, ko_text in _EN_KO_PAIRS
]

In [14]:
# Collect the distinct tokens seen on each side of the parallel corpus:
# Korean via Okt morphemes, English via the texts of the spaCy tokens.
ko_tokens = set()
en_tokens = set()

for pair in parallel_corpus:
    ko_tokens |= set(okt.morphs(pair["ko"]))
    en_tokens |= {token.text for token in nlp(pair["en"])}

In [16]:
# Display the set of distinct English tokens collected from the corpus.
en_tokens

{'!',
 "'re",
 '.',
 '?',
 'Can',
 'Do',
 'English',
 'Excuse',
 'Good',
 'Goodbye',
 'Have',
 'Hello',
 'How',
 'I',
 'It',
 'John',
 'Korean',
 'Let',
 'Monday',
 'My',
 'Nice',
 'No',
 'Please',
 'See',
 'Thank',
 'Today',
 'What',
 'Where',
 'You',
 'a',
 'am',
 'are',
 'day',
 'do',
 'go',
 'going',
 'help',
 'home',
 'hungry',
 'is',
 'it',
 'later',
 'like',
 'little',
 'me',
 'meet',
 'morning',
 'much',
 'music',
 'name',
 'nice',
 'night',
 'n’t',
 'problem',
 'raining',
 'restroom',
 'slowly',
 'sorry',
 'speak',
 'student',
 'the',
 'this',
 'time',
 'tired',
 'to',
 'together',
 'understand',
 'very',
 'welcome',
 'you',
 'your',
 '’m',
 '’s'}

In [15]:
 ko_tokens

{'!',
 '.',
 '?',
 '가',
 '가는',
 '가세',
 '가요',
 '감사합니다',
 '같이',
 '거',
 '괜찮아요',
 '나중',
 '는',
 '당신',
 '도와주실',
 '되세요',
 '만나서',
 '말씀',
 '몇',
 '못',
 '무엇',
 '반갑습니다',
 '배고파요',
 '봐요',
 '비',
 '세',
 '수',
 '시',
 '실례',
 '아세요',
 '아침',
 '안녕하세요',
 '안녕히',
 '알아요',
 '어디',
 '얼마',
 '에',
 '에요',
 '영어',
 '예요',
 '오고',
 '오늘',
 '요',
 '월요일',
 '은',
 '을',
 '음악',
 '의',
 '이',
 '이름',
 '이에요',
 '이해',
 '인가요',
 '입니다',
 '있나요',
 '있어요',
 '잘',
 '저',
 '정말',
 '제',
 '조금',
 '존',
 '좋아해요',
 '좋은',
 '죄송합니다',
 '주무',
 '주세요',
 '줄',
 '중이',
 '지금',
 '지내세요',
 '집',
 '천만',
 '천천히',
 '피곤해요',
 '하루',
 '하지',
 '학생',
 '한국어',
 '할',
 '합니다',
 '해',
 '했어요',
 '화장실'}