#### 【 자연어 처리용 형태소분석 - NLTK 】
- 단어/어휘사전 추출 
    * 정제
    * 토큰화
    * 불용어/구두점
    * 어근추출 => 표제어/어간추출

[1] 모듈 로딩 및 데이터 준비<hr>

In [48]:
## [1-1] 모듈 로딩
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords 
from string import punctuation
from nltk.tag import pos_tag 
from nltk.stem import WordNetLemmatizer, PorterStemmer 

from collections import Counter 

In [49]:
## [1-2] Data preparation
## Relative path of the input text file that the loading step reads.
DATA_FILE = '../Data/Louis_Jean_Joseph.txt'

In [50]:
## [1-3] Data loading
## Read the whole file into a single string.
## The encoding is passed explicitly because open() otherwise falls back to
## the platform's locale encoding, which makes the result machine-dependent.
## NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open(DATA_FILE, mode='r', encoding='utf-8') as f:
    allData = f.read()

## Sanity check: total number of characters and the first 100 characters
print(f'allData => {len(allData)}개')
print(f'allData => {allData[:100]}')

allData => 11101개
allData => Louis Jean Joseph Leblanc (born January 26, 1991) is a Canadian former professional ice hockey playe


[2] 텍스트 데이터 전처리<hr>
- 기본 정제 :  구두점, 언어별 불용어
- 토큰화 진행 => 토큰 기반 정제 필요할 수 있음!
- 어근/원형 변환 => 어간추출/표제어추출 
- 영어의 경우 대소문자 통일 (소문자로 변환)

In [51]:
## ====================================================================
## POS tagging : done so that morphemes can be separated accurately
##               => performed on a per-token basis
## ====================================================================
text = allData[:100]

## Tag each token with the universal tagset, then keep nouns and verbs only
tk_pos = pos_tag(word_tokenize(text), tagset='universal')
tkList = [tk for tk, pos in tk_pos if pos in ['NOUN', 'VERB']]

print("선택된 태그 : ", tkList)

선택된 태그 :  ['Louis', 'Jean', 'Joseph', 'Leblanc', 'born', 'January', 'is', 'professional', 'ice', 'hockey', 'playe']


In [52]:
## ====================================================================
## [2-1] Preprocessing setup
## ====================================================================
## - Noise filters: every ASCII punctuation character, plus the English
##   stop-word list shipped with NLTK
puncList = [*punctuation]
swList   = stopwords.words("english")

## - Case normalization: fold the whole text to lower case
allData = allData.lower()

In [None]:
## ====================================================================
## [2-2] Tokenization
## -> sentTKList : per-sentence token lists, used for sentence-level encoding
## -> allTKList  : one flat token list, used to build the vocabulary
## ====================================================================
allTKList, sentTKList = [], []

sentList = sent_tokenize(allData)
print(f'sentList: {len(sentList)}개')

wnLem = WordNetLemmatizer()

## Build the noise lookup once. Concatenating swList+puncList inside the loop
## would allocate a new list and scan it linearly for every single token.
noiseSet = set(swList) | set(puncList)
keepPos = ('NOUN', 'VERB')

for sent in sentList:
    wTK = word_tokenize(sent)

    sentTK = []
    tk_pos = pos_tag(wTK, tagset='universal')
    for tk, pos in tk_pos:
        ## Keep only nouns/verbs that are neither stop words nor punctuation
        if pos not in keepPos or tk in noiseSet:
            continue
        ## Only verbs are reduced to their lemma; nouns are kept as tokenized.
        ## NOTE(review): plural/singular nouns therefore stay separate tokens
        ## - confirm this is intended.
        if pos == 'VERB':
            tk = wnLem.lemmatize(tk, pos='v')
        allTKList.append(tk)
        sentTK.append(tk)
    ## Sentences that end up with no usable token are dropped
    if sentTK:
        sentTKList.append(sentTK)

print(f'allTKList => {len(allTKList)}개\n{allTKList}')
print(f'sentTKList  => {len(sentTKList)}개')
## Guard: indexing [-1] would raise IndexError if no sentence produced tokens
if sentTKList:
    print(sentTKList[-1])


sentList: 70개
allTKList => 794개
['jean', 'joseph', 'leblanc', 'january', 'professional', 'ice', 'hockey', 'player', 'centre', 'leblanc', 'play', 'hockey', 'region', 'move', 'states', 'play', 'season', 'lancers', 'states', 'hockey', 'league', 'become', 'rookie', 'year', 'entry', 'draft', 'select', 'montreal', 'canadiens', 'enrol', 'harvard', 'university', 'spend', 'season', 'crimson', 'name', 'ivy', 'league', 'rookie', 'year', 'sign', 'contract', 'canadiens', 'year', 'leblanc', 'join', 'montreal', 'juniors', 'acquire', 'quebec', 'junior', 'hockey', 'league', 'qmjhl', 'playing', 'rights', 'leblanc', 'spend', 'seasons', 'canadiens', 'organization', 'play', 'hockey', 'league', 'ahl', 'affiliate', 'appear', 'hockey', 'league', 'games', 'seasons', 'trade', 'anaheim', 'ducks', 'keep', 'ahl', 'leblanc', 'move', 'europe', 'join', 'hc', 'slovan', 'bratislava', 'hockey', 'league', 'khl', 'play', 'games', 'release', 'appear', 'games', 'hc', 'league', 'retire', 'hockey', 'play', 'ivan', 'hlinka', '

[3] 단어/어휘 사전 생성<hr>

In [59]:
## [3-1] Count token frequencies and order them from most to least frequent
tkCounter  = Counter(allTKList)
allTKDict  = dict(tkCounter)
## most_common() with no argument returns every (token, count) pair sorted by
## count in descending order; ties keep first-seen order.
rAllTKList = tkCounter.most_common()
rAllTKList

[('leblanc', 42),
 ('hockey', 21),
 ('play', 21),
 ('team', 19),
 ('canadiens', 18),
 ('league', 17),
 ('games', 16),
 ('season', 13),
 ('montreal', 11),
 ('points', 11),
 ('year', 10),
 ('score', 10),
 ('draft', 8),
 ('ahl', 8),
 ('goals', 8),
 ('sign', 7),
 ('contract', 7),
 ('junior', 7),
 ('assists', 7),
 ('harvard', 6),
 ('university', 6),
 ('juniors', 6),
 ('tournament', 6),
 ('canada', 6),
 ('goal', 6),
 ('camp', 6),
 ('move', 5),
 ('rookie', 5),
 ('join', 5),
 ('rights', 5),
 ('fail', 5),
 ('hamilton', 5),
 ('record', 5),
 ('players', 5),
 ('states', 4),
 ('entry', 4),
 ('select', 4),
 ('spend', 4),
 ('name', 4),
 ('quebec', 4),
 ('qmjhl', 4),
 ('affiliate', 4),
 ('trade', 4),
 ('ducks', 4),
 ('hc', 4),
 ('medal', 4),
 ('world', 4),
 ('lead', 4),
 ('finish', 4),
 ('bulldogs', 4),
 ('assist', 4),
 ('ice', 3),
 ('player', 3),
 ('seasons', 3),
 ('anaheim', 3),
 ('slovan', 3),
 ('khl', 3),
 ('release', 3),
 ('retire', 3),
 ('gold', 3),
 ('consider', 3),
 ('age', 3),
 ('return', 3),

In [60]:
## [3-2] Build the vocabulary (token -> integer index)
## Index 0 is reserved for out-of-vocabulary tokens under the key 'oov'.
## Tokens are numbered in the order of rAllTKList, so more frequent tokens
## get smaller indices.
vocab = {'oov': 0}
for tk, _ in rAllTKList:
    ## setdefault never overwrites an existing key: if the corpus contains
    ## the literal token 'oov', the reserved index 0 is kept and the next
    ## token does not receive a duplicate index.
    vocab.setdefault(tk, len(vocab))
vocab

{'oov': 0,
 'leblanc': 1,
 'hockey': 2,
 'play': 3,
 'team': 4,
 'canadiens': 5,
 'league': 6,
 'games': 7,
 'season': 8,
 'montreal': 9,
 'points': 10,
 'year': 11,
 'score': 12,
 'draft': 13,
 'ahl': 14,
 'goals': 15,
 'sign': 16,
 'contract': 17,
 'junior': 18,
 'assists': 19,
 'harvard': 20,
 'university': 21,
 'juniors': 22,
 'tournament': 23,
 'canada': 24,
 'goal': 25,
 'camp': 26,
 'move': 27,
 'rookie': 28,
 'join': 29,
 'rights': 30,
 'fail': 31,
 'hamilton': 32,
 'record': 33,
 'players': 34,
 'states': 35,
 'entry': 36,
 'select': 37,
 'spend': 38,
 'name': 39,
 'quebec': 40,
 'qmjhl': 41,
 'affiliate': 42,
 'trade': 43,
 'ducks': 44,
 'hc': 45,
 'medal': 46,
 'world': 47,
 'lead': 48,
 'finish': 49,
 'bulldogs': 50,
 'assist': 51,
 'ice': 52,
 'player': 53,
 'seasons': 54,
 'anaheim': 55,
 'slovan': 56,
 'khl': 57,
 'release': 58,
 'retire': 59,
 'gold': 60,
 'consider': 61,
 'age': 62,
 'return': 63,
 'go': 64,
 'ushl': 65,
 'tie': 66,
 'september': 67,
 'june': 68,
 'mak

[4] 인코딩 : 문장 ==> 수치화 <hr>

In [None]:
## Holds one list of integer indices per sentence
sentEncodeList = []

## Integer-encode every sentence token by token
for sent in sentTKList:
    ## Tokens missing from the vocabulary fall back to index 0
    sentEncode = [vocab.get(tk, 0) for tk in sent]

    ## Store the encoded sentence
    sentEncodeList.append(sentEncode)


[74, 117, 1, 118, 119, 50, 2, 51]
[120, 1, 8, 2, 75, 52, 33, 12, 7, 76, 33, 2, 5, 121, 27, 11]
[34, 13, 35, 9, 4]
[77, 19, 20, 36, 7, 78, 37, 79, 5, 27, 11, 21, 16, 4, 11, 1, 80, 9, 22, 122, 38, 17, 2, 5, 39, 12, 28]
[1, 36, 53, 4, 123, 12, 2, 5, 14, 40, 81, 2, 5, 6, 53, 41, 54, 42, 124, 14]
[1, 52, 125, 82, 43, 55, 83, 2, 5, 56, 8, 6, 84]
[81, 6, 43, 5, 126, 2]
[8, 85, 86, 23, 24, 57, 44, 45, 17, 50, 2, 87, 127, 24, 128, 129, 44]
[1, 58, 13, 130, 59, 88, 131, 2, 60]
[89, 1, 132, 90, 91, 1, 133, 92, 9, 90, 1, 134, 135, 136, 137, 45, 138, 9, 139, 140, 141, 142, 143, 144, 145, 9, 146]
[91, 147, 148]
[93, 149, 74, 150, 2]
[151, 152, 153, 154, 155, 38, 1, 156, 60, 157, 158, 159]
[160, 8, 161, 2, 23, 94, 2, 95]
[12, 94, 162, 163, 164, 165, 166, 60, 80, 96, 93, 97, 167, 3, 168, 9]
[8, 53, 97, 46, 5, 29, 169]
[1, 35, 170, 38, 17, 2, 5, 39, 13]
[171, 172, 20, 173, 76, 33, 2, 5, 61, 36, 7, 12, 6, 174, 15, 18, 10]
[46, 3, 10, 30, 62, 175, 61, 25, 176, 62, 15, 1, 30, 62]
[177, 178, 7, 61, 37, 5, 