In [13]:
# English text-analysis library (NLTK)
import nltk  

In [53]:
# Sample text with a contraction ("I'm") and dotted abbreviations ("Ph.D." / "Ph.D"),
# reused by the tokenizer cells below to compare how each one splits it.
corpus = "I'm actively looking for Ph.D. students. and you are a Ph.D student."

### word_tokenize 를 활용한 토큰화

In [54]:
from nltk.tokenize import word_tokenize
# Tokenize the sample text; per the output below, "I'm" becomes 'I' + "'m"
# while 'Ph.D.' and 'Ph.D' are each kept as a single token.
word_tokenize(corpus)

['I',
 "'m",
 'actively',
 'looking',
 'for',
 'Ph.D.',
 'students',
 '.',
 'and',
 'you',
 'are',
 'a',
 'Ph.D',
 'student',
 '.']

### WordPunctTokenizer 를 활용한 토큰화

In [61]:
# Download the NLTK 'punkt' data package (the log below reports it is already up to date).
# NOTE(review): word_tokenize is already called in an earlier cell; presumably this
# download belongs before that first use — confirm and move if so.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Han\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [62]:
from nltk.tokenize import WordPunctTokenizer  
# Tokenize the same sample text; per the output below, punctuation is split off as
# separate tokens ("I'm" -> 'I', "'", 'm' and 'Ph.D.' -> 'Ph', '.', 'D', '.').
WordPunctTokenizer().tokenize(corpus)  

['I',
 "'",
 'm',
 'actively',
 'looking',
 'for',
 'Ph',
 '.',
 'D',
 '.',
 'students',
 '.',
 'and',
 'you',
 'are',
 'a',
 'Ph',
 '.',
 'D',
 'student',
 '.']

### TreebankWordTokenizer 를 활용한 토큰화

In [46]:
from nltk.tokenize import TreebankWordTokenizer
# Tokenize the sample text with TreebankWordTokenizer.
# NOTE(review): the recorded output below ends after the first sentence, and this
# cell's execution count (46) is lower than that of the cell defining `corpus` (53),
# so the output was presumably produced from an earlier value of `corpus` —
# re-run the cell to confirm.
TreebankWordTokenizer().tokenize(corpus)

['I', "'m", 'actively', 'looking', 'for', 'Ph.D.', 'students', '.']

### sent_tokenize 를 활용한 문장 토큰화

In [52]:
from nltk.tokenize import sent_tokenize
text="I am actively looking for Ph.D. students. and you are a Ph.D student."
# Split the text into sentences; per the output below, the periods inside 'Ph.D.'
# are not treated as sentence boundaries, so two sentences are returned.
sent_tokenize(text)

['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']

In [58]:
### WordNetLemmatizer 를 활용한 표제어 추출

In [64]:
# Download the 'wordnet' data package (the lemma dictionary used for lemmatization below)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Han\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [65]:
from nltk.stem import WordNetLemmatizer

# Lemmatizer instance; the following cells call `n.lemmatize` again with a POS tag.
n = WordNetLemmatizer()
words = [
    'policy', 'doing', 'organization', 'have', 'going', 'love',
    'lives', 'fly', 'dies', 'watched', 'has', 'starting',
]
# Lemmatize every word without passing a part-of-speech argument; per the output
# below this yields odd results such as 'dies' -> 'dy' and 'has' -> 'ha'.
[n.lemmatize(word) for word in words]

['policy',
 'doing',
 'organization',
 'have',
 'going',
 'love',
 'life',
 'fly',
 'dy',
 'watched',
 'ha',
 'starting']

In [70]:
# Lemmatize 'dies' as a verb (second argument 'v'); the output below is 'die'
n.lemmatize('dies', 'v')

'die'

In [69]:
# Lemmatize 'has' as a verb (second argument 'v'); the output below is 'have'
n.lemmatize('has', 'v')

'have'

### PorterStemmer 를 활용한 어간 추출

In [72]:
from nltk.stem import PorterStemmer
# Stemmer instance reused by the next cell.
s = PorterStemmer()
text="This was not the map we found in Billy Bones's chest, but an accurate copy, complete in all things--names and heights and soundings--with the single exception of the red crosses and the written notes."
# Tokenize the passage first; stemming in the next cell is applied token by token.
words=word_tokenize(text)
print(words)

['This', 'was', 'not', 'the', 'map', 'we', 'found', 'in', 'Billy', 'Bones', "'s", 'chest', ',', 'but', 'an', 'accurate', 'copy', ',', 'complete', 'in', 'all', 'things', '--', 'names', 'and', 'heights', 'and', 'soundings', '--', 'with', 'the', 'single', 'exception', 'of', 'the', 'red', 'crosses', 'and', 'the', 'written', 'notes', '.']


In [73]:
# Apply the Porter stemmer to every token; per the output below, results are
# lower-cased and are not always dictionary words (e.g. 'This' -> 'thi', 'copy' -> 'copi').
[s.stem(w) for w in words]

['thi',
 'wa',
 'not',
 'the',
 'map',
 'we',
 'found',
 'in',
 'billi',
 'bone',
 "'s",
 'chest',
 ',',
 'but',
 'an',
 'accur',
 'copi',
 ',',
 'complet',
 'in',
 'all',
 'thing',
 '--',
 'name',
 'and',
 'height',
 'and',
 'sound',
 '--',
 'with',
 'the',
 'singl',
 'except',
 'of',
 'the',
 'red',
 'cross',
 'and',
 'the',
 'written',
 'note',
 '.']

In [75]:
### LancasterStemmer 를 활용한 어간 추출

In [76]:
from nltk.stem import LancasterStemmer

# Lancaster stemmer instance for this cell.
l = LancasterStemmer()
words = [
    'policy', 'doing', 'organization', 'have', 'going', 'love',
    'lives', 'fly', 'dies', 'watched', 'has', 'starting',
]
# Stem the same word list that was lemmatized earlier; per the output below the
# stems are truncated forms such as 'organization' -> 'org' and 'have' -> 'hav'.
[l.stem(word) for word in words]

['policy',
 'doing',
 'org',
 'hav',
 'going',
 'lov',
 'liv',
 'fly',
 'die',
 'watch',
 'has',
 'start']

In [3]:
# Two sample lines of text.
# NOTE(review): not referenced by any other cell visible here — confirm whether
# it is still needed.
bards_words =["The fool doth think he is wise,",
              "but the wise man knows himself to be a fool"]

### 불용어 제거

In [81]:
# Stop-word corpus reader, plus a download of the 'stopwords' data package it reads
# (the log below shows the archive being unzipped).
from nltk.corpus import stopwords  
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Han\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [82]:
# Peek at the first 10 English stop words; per the output below they are all lower-case.
stopwords.words('english')[:10]  

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [83]:
example = "Family is not an important thing. It's everything."
# Build the stop-word set once so each membership test below is a set lookup.
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example)

# Keep only the tokens that are not stop words.
# NOTE: the comparison is case-sensitive and the stop-word list is lower-case, so a
# capitalized token such as 'It' is kept (see the printed result below).
result = [w for w in word_tokens if w not in stop_words]

# Show the tokens before and after filtering for comparison.
print(word_tokens)
print(result)

['Family', 'is', 'not', 'an', 'important', 'thing', '.', 'It', "'s", 'everything', '.']
['Family', 'important', 'thing', '.', 'It', "'s", 'everything', '.']
