<a href="https://colab.research.google.com/github/YounSooKimTech/NLP_Repo/blob/main/Preprocessing_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# preprocessing

- tokenization
- stopwords/punc removal
- normalization via stemming(어간추출) and lemmatization(표제어 추출)
- POS tagging (품사태깅, 문맥파악)

In [3]:
! pip install nltk

import nltk

nltk.download("punkt")
nltk.download("webtext")
nltk.download("wordnet")
nltk.download("stopwrods")
nltk.download("average_perceptron_tagger")
nltk.download("omw-1.4")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Unzipping corpora/webtext.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Error loading stopwrods: Package 'stopwrods' not found in
[nltk_data]     index
[nltk_data] Error loading average_perceptron_tagger: Package
[nltk_data]     'average_perceptron_tagger' not found in index
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# nltk.tokenize

In [15]:
para = """Hello everyone.
 It's good to see you. 
 Let's start out text mining class!
 """

from nltk.tokenize import sent_tokenize, word_tokenize

sents = sent_tokenize(para)
print([sent for sent in sents], "\n")

words = word_tokenize(para)

print([word+"\n" for word in words])

['Hello everyone.', "It's good to see you.", "Let's start out text mining class!"] 

['Hello\n', 'everyone\n', '.\n', 'It\n', "'s\n", 'good\n', 'to\n', 'see\n', 'you\n', '.\n', 'Let\n', "'s\n", 'start\n', 'out\n', 'text\n', 'mining\n', 'class\n', '!\n']


In [33]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer("[\w]{3,}")

tokenizer.tokenize("sorry, I can;t go there")

['sorry', 'can', 'there']

In [24]:
# Regular expression을 사용하는 

import re

print(re.findall("[abc]", "How are you boys?"))

print(re.findall("[0-9]", "3a7b5c9d"))

# pattern \w = [a-zA-Z0-9] 공백이 포함되지 않는다
# \w + 면 공백 쉼표로 구분되는 단어 찾을수 있음

print(re.findall("[\w]+", "How are you, boys?"))

print(re.findall("[o]{2,4}", "Oh, how are yoou? booooy"))

['a', 'b']
['3', '7', '5', '9']
['How', 'are', 'you', 'boys']
['oo', 'oooo']


In [36]:
# noise and stopwords
# 대개 영어는 길이가 3미만인 단어는 삭제하는것이 일반적
import nltk

nltk.download("stopwords")

from nltk.corpus import stopwords

english_stop = stopwords.words("english")
english_stop[:10]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

# nltk.corpus stopwords

In [44]:
# tokenize --> remove stopwords and len(word)>=3

from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

text1 = "sorry, I couldn't go to the movie yesterday"

eng_stopwords = stopwords.words("english")
eng_stopwords = set(eng_stopwords)

tokenizer = RegexpTokenizer("[\w]+")


words = tokenizer.tokenize(text1)
print([word.lower() for word in words if word not in eng_stopwords and len(word)>3])



['sorry', 'movie', 'yesterday']


In [43]:
my_stopwords = ["I", "Go", "to"]

print([word for word in words if word not in my_stopwords and len(word)>3])

['sorry', 'couldn', 'movie', 'yesterday']


# nltk.stem

In [45]:
# Porter Stemmer

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

stemmer.stem("cooking cookery cookies")

'cooking cookery cooki'

In [72]:
text1 = "Hello, everyone. It's good to see you. Let's start our text mining class!"

from nltk.tokenize import word_tokenize

tokens = word_tokenize(text1.lower())
print(tokens)

from nltk.corpus import stopwords

from nltk.stem import PorterStemmer, LancasterStemmer

stemmer = PorterStemmer()

print([stemmer.stem(token) for token in tokens])

'''
stemmerL = LancasterStemmer()
print([stemmerL.stem(token) for token in tokens])
'''

['hello', ',', 'everyone', '.', 'it', "'s", 'good', 'to', 'see', 'you', '.', 'let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']
['hello', ',', 'everyon', '.', 'it', "'s", 'good', 'to', 'see', 'you', '.', 'let', "'s", 'start', 'our', 'text', 'mine', 'class', '!']


'\nstemmerL = LancasterStemmer()\nprint([stemmerL.stem(token) for token in tokens])\n'

In [71]:
# Word Net Lemma

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cooking"))
print(lemmatizer.lemmatize("cooking", pos="v"))


print(lemmatizer.lemmatize("cookbooks"))

cooking
cook
cookbook


In [82]:
# nltk.pos_tag

import nltk
from nltk.tokenize import word_tokenize
#nltk.download("averaged_perceptron_tagger")

text = '''
Hello, everyone.
It's gooe to see you
Let's start out text mining class :>
'''

tokens = word_tokenize(text.lower())

print(nltk.pos_tag(tokens))
print("\n")


[('hello', 'NN'), (',', ','), ('everyone', 'NN'), ('.', '.'), ('it', 'PRP'), ("'s", 'VBZ'), ('gooe', 'JJ'), ('to', 'TO'), ('see', 'VB'), ('you', 'PRP'), ('let', 'VB'), ("'s", 'POS'), ('start', 'VB'), ('out', 'RP'), ('text', 'NN'), ('mining', 'NN'), ('class', 'NN'), (':', ':'), ('>', 'NN')]




In [83]:

for word, tag in nltk.pos_tag(tokens):
  if tag in ["NN", "VB"]:
    print(word, tag)

hello NN
everyone NN
see VB
let VB
start VB
text NN
mining NN
class NN
> NN


In [84]:
! pip install KoNLPy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting KoNLPy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.3 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.4.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[K     |████████████████████████████████| 465 kB 65.3 MB/s 
Installing collected packages: JPype1, KoNLPy
Successfully installed JPype1-1.4.1 KoNLPy-0.6.0
