<a href="https://colab.research.google.com/github/abs-git/NLP/blob/main/English_Clean%26Normalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Data Load

In [2]:
# glue data load

path = '/content/gdrive/MyDrive/Colab Notebooks/NLP'

with open(path + '/glue_train.txt', 'r') as f:
  train_sentences = f.readlines()

for i, sen in enumerate(train_sentences):
  train_sentences[i] = sen.rstrip("\n")

with open(path + '/glue_test.txt', 'r') as f:
  test_sentences = f.readlines()

for i, sen in enumerate(test_sentences):
  test_sentences[i] = sen.rstrip("\n")

print("train : {}".format(len(train_sentences)))
print("test : {}".format(len(test_sentences)))

print(train_sentences[:2])
print(test_sentences[:2])


train : 8551
test : 1063
["Our friends won't buy this analysis, let alone the next one we propose.", "One more pseudo generalization and I'm giving up."]
['Bill whistled past the house.', 'The car honked its way down the road.']


## Normalization (정규화)

In [44]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
# Stemming (어간 추출) & Lemmatization (표제어 추출)

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

tokens = word_tokenize(train_sentences[0])

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


stem_tokens = [stemmer.stem(t) for t in tokens]
lem_tokens = [lemmatizer.lemmatize(t) for t in tokens]


print("tokens    : {}".format(tokens))
print("stem      : {}".format(stem_tokens))
print("lemmatize : {}".format(lem_tokens))




tokens    : ['Our', 'friends', 'wo', "n't", 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']
stem      : ['our', 'friend', 'wo', "n't", 'buy', 'thi', 'analysi', ',', 'let', 'alon', 'the', 'next', 'one', 'we', 'propos', '.']
lemmatize : ['Our', 'friend', 'wo', "n't", 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']
have


In [42]:
# print(lemmatizer.lemmatize('has', 'v'))          # 품사를 지정해주면 정확한 표제어를 얻을 수 있다.

from nltk import pos_tag

def lemma(pos_list):
  ''' 
  Lemmatize 함수가 입력받는 품사는 동사(v), 형용사(a), 명사(n), 부사(r)이다. 
  nltk.pos_tag로 반환 받는 형용사는 J이기 때문에 Lemmatize를 위해 n으로 pos를 변경한다.
  '''

  lemmatizer = WordNetLemmatizer()

  lemma_list = []
  for token, pos in pos_list:

    if pos[0] in ['V', 'J', 'N', 'R']:

      if pos[0] == 'J':
        pos = 'a'
      else :
        pos = pos.lower()

      lemma_token = lemmatizer.lemmatize(token, pos[0])      
      lemma_list.append(lemma_token)

  return lemma_list


pos_tokens = pos_tag(tokens)

my_lemm_tokens = lemma(pos_tokens)

print(pos_tokens)
print(my_lemm_tokens)


[('Our', 'PRP$'), ('friends', 'NNS'), ('wo', 'MD'), ("n't", 'RB'), ('buy', 'VB'), ('this', 'DT'), ('analysis', 'NN'), (',', ','), ('let', 'VB'), ('alone', 'RB'), ('the', 'DT'), ('next', 'JJ'), ('one', 'NN'), ('we', 'PRP'), ('propose', 'VBP'), ('.', '.')]
['friend', "n't", 'buy', 'analysis', 'let', 'alone', 'next', 'one', 'propose']


## Clean (정제)

In [46]:
# Clean (Stopword) / After lemmatization

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

no_stopword = []
for t in lem_tokens:
  if t not in stop_words:
    no_stopword.append(t)

print('lemma tokens     : {}'.format(lem_tokens))
print('removed stopword : {}'.format(no_stopword))


lemma tokens     : ['Our', 'friend', 'wo', "n't", 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']
removed stopword : ['Our', 'friend', 'wo', "n't", 'buy', 'analysis', ',', 'let', 'alone', 'next', 'one', 'propose', '.']
