In [1]:
import nltk
import pandas as pd

In [2]:
# Load Dataset

In [9]:
train = pd.read_csv("input/train.tsv", sep="\t", index_col="PhraseId")

print(train.shape)

train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [12]:
## Stemmer 
문법의 특징으로부터 데이터를 변환

In [13]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
stemmer

<nltk.stem.snowball.SnowballStemmer at 0x1a0c07b9b0>

In [14]:
# KoNLPy (한국어에서 사용 가능)

In [17]:
stemmer.stem('foods')

'food'

In [18]:
stemmer.stem('recommended')

'recommend'

In [20]:
stemmer.stem('studied') 
# stemmer는 단어의 어근 형태로 변형한다. (studi로 변형된 것으로 알 수 있음)
# 과거형이 중요한 정보를 가질 때는 stemmer를 쓰면 안된다 (그런 경우가 있나보다)

'studi'

In [22]:
Phrase = train.loc[2274]["Phrase"]
Phrase

'Highly recommended viewing for its courage , ideas , technical proficiency and great acting .'

In [32]:
words = Phrase.split(" ")

stemmed_words = []
for word in words:
    stemmed_word = stemmer.stem(word)
    stemmed_words.append(stemmed_word)
    print(word, stemmed_word)
    
stemmed_phrase = " ".join(stemmed_words)
stemmed_phrase

# 아래와 같이 표현이 가능하다 (for문 줄이기)
words = Phrase.split(" ")
stemmed_words = [stemmer.stem(w) for w in words]
stemmed_phrase = " ".join(stemmed_words)
stemmed_phrase

Highly high
recommended recommend
viewing view
for for
its it
courage courag
, ,
ideas idea
, ,
technical technic
proficiency profici
and and
great great
acting act
. .


'high recommend view for it courag , idea , technic profici and great act .'

In [40]:
!pip install tqdm



In [41]:
from tqdm import tqdm

In [44]:
# 함수화하여 쉽게 사용하기


def stem_phrase(phrase): 
    words = phrase.split(" ")
    stemmed_words = [stemmer.stem(w) for w in words]
    stemmed_phrase = " ".join(stemmed_words)

    return stemmed_phrase

tqdm.pandas(desc="stemming...")
train["Phrase(Stemmed)"] = train["Phrase"].progress_apply(stem_phrase)

train.head()

stemming...: 100%|██████████| 156060/156060 [00:16<00:00, 9496.45it/s]


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(Stemmed)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,A series of escapades demonstrating the adage ...,1,a seri of escapad demonstr the adag that what ...
2,1,A series of escapades demonstrating the adage ...,2,a seri of escapad demonstr the adag that what ...
3,1,A series,2,a seri
4,1,A,2,a
5,1,series,2,seri


In [39]:
train["Phrase"]

PhraseId
1         A series of escapades demonstrating the adage ...
2         A series of escapades demonstrating the adage ...
3                                                  A series
4                                                         A
5                                                    series
6         of escapades demonstrating the adage that what...
7                                                        of
8         escapades demonstrating the adage that what is...
9                                                 escapades
10        demonstrating the adage that what is good for ...
11                                  demonstrating the adage
12                                            demonstrating
13                                                the adage
14                                                      the
15                                                    adage
16                          that what is good for the goose
17                             

In [37]:
train["Phrase"].apply(stem_phrase)

PhraseId
1         high recommend view for it courag , idea , tec...
2         high recommend view for it courag , idea , tec...
3         high recommend view for it courag , idea , tec...
4         high recommend view for it courag , idea , tec...
5         high recommend view for it courag , idea , tec...
6         high recommend view for it courag , idea , tec...
7         high recommend view for it courag , idea , tec...
8         high recommend view for it courag , idea , tec...
9         high recommend view for it courag , idea , tec...
10        high recommend view for it courag , idea , tec...
11        high recommend view for it courag , idea , tec...
12        high recommend view for it courag , idea , tec...
13        high recommend view for it courag , idea , tec...
14        high recommend view for it courag , idea , tec...
15        high recommend view for it courag , idea , tec...
16        high recommend view for it courag , idea , tec...
17        high recommend view f

## Lemmatizer
결과가 단어로 나온다. (어근X) 
데이터 분석적으로 판단한다. (2개 중 하나만 쓰기도하고 둘다 쓰기도한다) 


In [46]:
# http://www.nltk.org/api/nltk.stem.html
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatizer

<WordNetLemmatizer>

In [48]:
stemmer.stem('feet'), lemmatizer.lemmatize('feet')

('feet', 'foot')

In [50]:
stemmer.stem('studies'), lemmatizer.lemmatize('studies')

('studi', 'study')

In [53]:
stemmer.stem('went'), lemmatizer.lemmatize('went')
#lemmatizer는 단어가 동사인지 명사인지에 따라 결과가 달라진다 

('went', 'went')

In [55]:
# pos === part of speech
stemmer.stem('went'), lemmatizer.lemmatize('went', pos='v') # 동사로 할 경우

('went', 'go')

### Pos Tagger

In [72]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [73]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize

phrase = train.loc[2274]["Phrase"]
phrase

word_tokenize

<function nltk.tokenize.word_tokenize>

In [74]:
word_tokenize(phrase)

['Highly',
 'recommended',
 'viewing',
 'for',
 'its',
 'courage',
 ',',
 'ideas',
 ',',
 'technical',
 'proficiency',
 'and',
 'great',
 'acting',
 '.']

In [89]:
# https://en.wikipedia.org/wiki/Brown_Corpus#Part-of-speech_tags_used
words_pos = pos_tag(word_tokenize(phrase))
words_pos

["_".join(w) for w in words_pos]

['the_DT', 'door_NN', 'is_VBZ', 'already_RB', 'closed_VBN']

In [140]:
# close를 비교해보면 문맥에 따라 동사인지 명사인지 구분이 가능하다
# 이럴 때는 close_VB, close_RB와 같이 하나의 단어로 만들어 count하는 방법도 있다
import numpy as np
import pandas as pd

phrase = "you should closed the door"
words = phrase.split(" ")

stemmed_words = [stemmer.stem(w) for w in words]
stemmed_phrase = " ".join(stemmed_words)

ok = pos_tag(word_tokenize(phrase))
test = [[o[0], o[1]] for o in ok]
test = ["_".join([stemmer.stem(o[0]), o[1]]) for o in ok]
test

['you_PRP', 'should_MD', 'close_VBD', 'the_DT', 'door_NN']

In [163]:
phrase = "the door is already closed"
words = phrase.split(" ")

stemmed_words = [stemmer.stem(w) for w in words]
stemmed_phrase = " ".join(stemmed_words)

def distributionPoS(tag):
    return tag[0]

def post_tag_lemm_v_or_n_phrase(phrase): 
    words = word_tokenize(phrase)
    pos_tag_split_words = pos_tag(words)
    pos_tag_words = ["_".join([lemmatizer.lemmatize(o[0]), distributionPoS(o[1])]) for o in pos_tag_split_words]
    pos_tag_words = " ".join(pos_tag_words)

    return pos_tag_words

def post_tag_lemm_stem_phrase(phrase): 
    words = word_tokenize(phrase)
    pos_tag_split_words = pos_tag(words)
    pos_tag_words = ["_".join([stemmer.stem(lemmatizer.lemmatize(o[0])), distributionPoS(o[1])]) for o in pos_tag_split_words]
    pos_tag_words = " ".join(pos_tag_words)

    return pos_tag_words

print(post_tag_lemm_stem_phrase(phrase))
print(post_tag_lemm_v_or_n_phrase(stemmed_phrase))

the_D door_N is_V alreadi_R close_V
the_D door_N is_V alreadi_V close_R


'the_DT'

In [166]:
test = "A23"

test[0].lower()

'a'