- 케라스 토크나이저를 통해 Bag of Words

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer

sentence = ["John likes to watch movies. Mary likes movies too! Mary also likes to watch football games."]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentence) # 단어장 생성
print(tokenizer.word_index) # 각 단어에 대한 인코딩 결과 출력
vocab_size = len(tokenizer.word_index) + 1  # 0번 index는  padding용

bow = dict(tokenizer.word_counts) # 각 단어와 각 단어의 빈도를 bow에 저장

print("Bag of Words :", bow) # bow 출력
print('단어장(Vocabulary)의 크기 :', len(tokenizer.word_counts)) # 중복을 제거한 단어들의 개수

{'likes': 1, 'to': 2, 'watch': 3, 'movies': 4, 'mary': 5, 'john': 6, 'too': 7, 'also': 8, 'football': 9, 'games': 10}
Bag of Words : {'john': 1, 'likes': 3, 'to': 2, 'watch': 2, 'movies': 2, 'mary': 2, 'too': 1, 'also': 1, 'football': 1, 'games': 1}
단어장(Vocabulary)의 크기 : 10


- scikit-learn CountVectorizer 활용

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

sentence = ["John likes to watch movies. Mary likes movies too! Mary also likes to watch football games."]

vector = CountVectorizer()
bow = vector.fit_transform(sentence).toarray()

print('Bag of Words : ', bow) # 코퍼스로부터 각 단어의 빈도수를 기록한다.
print('각 단어의 인덱스 :', vector.vocabulary_) # 각 단어의 인덱스가 어떻게 부여되었는지를 보여준다.

print('단어장(Vocabulary)의 크기 :', len(vector.vocabulary_))

Bag of Words :  [[1 1 1 1 3 2 2 2 1 2]]
각 단어의 인덱스 : {'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}
단어장(Vocabulary)의 크기 : 10


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'John likes to watch movies',
    'Mary likes movies too',
    'Mary also likes to watch football games',    
]
vector = CountVectorizer()

print('Bag of Words : ', vector.fit_transform(corpus).toarray()) # 코퍼스로부터 각 단어의 빈도수를 기록.
print('각 단어의 인덱스 :', vector.vocabulary_) # 각 단어의 인덱스가 어떻게 부여되었는지를 보여준다.
print('단어장(Vocabulary)의 크기 :', len(vector.vocabulary_))

Bag of Words :  [[0 0 0 1 1 0 1 1 0 1]
 [0 0 0 0 1 1 1 0 1 0]
 [1 1 1 0 1 1 0 1 0 1]]
각 단어의 인덱스 : {'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}
단어장(Vocabulary)의 크기 : 10


**원-핫 벡터**

In [None]:
def one_hot_encoding(word, word2index):
       one_hot_vector = [0]*(len(word2index)) # 0으로 초기화
       index = word2index[word]
       one_hot_vector[index-1] = 1
       return one_hot_vector

In [2]:
# 케라스를 통한 원-핫 인코딩(one-hot encoding)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

text = [['강아지', '고양이', '강아지'],['애교', '고양이'], ['컴퓨터', '노트북']]
t = Tokenizer()
t.fit_on_texts(text)
print(t.word_index) # 각 단어에 대한 인코딩 결과 출력.

vocab_size = len(t.word_index) + 1

sub_text = ['강아지', '고양이', '강아지', '컴퓨터']
encoded = t.texts_to_sequences([sub_text])
print(encoded)

one_hot = to_categorical(encoded, num_classes = vocab_size)
print(one_hot)

{'강아지': 1, '고양이': 2, '애교': 3, '컴퓨터': 4, '노트북': 5}
[[1, 2, 1, 4]]
[[[0. 1. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0.]]]


**DTM(Document-Term Matrix)**
문서의 수가 많아지면 많아질수록, 통합 단어장의 크기도 커지게 되어서 
DTM은 결국 문서 벡터와 단어 벡터 모두 대부분의 값이 0이 되는 성질

BoW를 기반으로 문서를 비교

DTM에서 문서의 수와 단어의 수가 계속 늘어날수록, 행과 열은 대부분의 값이 0을 가진다는 특징이 있습니다. 이는 저장 공간 측면에서 낭비
단어의 빈도에만 집중하는 방법 자체의 한계

**TF-IDF**
DTM(TF)의 각 단어에 IDF 값 곱하기 -> TF-IDF 행렬
<단점>
- 토픽 한계
- 유사어 처리 안됨
--> LSA : 텍스트 데이터에서 숨겨진 주제(Topic)를 추출하는 데 사용되는 차원 축소 및 주제 모델링 기법
--> Word Embeddings 
--> ConceptNet

**문서유사도**
- cosine similarity with Bag of words
- cosine similarity with TF-IDF Bag of words

In [4]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

doc1 = np.array([0,1,1,1]) # 문서1 벡터
doc2 = np.array([1,0,1,1]) # 문서2 벡터
doc3 = np.array([2,0,2,2]) # 문서3 벡터

def cos_sim(A, B):
    return dot(A, B)/(norm(A)*norm(B))

print('{:.2f}'.format(cos_sim(doc1, doc2))) #문서1과 문서2의 코사인 유사도
print('{:.2f}'.format(cos_sim(doc1, doc3))) #문서1과 문서3의 코사인 유사도
print('{:.2f}'.format(cos_sim(doc2, doc3))) #문서2과 문서3의 코사인 유사도

0.67
0.67
1.00


**TF-IDF**

In [7]:
from math import log
import pandas as pd

docs = [
  'John likes to watch movies and Mary likes movies too',
  'James likes to watch TV',
  'Mary also likes to watch football games',  
]

vocab = list(set(w for doc in docs for w in doc.split()))
vocab.sort()
print('단어장의 크기 :', len(vocab))
print(vocab)

N = len(docs) # 총 문서의 수

def tf(t, d):
    return d.count(t)
 
def idf(t):
    df = 0
    for doc in docs:
        df += t in doc    
    return log(N/(df + 1)) + 1
 
def tfidf(t, d):
    return tf(t,d)* idf(t)

# TF
result = []
for i in range(N): # 각 문서에 대해서 아래 명령을 수행
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        
        result[-1].append(tf(t, d))
        
tf_ = pd.DataFrame(result, columns = vocab)
print('TF:', tf_)

#IDF
result = []
for j in range(len(vocab)):
    t = vocab[j]
    result.append(idf(t))

idf_ = pd.DataFrame(result, index = vocab, columns=["IDF"])
idf_
print('IDF:', idf_)

#TF-IDF
result = []
for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        
        result[-1].append(tfidf(t,d))

tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_

단어장의 크기 : 13
['James', 'John', 'Mary', 'TV', 'also', 'and', 'football', 'games', 'likes', 'movies', 'to', 'too', 'watch']
TF:    James  John  Mary  TV  also  and  football  games  likes  movies  to  too  \
0      0     1     1   0     0    1         0      0      2       2   2    1   
1      1     0     0   1     0    0         0      0      1       0   1    0   
2      0     0     1   0     1    0         1      1      1       0   1    0   

   watch  
0      1  
1      1  
2      1  
IDF:                IDF
James     1.405465
John      1.405465
Mary      1.000000
TV        1.405465
also      1.405465
and       1.405465
football  1.405465
games     1.405465
likes     0.712318
movies    1.405465
to        0.712318
too       1.405465
watch     0.712318


Unnamed: 0,James,John,Mary,TV,also,and,football,games,likes,movies,to,too,watch
0,0.0,1.405465,1.0,0.0,0.0,1.405465,0.0,0.0,1.424636,2.81093,1.424636,1.405465,0.712318
1,1.405465,0.0,0.0,1.405465,0.0,0.0,0.0,0.0,0.712318,0.0,0.712318,0.0,0.712318
2,0.0,0.0,1.0,0.0,1.405465,0.0,1.405465,1.405465,0.712318,0.0,0.712318,0.0,0.712318


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
  'John likes to watch movies and Mary likes movies too',
  'James likes to watch TV',
  'Mary also likes to watch football games',  
]

tfidfv = TfidfVectorizer().fit(corpus)
vocab = list(tfidfv.vocabulary_.keys()) # 단어장을 리스트로 저장
vocab.sort() # 단어장을 알파벳 순으로 정렬

# TF-IDF 행렬에 단어장을 데이터프레임의 열로 지정하여 데이터프레임 생성
tfidf_ = pd.DataFrame(tfidfv.transform(corpus).toarray(), columns = vocab)
tfidf_

Unnamed: 0,also,and,football,games,james,john,likes,mary,movies,to,too,tv,watch
0,0.0,0.321556,0.0,0.0,0.0,0.321556,0.379832,0.244551,0.643111,0.189916,0.321556,0.0,0.189916
1,0.0,0.0,0.0,0.0,0.572929,0.0,0.338381,0.0,0.0,0.338381,0.0,0.572929,0.338381
2,0.464997,0.0,0.464997,0.464997,0.0,0.0,0.274634,0.353642,0.0,0.274634,0.0,0.0,0.274634


**LSA(Latent Semantic Analysis)**
선형 대수 기반의 주제 모델링 방법

**문서-단어 행렬(DTM)**에 **특이값 분해(SVD:Singular Value Decompotion)**를 적용하여 **잠재적인 의미 공간(latent semantic space)**을 찾음

단어 간 의미적 유사성을 포착하는 데 유리

(Uk, VkT, S)는 각각 '문서들과 관련된 의미들을 표현한 행렬', '단어들과 관련된 의미를 표현한 행렬' , '각 의미의 중요도를 표현한 행렬' 
VkT 행렬의 k열은 전체 코퍼스로부터 얻어낸 **k개의 주요 주제(topic)** 

SVD : 행렬의 크기를 감소시킨다. 
      가치가 높은순으로 정렬 -> 복원(가치가 높은 것만 복원)

In [10]:
import pandas as pd
import numpy as np
import urllib.request
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')

import os

csv_filename = os.getenv('HOME')+'/aiffel/topic_modelling/data/abcnews-date-text.csv'

data = pd.read_csv(csv_filename, on_bad_lines='skip')
data.shape

text = data[['headline_text']].copy()
text.head()

text.nunique() # 중복을 제외하고 유일한 시퀀스를 가지는 샘플의 개수를 출력

text.drop_duplicates(inplace=True) # 중복 샘플 제거
text.reset_index(drop=True, inplace=True)
text.shape



(1054983, 1)

In [11]:
# NLTK 토크나이저를 이용해서 토큰화
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)

# 불용어 제거
stop_words = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop_words)])

text.head()

Unnamed: 0,headline_text
0,"[aba, decides, community, broadcasting, licence]"
1,"[act, fire, witnesses, must, aware, defamation]"
2,"[g, calls, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [12]:
# 단어 정규화. 3인칭 단수 표현 -> 1인칭 변환, 과거형 동사 -> 현재형 동사 등을 수행한다.
text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])

# 길이가 1 ~ 2인 단어는 제거.
text = text['headline_text'].apply(lambda x: [word for word in x if len(word) > 2])
print(text[:5])

0     [aba, decide, community, broadcast, licence]
1    [act, fire, witness, must, aware, defamation]
2       [call, infrastructure, protection, summit]
3            [air, staff, aust, strike, pay, rise]
4    [air, strike, affect, australian, travellers]
Name: headline_text, dtype: object


In [13]:
# 역토큰화 (토큰화 작업을 역으로 수행)
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(text[i])
    detokenized_doc.append(t)

train_data = detokenized_doc
train_data[:5]

['aba decide community broadcast licence',
 'act fire witness must aware defamation',
 'call infrastructure protection summit',
 'air staff aust strike pay rise',
 'air strike affect australian travellers']

In [14]:
# 상위 5000개의 단어만 사용하여  DTM 생성
c_vectorizer = CountVectorizer(stop_words='english', max_features = 5000)
document_term_matrix = c_vectorizer.fit_transform(train_data)
print('행렬의 크기 :',document_term_matrix.shape)

행렬의 크기 : (1054983, 5000)


**scikit-learn TruncatedSVD 활용**

In [15]:
from sklearn.decomposition import TruncatedSVD

n_topics = 10
lsa_model = TruncatedSVD(n_components = n_topics)
lsa_model.fit_transform(document_term_matrix)
print(lsa_model.components_.shape)

terms = c_vectorizer.get_feature_names_out() # 단어 집합. 5,000개의 단어가 저장됨.
print('단어 집합의 크기 :',term.shape)

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(5)) for i in topic.argsort()[:-n - 1:-1]])
        # topic.argsort : 가중치를 오름차순으로 정렬, [::-1]->내림차순으로 출력

get_topics(lsa_model.components_, terms)

(10, 5000)
Topic 1: [('police', 0.74635), ('man', 0.4535), ('charge', 0.21091), ('new', 0.14089), ('court', 0.11147)]
Topic 2: [('man', 0.69424), ('charge', 0.30028), ('court', 0.16674), ('face', 0.11591), ('murder', 0.10654)]
Topic 3: [('new', 0.83673), ('plan', 0.23648), ('say', 0.18275), ('govt', 0.11054), ('council', 0.10968)]
Topic 4: [('say', 0.73893), ('plan', 0.35811), ('govt', 0.16605), ('council', 0.12836), ('urge', 0.07702)]
Topic 5: [('plan', 0.73386), ('council', 0.17595), ('govt', 0.14235), ('urge', 0.09338), ('water', 0.08441)]
Topic 6: [('govt', 0.54915), ('court', 0.26289), ('urge', 0.22067), ('fund', 0.20663), ('face', 0.17575)]
Topic 7: [('charge', 0.52349), ('court', 0.44652), ('face', 0.37521), ('plan', 0.12095), ('murder', 0.11957)]
Topic 8: [('win', 0.57842), ('court', 0.38812), ('kill', 0.20302), ('crash', 0.15088), ('australia', 0.09559)]
Topic 9: [('win', 0.67033), ('charge', 0.42204), ('cup', 0.09574), ('australia', 0.09296), ('world', 0.08933)]
Topic 10: [('

In [16]:
terms.shape

(5000,)

**scikit-learn LDA Model 활용**

In [18]:
# 상위 5,000개의 단어만 사용
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tf_idf_matrix = tfidf_vectorizer.fit_transform(train_data)

# TF-IDF 행렬의 크기를 확인해봅시다.
print('행렬의 크기 :', tf_idf_matrix.shape)

from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components=10, learning_method='online', random_state=777, max_iter=1)
lda_model.fit_transform(tf_idf_matrix)
print(lda_model.components_.shape)

terms = tfidf_vectorizer.get_feature_names_out() # 단어 집합. 5,000개의 단어가 저장됨.
print('단어 집합의 크기 :',terms.shape)

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(5)) for i in topic.argsort()[:-n - 1:-1]])
        # topic.argsort : 가중치를 오름차순으로 정렬, [::-1]->내림차순으로 출력

get_topics(lda_model.components_, terms)

행렬의 크기 : (1054983, 5000)
(10, 5000)
단어 집합의 크기 : (5000,)
Topic 1: [('australia', 9359.06334), ('sydney', 5854.97288), ('attack', 4784.76322), ('change', 4193.63035), ('year', 3924.88997)]
Topic 2: [('government', 6344.07413), ('charge', 5947.12292), ('man', 4519.7974), ('state', 3658.16422), ('live', 3625.10473)]
Topic 3: [('australian', 7666.65651), ('say', 7561.01807), ('police', 5513.22932), ('home', 4048.38409), ('report', 3796.04446)]
Topic 4: [('melbourne', 5298.35047), ('south', 4844.59835), ('death', 4281.78433), ('china', 3214.44581), ('women', 3029.28443)]
Topic 5: [('win', 5704.0914), ('canberra', 4322.0963), ('die', 4025.63057), ('open', 3771.65243), ('warn', 3577.47151)]
Topic 6: [('court', 5246.3124), ('world', 4536.86331), ('country', 4166.34794), ('woman', 3983.97748), ('crash', 3793.50267)]
Topic 7: [('election', 5418.5038), ('adelaide', 4864.95604), ('house', 4478.6135), ('school', 3966.82676), ('2016', 3955.11155)]
Topic 8: [('trump', 8189.58575), ('new', 6625.2724), 

**soynlp 형태소 분석기 : 비지도학습**

In [None]:
# soynlp의 응집 확률(cohesion probability)
# 이 값이 높을수록 전체 코퍼스에서 이 문자열 시퀀스는 하나의 단어로 등장할 가능성이 높습니다

# soynlp의 브랜칭 엔트로피(branching entropy)
# 주어진 문자열에서 다음 문자가 등장할 수 있는 가능성을 판단하는 척도


In [19]:
txt_filename = os.getenv('HOME')+'/aiffel/topic_modelling/data/2016-10-20.txt'

from soynlp import DoublespaceLineCorpus

# 말뭉치에 대해서 다수의 문서로 분리
corpus = DoublespaceLineCorpus(txt_filename)
len(corpus)

from soynlp.word import WordExtractor

word_extractor = WordExtractor()
word_extractor.train(corpus)
word_score_table = word_extractor.extract()

training was done. used memory 2.166 Gb
all cohesion probabilities was computed. # words = 223348
all branching entropies was computed # words = 361598
all accessor variety was computed # words = 361598


In [None]:
# 띄워쓰기가 잘 된 문장 : L토크나이저 사용
from soynlp.tokenizer import LTokenizer

scores = {word:score.cohesion_forward for word, score in word_score_table.items()}
l_tokenizer = LTokenizer(scores=scores)
l_tokenizer.tokenize("국제사회와 우리의 노력들로 범죄를 척결하자", flatten=False)

In [None]:
# 띄워쓰기가 잘 안된 문장 : 최대점수 토크나이저 
from soynlp.tokenizer import MaxScoreTokenizer

maxscore_tokenizer = MaxScoreTokenizer(scores=scores)
maxscore_tokenizer.tokenize("국제사회와우리의노력들로범죄를척결하자")