<a href="https://colab.research.google.com/github/alalshow/AIpjt-1/blob/master/notebooks/word_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bag of words

In [1]:
corpus = [
    '학교에 가서 수업을 들었다. 학교에 간건 오랜만이다.',
    '학교에 가서 친구 얘기를 들었다.',
    '내일 가서 뭐 먹지?'
]

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
vect.fit(corpus)
vect.vocabulary_

{'학교에': 9,
 '가서': 0,
 '수업을': 5,
 '들었다': 3,
 '간건': 1,
 '오랜만이다': 7,
 '친구': 8,
 '얘기를': 6,
 '내일': 2,
 '먹지': 4}

In [3]:
vect.transform(corpus).toarray()

array([[1, 1, 0, 1, 0, 1, 0, 1, 0, 2],
       [1, 0, 0, 1, 0, 0, 1, 0, 1, 1],
       [1, 0, 1, 0, 1, 0, 0, 0, 0, 0]])

In [4]:
vect.transform(['수업을 들었다. 수업은 재미있다.']).toarray()

array([[0, 0, 0, 1, 0, 1, 0, 0, 0, 0]])

# TFIDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidv = TfidfVectorizer().fit(corpus)
tfidv.transform(corpus).toarray()

array([[0.23642005, 0.40029393, 0.        , 0.30443385, 0.        ,
        0.40029393, 0.        , 0.40029393, 0.        , 0.60886771],
       [0.31544415, 0.        , 0.        , 0.40619178, 0.        ,
        0.        , 0.53409337, 0.        , 0.53409337, 0.40619178],
       [0.38537163, 0.        , 0.65249088, 0.        , 0.65249088,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

# Word2vec


In [6]:
# make datasets (.txt file)
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

import pandas as pd

df = pd.read_csv('ratings_train.txt', sep='\t')
doc = list(df['document'])

with open('ratings_train_text_only.txt', 'w') as f:
  for text in doc:
    f.write(str(text) + '\n')

--2023-01-04 12:02:59--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14628807 (14M) [text/plain]
Saving to: ‘ratings_train.txt’


2023-01-04 12:03:00 (252 MB/s) - ‘ratings_train.txt’ saved [14628807/14628807]



In [7]:
# read text file
with open('ratings_train_text_only.txt', 'r') as f:
  texts = [str(text).replace('\n', '') for text in doc if len(str(text)) >= 10]

In [8]:
# word2vec training

import os
from gensim.models import Word2Vec

def word2vec(texts):
    inputs = [tt.split(' ') for tt in texts]
    print('number of text = ', len(inputs))

    print('word2vec training...')
    model = Word2Vec(inputs, size=50, window=3, min_count=3, negative=5, workers=os.cpu_count(), iter=10, sg=1)
    model.init_sims

    model.save('word2vec')

word2vec(texts)



number of text =  136748
word2vec training...


In [9]:
w2v = Word2Vec.load('word2vec')

In [10]:
# 단어 벡터
w2v.wv['감동']

array([ 0.5881887 , -0.9992241 ,  0.82709754, -0.11130765,  0.23319794,
       -0.05065594,  1.1928968 ,  0.03060822, -0.47597724,  0.40897316,
       -0.24473684,  0.32213312, -0.14504497,  0.11221211, -1.3178905 ,
       -0.27685177, -0.5231598 ,  0.5173008 , -1.2792141 , -0.2944419 ,
       -0.21998513, -0.43803662, -0.02366319,  0.56841147,  0.86567426,
        0.0668866 ,  0.13169307,  0.02296105, -0.3144447 ,  0.5146612 ,
       -0.04159123,  0.07162941,  0.35347047,  0.5582885 ,  0.5201197 ,
        0.1353152 , -0.8289469 , -0.13135523, -1.017658  ,  0.0420857 ,
       -0.1087289 , -0.6668211 ,  0.13523474,  0.5527889 , -0.53185517,
        0.01137959, -0.16652793, -0.40015072, -0.21456513, -0.04345952],
      dtype=float32)

In [11]:
# 유사 단어
w2v.wv.most_similar('이제까지')

[('개봉관에서', 0.96207195520401),
 ('여태것', 0.9608229994773865),
 ('10년동안', 0.9552174806594849),
 ('하나였는데', 0.9524528384208679),
 ('3년간', 0.9513561725616455),
 ('내자신이', 0.947153627872467),
 ('근', 0.9458890557289124),
 ('유료로', 0.945405125617981),
 ('수천편의', 0.9439290761947632),
 ('단체관람으로', 0.9433076977729797)]

# Fasttext

In [12]:
# fasttext training

import os
from gensim.models import FastText

def fasttext(texts):
    inputs = [tt.split(' ') for tt in texts]
    print('number of text = ', len(inputs))

    model = FastText(inputs, size=50, window=3, min_count=3, negative=5, workers=os.cpu_count(), iter=10, sg=1)
    model.init_sims()

    model.save('fasttext')
    print('fasttext is trained')

fasttext(texts)



number of text =  136748
fasttext is trained


In [13]:
w2v = Word2Vec.load('word2vec')
fasttext = FastText.load('fasttext')

In [15]:
wav.wv.most_similar('고능학교')

NameError: ignored

In [16]:
fasttext.wv.most_similar('고능학교')

[('학교', 0.9610733985900879),
 ('중학교', 0.9520258903503418),
 ('고등학교', 0.9512345790863037),
 ('초등학교', 0.9367508888244629),
 ('국민학교', 0.9162280559539795),
 ('대학교', 0.9023886919021606),
 ('고등학교시절', 0.9002160429954529),
 ('초등학교5학년때', 0.8846424221992493),
 ('고등학교때', 0.8779880404472351),
 ('다닐', 0.8734228014945984)]