# FastText model
> * 페이스북에서 개발한 단어 임베딩 기술  
> * 구글에서 개발한 Word2Vec을 기본으로 하되 부분단어(subword)들을 Embedding하는 기법.  
> * 단어가 가지는 형태 정보를 학습할 수 있어, 다양한 접사가 존재하는 한국어같은 언어에 대해서 잘 동작  
> * The main principle behind fastText is that the morphological structure of a word carries important information about the meaning of the word, which is not taken into account by traditional word embeddings, which train a unique word embedding for every individual word. This is especially significant for morphologically rich languages (German, Turkish) in which a single word can have a large number of morphological forms, each of which might occur rarely, thus making it hard to train good word embeddings.
> * fastText attempts to solve this by treating each word as the aggregation of its subwords. For the sake of simplicity and language-independence, subwords are taken to be the character ngrams of the word. The vector for a word is simply taken to be the sum of all vectors of its component char-ngrams.
> * According to a detailed comparison of Word2Vec and FastText in this notebook, fastText does significantly better on syntactic tasks as compared to the original Word2Vec, especially when the size of the training corpus is small. Word2Vec slightly outperforms FastText on semantic tasks though. The differences grow smaller as the size of training corpus increases. Training time for fastText is significantly higher than the Gensim version of Word2Vec (15min 42s vs 6min 42s on text8, 17 mil tokens, 5 epochs, and a vector size of 100).
> * fastText can be used to obtain vectors for out-of-vocabulary (OOV) words, by summing up vectors for its component char-ngrams, provided at least one of the char-ngrams was present in the training data.

In [1]:
import pickle
import os
import numpy as np
from tqdm import tqdm
from numba import jit
from konlpy.utils import pprint

## FastText model 생성을 위한 함수 정의

In [2]:
import multiprocessing
# CPU count reported by the OS; passed as `workers` to every
# Make_FastText_Model call further down.
cores = multiprocessing.cpu_count()
def Make_FastText_Model(data, size, epoch, sg, window, min_count, cbow_mean,
                        workers, negative,word_ngrams, hs, tagger):
    """Train a gensim FastText model on ``data`` and save it under ``./model/``.

    Every hyper-parameter is encoded in the output file name, so each
    configuration is written to its own file.

    Args:
        data: Re-iterable corpus (e.g. a list of token lists). It is iterated
            once to build the vocabulary and again for training.
        size, sg, cbow_mean, negative, hs, window, word_ngrams, workers,
        min_count: Forwarded unchanged to the ``FastText`` constructor under
            the same keyword names.
        epoch: Number of training epochs (passed as ``iter`` to the
            constructor and as ``epochs`` to ``train``).
        tagger: Label of the tokenizer that produced ``data``; only used in
            the output file name.

    Returns:
        The trained model, after ``init_sims(replace=True)`` and after it has
        been saved to disk.
    """
    import os
    from datetime import datetime

    from gensim.models import FastText
    from tqdm import tqdm

    tqdm.pandas(desc="progress-bar")
    start = datetime.now()

    modelPath = './model/'
    # Create the output folder up front: otherwise a missing directory only
    # surfaces in save(), after the whole training run has been spent.
    os.makedirs(modelPath, exist_ok=True)
    modelName = 'fastText_size-{}_epoch-{}_ngrams-{}_window-{}_negative-{}_hs-{}_sg-{}_cbow_mean-{}_min_count-{}_by-{}.model'.format(
        size, epoch, word_ngrams, window, negative, hs, sg, cbow_mean, min_count, tagger)
    modelName = modelPath + modelName
    print (modelName)

    fastText_model = FastText(size=size, sg=sg, cbow_mean=cbow_mean,
                              negative=negative, hs=hs, window=window,
                              word_ngrams=word_ngrams, workers=workers,
                              iter=epoch, min_count=min_count)
    fastText_model.build_vocab(tqdm(data))
    # `epoch` is the value the model was constructed with (iter=epoch), so use
    # it directly instead of reading it back through the `.iter` alias.
    fastText_model.train(tqdm(data), total_examples=fastText_model.corpus_count,
                         epochs=epoch)
    # NOTE: per gensim's docs, replace=True keeps only the normalised vectors,
    # so the saved model cannot be trained further.
    fastText_model.init_sims(replace=True)
    fastText_model.save(modelName)

    end = datetime.now()
    print ("Total running time: ", end-start)
    return fastText_model

## rawdata

In [3]:
import pandas as pd

In [4]:
# Load the raw corpus without a header row, so the columns are labelled 0 and 1;
# later cells read column 0 as the text and column 1 as the sentiment label.
rawdata = pd.read_csv('./data/sentiment_data/raw_data_for_sentiment.txt',header=None,encoding='utf-8')
# Sanity check: (rows, columns).
print (rawdata.shape)

(491510, 2)


In [5]:
# Preview the first rows of the loaded corpus.
rawdata.head()

Unnamed: 0,0,1
0,주택조합 승인 자체가 몹시 어려웠던 시절 한국기자협회 주택조합 형식으로 추진된 이 ...,1
1,"서 의원은 여의도 회견을 열어 “압도적으로 승리할 것이고 국민, 당원이 주문하신 ...",1
2,박근혜 대통령이 14일 열린 새누리당 전당대회에 참석한 것은 그만큼 아직 여권에서 ...,1
3,"“살기 좋은 전북 만들기, 중앙정부와 연결고리 최선” “월드컵에 출전한 우리 국가대...",1
4,"""여러분 너무나 고맙고 감사합니다. 도와주신 여러분께 저의 마음속의 큰 절을 받아주...",1


### STOPWORDS 

In [6]:
# Read the stopword list, one entry per line, with surrounding whitespace removed.
# The `with` block closes the file handle, which the bare open() call left open.
with open('./data/stopwordsList.txt', encoding='utf-8') as _stopword_file:
    stopwords = [line.strip() for line in _stopword_file]

### FastText 포맷으로 만들기

In [7]:
def Tagging(sentence, tagger, stopwords):
    """Tokenize ``sentence`` with ``tagger`` and drop stopword tokens.

    Args:
        sentence: Text handed to ``tagger.pos``.
        tagger: Object whose ``pos(sentence)`` returns a sequence of items
            with the token at index 0.
        stopwords: Collection of hashable tokens to exclude.

    Returns:
        The remaining tokens as a list, in their original order.
    """
    # A set gives constant-time lookups; testing each token against a list
    # costs a full scan of the stopword list per token.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = frozenset(stopwords)
    return [item[0] for item in tagger.pos(sentence) if item[0] not in stopwords]

In [8]:
def Tagging2(sentence, tagger, stopwords):
    """Tokenize ``sentence`` with ``tagger`` and drop stopword tokens.

    The former ``@jit`` decorator is gone on purpose: Numba cannot compile
    code that works on an arbitrary tagger object, and recent Numba releases
    raise an error instead of silently falling back to plain Python. The
    per-sentence pandas Series is replaced by a list comprehension, which
    also handles a sentence that yields no tokens.

    Args:
        sentence: Text handed to ``tagger.pos``.
        tagger: Object whose ``pos(sentence)`` returns a sequence of items
            with the token at index 0.
        stopwords: Collection of hashable tokens to exclude.

    Returns:
        The remaining tokens as a list, in their original order.
    """
    # A set gives constant-time membership tests for every token.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = frozenset(stopwords)
    return [item[0] for item in tagger.pos(sentence) if item[0] not in stopwords]

In [9]:
def Make_Pre_Data_Sub(series, tagger, stopwords):
    """Convert one (text, label) record into a gensim ``TaggedDocument``.

    ``series[0]`` is tokenized with ``Tagging2`` and ``series[1]`` becomes the
    document's single tag.
    """
    # Local import: this function needs gensim's class, not the namedtuple
    # that is bound to the same name at module level.
    from gensim.models.doc2vec import TaggedDocument

    tokens = Tagging2(series[0], tagger, stopwords)
    return TaggedDocument(tokens, [series[1]])

In [10]:
from collections import namedtuple
from gensim.models.doc2vec import TaggedDocument
# NOTE: this rebinds the name TaggedDocument, replacing the gensim class
# imported on the previous line with a plain 3-field record
# (words, tags, sentiment) that the cells below construct and unpack.
TaggedDocument = namedtuple('TaggedDocument', 'words tags sentiment')
def Make_Pre_Data(rawdata, tagger, stopwords):
    """Tokenize every row of ``rawdata`` and attach a document id and label.

    The former ``@jit`` decorator is gone on purpose: Numba cannot compile a
    loop over a DataFrame and a tagger object, and recent Numba releases
    raise an error instead of silently falling back to plain Python.

    Args:
        rawdata: DataFrame with the text in column ``0`` and the label in
            column ``1``.
        tagger: Tokenizer passed through to ``Tagging2``.
        stopwords: Tokens to drop, passed through to ``Tagging2``.

    Returns:
        One ``[tokens, ['doc_<index>'], [label]]`` list per row, in index
        order.
    """
    return [
        [
            Tagging2(rawdata.loc[idx, 0], tagger, stopwords),
            ['doc_' + str(idx)],
            [rawdata.loc[idx, 1]],
        ]
        for idx in tqdm(rawdata.index)
    ]

#### Tagging Twitter

In [None]:
from ckonlpy.tag import Twitter as ctwitter
# Tagger instance used for the "ct" variant of the corpus and models.
ct = ctwitter()

In [None]:
%%time
# Registers tqdm's pandas integration; Make_Pre_Data itself wraps its loop in
# tqdm directly, so its progress bar does not depend on this call.
tqdm.pandas(desc="progress")
# Tokenize every row with the ct tagger; each entry is
# [tokens, ['doc_<idx>'], [label]] as built by Make_Pre_Data.
pre_data = Make_Pre_Data(rawdata, ct, stopwords)

In [None]:
%%time
# Wrap each [tokens, tags, label] triple in the TaggedDocument namedtuple so
# the fields can be read by name (.words / .tags / .sentiment).
pre_data = [
    TaggedDocument(words, tags, sentiment)
    for words, tags, sentiment in tqdm(pre_data)
]

In [None]:
# Disk cache of the ct-tagged corpus: reuse it when present, otherwise write it.
# NOTE(review): this check runs after the corpus has already been tagged in the
# cells above, so a cache hit does not save the tagging time.
_pre_data_cache = './data/pre_data/tagged_data/pre_data__for_fastText_sentiment_by_ct.pickled'
if os.path.isfile(_pre_data_cache):
    # NOTE: pickle.load can execute arbitrary code; only load caches that this
    # notebook wrote itself.
    with open(_pre_data_cache, 'rb') as _cache_file:
        pre_data = pickle.load(_cache_file)
else:
    # Create the cache folder so the dump cannot fail on a missing directory.
    os.makedirs(os.path.dirname(_pre_data_cache), exist_ok=True)
    with open(_pre_data_cache, 'wb') as _cache_file:
        pickle.dump(pre_data, _cache_file)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the documents as the test set. No random_state is given, so
# the split differs between runs unless the pickled split is reused.
train, test = train_test_split(pre_data, test_size = 0.1)

In [None]:
# The full corpus list is no longer needed once train/test exist; drop the name.
del pre_data

In [None]:
# Cache the train/test split as a pair. Checking the two files independently
# could combine a cached half with a freshly drawn random half, letting the
# same document land in both train and test.
_train_cache = './data/pre_data/train_test_Data/pre_data_train_for_fastText_sentiment_by_ct.pickled'
_test_cache = './data/pre_data/train_test_Data/pre_data_test_for_fastText_sentiment_by_ct.pickled'
if os.path.isfile(_train_cache) and os.path.isfile(_test_cache):
    # NOTE: pickle.load can execute arbitrary code; only load caches that this
    # notebook wrote itself.
    with open(_train_cache, 'rb') as _cache_file:
        train = pickle.load(_cache_file)
    with open(_test_cache, 'rb') as _cache_file:
        test = pickle.load(_cache_file)
else:
    # Write both halves of the current split so they stay consistent.
    os.makedirs(os.path.dirname(_train_cache), exist_ok=True)
    with open(_train_cache, 'wb') as _cache_file:
        pickle.dump(train, _cache_file)
    with open(_test_cache, 'wb') as _cache_file:
        pickle.dump(test, _cache_file)

In [None]:
# Unpack the TaggedDocument records into parallel lists: token lists (x_*),
# sentiment labels (y_senti_*) and document tags (y_tags_*).
x_train = [doc.words for doc in tqdm(train)]
y_senti_train = [doc.sentiment for doc in tqdm(train)]
y_tags_train = [doc.tags for doc in tqdm(train)]
x_test = [doc.words for doc in tqdm(test)]
y_senti_test = [doc.sentiment for doc in tqdm(test)]
y_tags_test = [doc.tags for doc in tqdm(test)]

In [None]:
# The record lists have been unpacked into the x_* / y_* lists; drop the names.
del train
del test

In [None]:
%%time
# ct corpus, configuration 1 of 3: sg=0 with cbow_mean=1
# (per gensim's parameter docs: CBOW training, context vectors averaged).
# Make_FastText_Model also saves the model under ./model/.
model = Make_FastText_Model(data=x_train, size=1000, epoch=20, 
                    sg=0, window=10, min_count=2, cbow_mean=1, workers=cores, 
                   negative = 7, word_ngrams = 3, hs = 0 , tagger = 'ct')


In [None]:
# The model was already saved inside Make_FastText_Model; drop the in-memory copy.
del model

In [None]:
%%time
# ct corpus, configuration 2 of 3: sg=0 with cbow_mean=0
# (per gensim's parameter docs: CBOW training, context vectors summed).
# Make_FastText_Model also saves the model under ./model/.
model = Make_FastText_Model(data=x_train, size=1000, epoch=20, 
                    sg=0, window=10, min_count=2, cbow_mean=0, workers=cores, 
                   negative = 7, word_ngrams = 3, hs = 0 , tagger = 'ct')


In [None]:
# The model was already saved inside Make_FastText_Model; drop the in-memory copy.
del model

In [None]:
%%time
# ct corpus, configuration 3 of 3: sg=1
# (per gensim's parameter docs: skip-gram training; cbow_mean only applies to CBOW).
# Make_FastText_Model also saves the model under ./model/.
model = Make_FastText_Model(data=x_train, size=1000, epoch=20, 
                    sg=1, window=10, min_count=2, cbow_mean=0, workers=cores, 
                   negative = 7, word_ngrams = 3, hs = 0 , tagger = 'ct')


In [None]:
# The model was already saved inside Make_FastText_Model; drop the in-memory copy.
del model

In [None]:
# All ct-tagged models are trained; the tagger instance is no longer needed.
del ct

#### Tagging Mecab

In [11]:
from konlpy.tag import Mecab
# Tagger instance used for the "mecab" variant of the corpus and models.
mecab = Mecab()

In [12]:
%%time
# Registers tqdm's pandas integration; Make_Pre_Data itself wraps its loop in
# tqdm directly, so its progress bar does not depend on this call.
tqdm.pandas(desc="progress")
# Tokenize every row with the Mecab tagger; each entry is
# [tokens, ['doc_<idx>'], [label]] as built by Make_Pre_Data.
pre_data = Make_Pre_Data(rawdata, mecab, stopwords)

100%|██████████| 491510/491510 [24:25<00:00, 335.28it/s]

CPU times: user 21min 46s, sys: 1min 57s, total: 23min 44s
Wall time: 24min 26s





In [None]:
%%time
# Wrap each [tokens, tags, label] triple in the TaggedDocument namedtuple so
# the fields can be read by name (.words / .tags / .sentiment).
pre_data = [
    TaggedDocument(words, tags, sentiment)
    for words, tags, sentiment in tqdm(pre_data)
]

100%|██████████| 491510/491510 [00:31<00:00, 15587.92it/s] 

CPU times: user 5.52 s, sys: 17.9 s, total: 23.4 s
Wall time: 31.5 s





In [None]:
# Disk cache of the mecab-tagged corpus: reuse it when present, otherwise write it.
# NOTE(review): this check runs after the corpus has already been tagged in the
# cells above, so a cache hit does not save the tagging time.
_pre_data_cache = './data/pre_data/tagged_data/pre_data__for_fastText_sentiment_by_mecab.pickled'
if os.path.isfile(_pre_data_cache):
    # NOTE: pickle.load can execute arbitrary code; only load caches that this
    # notebook wrote itself.
    with open(_pre_data_cache, 'rb') as _cache_file:
        pre_data = pickle.load(_cache_file)
else:
    # Create the cache folder so the dump cannot fail on a missing directory.
    os.makedirs(os.path.dirname(_pre_data_cache), exist_ok=True)
    with open(_pre_data_cache, 'wb') as _cache_file:
        pickle.dump(pre_data, _cache_file)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the documents as the test set. No random_state is given, so
# the split differs between runs unless the pickled split is reused.
train, test = train_test_split(pre_data, test_size = 0.1)

In [None]:
# The full corpus list is no longer needed once train/test exist; drop the name.
del pre_data

In [None]:
# Cache the train/test split as a pair. Checking the two files independently
# could combine a cached half with a freshly drawn random half, letting the
# same document land in both train and test.
_train_cache = './data/pre_data/train_test_Data/pre_data_train_for_fastText_sentiment_by_mecab.pickled'
_test_cache = './data/pre_data/train_test_Data/pre_data_test_for_fastText_sentiment_by_mecab.pickled'
if os.path.isfile(_train_cache) and os.path.isfile(_test_cache):
    # NOTE: pickle.load can execute arbitrary code; only load caches that this
    # notebook wrote itself.
    with open(_train_cache, 'rb') as _cache_file:
        train = pickle.load(_cache_file)
    with open(_test_cache, 'rb') as _cache_file:
        test = pickle.load(_cache_file)
else:
    # Write both halves of the current split so they stay consistent.
    os.makedirs(os.path.dirname(_train_cache), exist_ok=True)
    with open(_train_cache, 'wb') as _cache_file:
        pickle.dump(train, _cache_file)
    with open(_test_cache, 'wb') as _cache_file:
        pickle.dump(test, _cache_file)

In [None]:
# Unpack the TaggedDocument records into parallel lists: token lists (x_*),
# sentiment labels (y_senti_*) and document tags (y_tags_*).
x_train = [doc.words for doc in tqdm(train)]
y_senti_train = [doc.sentiment for doc in tqdm(train)]
y_tags_train = [doc.tags for doc in tqdm(train)]
x_test = [doc.words for doc in tqdm(test)]
y_senti_test = [doc.sentiment for doc in tqdm(test)]
y_tags_test = [doc.tags for doc in tqdm(test)]

In [None]:
# The record lists have been unpacked into the x_* / y_* lists; drop the names.
del train
del test

In [None]:
%%time
# mecab corpus, configuration 1 of 3: sg=0 with cbow_mean=1
# (per gensim's parameter docs: CBOW training, context vectors averaged).
# Make_FastText_Model also saves the model under ./model/.
model = Make_FastText_Model(data=x_train, size=1000, epoch=20, 
                    sg=0, window=10, min_count=2, cbow_mean=1, workers=cores, 
                   negative = 7, word_ngrams = 3, hs = 0 , tagger = 'mecab')


In [None]:
# The model was already saved inside Make_FastText_Model; drop the in-memory copy.
del model

In [None]:
%%time
# mecab corpus, configuration 2 of 3: sg=0 with cbow_mean=0
# (per gensim's parameter docs: CBOW training, context vectors summed).
# Make_FastText_Model also saves the model under ./model/.
model = Make_FastText_Model(data=x_train, size=1000, epoch=20, 
                    sg=0, window=10, min_count=2, cbow_mean=0, workers=cores, 
                   negative = 7, word_ngrams = 3, hs = 0 , tagger = 'mecab')


In [None]:
# The model was already saved inside Make_FastText_Model; drop the in-memory copy.
del model

In [None]:
%%time
# mecab corpus, configuration 3 of 3: sg=1
# (per gensim's parameter docs: skip-gram training; cbow_mean only applies to CBOW).
# Make_FastText_Model also saves the model under ./model/.
model = Make_FastText_Model(data=x_train, size=1000, epoch=20, 
                    sg=1, window=10, min_count=2, cbow_mean=0, workers=cores, 
                   negative = 7, word_ngrams = 3, hs = 0 , tagger = 'mecab')


In [None]:
# The model was already saved inside Make_FastText_Model; drop the in-memory copy.
del model

In [None]:
# All mecab-tagged models are trained; the tagger instance is no longer needed.
del mecab