# FastText model
> * 페이스북에서 개발한 단어 임베딩 기술  
> * 구글에서 개발한 Word2Vec을 기본으로 하되 부분단어들을 Embedding하는 기법.  
> * 단어가 가지는 형태 정보를 학습할 수 있어, 다양한 접사가 존재하는 한국어같은 언어에 대해서 잘 동작  

In [1]:
import pickle
import numpy as np
from tqdm import tqdm
from numba import jit
from konlpy.utils import pprint

## FastText model 생성을 위한 함수 정의

In [2]:
import multiprocessing
cores = multiprocessing.cpu_count()
def Make_FastText_Model(data, size, epoch, sg, window, min_count, cbow_mean,
                        workers, negative,word_ngrams, hs, tagger):
    """Train a gensim FastText model on tokenized sentences and save it.

    The model is written to ``./model/`` under a file name that encodes every
    hyper-parameter plus the ``tagger`` label, so different configurations
    never overwrite each other.

    Args:
        data: Re-iterable collection of token lists. It is iterated once to
            build the vocabulary and again for training.
        size: Forwarded to ``FastText(size=...)``.
        epoch: Forwarded to ``FastText(iter=...)`` and used as the number of
            training epochs.
        sg: Forwarded to ``FastText(sg=...)``.
        window: Forwarded to ``FastText(window=...)``.
        min_count: Forwarded to ``FastText(min_count=...)``.
        cbow_mean: Forwarded to ``FastText(cbow_mean=...)``.
        workers: Forwarded to ``FastText(workers=...)``.
        negative: Forwarded to ``FastText(negative=...)``.
        word_ngrams: Forwarded to ``FastText(word_ngrams=...)``.
        hs: Forwarded to ``FastText(hs=...)``.
        tagger: Label used only in the output file name.

    Returns:
        The trained model, after ``init_sims(replace=True)`` has been applied.
    """
    import os
    from tqdm import tqdm
    tqdm.pandas(desc="progress-bar")
    from datetime import datetime
    from gensim.models import FastText
    start = datetime.now()
    modelPath = './model/'
    modelName = 'fastText_size-{}_epoch-{}_ngrams-{}_window-{}_negative-{}_hs-{}_sg-{}_cbow_mean-{}_min_count-{}_by-{}.model'.format(
        size, epoch, word_ngrams, window, negative, hs, sg, cbow_mean, min_count, tagger)
    modelName = modelPath+modelName
    print (modelName)
    # Create the output directory before training: otherwise a missing
    # directory is only discovered by save(), after the whole run is done.
    os.makedirs(modelPath, exist_ok=True)
    fastText_model = FastText(size = size, sg = sg, cbow_mean = cbow_mean,
                              negative = negative, hs = hs, window = window, word_ngrams=word_ngrams,
                              workers = workers, iter=epoch, min_count = min_count)
    fastText_model.build_vocab(tqdm(data))
    # Use the epoch argument directly; it is the same value that was passed
    # to the constructor as ``iter``.
    fastText_model.train(tqdm(data), total_examples=fastText_model.corpus_count, epochs=epoch)
    # replace=True is passed so the model keeps only the normalized vectors.
    fastText_model.init_sims(replace = True)
    fastText_model.save(modelName)
    end = datetime.now()
    print ("Total running time: ", end-start)
    return fastText_model

## rawdata

In [None]:
import pandas as pd

In [None]:
# Load the raw sentiment corpus. header=None tells pandas the file has no
# header row, so the columns are labelled 0, 1, ...; later cells pass column 0
# to the tagger and use column 1 as the label.
rawdata = pd.read_csv('./data/sentiment_data/raw_data_for_sentiment.txt',header=None,encoding='utf-8')
print (rawdata.shape)

In [None]:
rawdata.head()

### STOPWORDS 

In [None]:
# Read one stopword per line, stripping surrounding whitespace and the
# trailing newline. The context manager closes the file handle, which the
# bare open(...).readlines() call left open.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]

### FastText 포맷으로 만들기

In [None]:
def Tagging(sentence, tagger, stopwords):
    """Tokenize ``sentence`` and drop stopwords.

    Args:
        sentence: Text handed to ``tagger.pos``.
        tagger: Object whose ``pos`` method returns a sequence of entries
            whose first element is the token.
        stopwords: Container of tokens to exclude.

    Returns:
        List of tokens, in order, that are not in ``stopwords``.
    """
    # Take the first element of every tagged entry, discarding the rest.
    tokens = list(map(lambda entry: entry[0], tagger.pos(sentence)))
    kept = []
    for token in tokens:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [None]:
def Tagging2(sentence, tagger, stopwords):
    """Tokenize ``sentence`` and drop stopwords, using pandas for filtering.

    The former ``@jit`` decorator was removed: the body works on a Python
    tagger object and a pandas Series, which numba cannot compile. At best it
    fell back to plain object mode; newer numba releases raise instead.

    Args:
        sentence: Text handed to ``tagger.pos``.
        tagger: Object whose ``pos`` method returns a sequence of entries
            whose first element is the token.
        stopwords: Collection of tokens to exclude.

    Returns:
        List of tokens, in order, that are not in ``stopwords``.
    """
    # dtype=object keeps the .str accessor usable when the tagger returns no
    # tokens (an empty Series may otherwise default to a numeric dtype).
    tagged = pd.Series(tagger.pos(sentence), dtype=object)
    # .str[0] picks the first element (the token) of every tagged entry.
    pos = tagged.str[0]
    pos = pos[~pos.isin(stopwords)]
    return pos.tolist()

In [None]:
def Make_Pre_Data_Sub(series, tagger, stopwords):
    """Turn one raw row into a gensim ``TaggedDocument``.

    Args:
        series: Row whose element 0 is the sentence and element 1 the label.
        tagger: Tagger forwarded to ``Tagging2``.
        stopwords: Stopword collection forwarded to ``Tagging2``.

    Returns:
        ``TaggedDocument`` holding the filtered tokens as ``words`` and a
        one-element list with the label as ``tags``.
    """
    from gensim.models.doc2vec import TaggedDocument
    return TaggedDocument(
        words=Tagging2(series[0], tagger, stopwords),
        tags=[series[1]],
    )

In [None]:
def Make_Pre_Data(rawdata, tagger, stopwords):
    """Tokenize every row of ``rawdata`` into ``[tokens, label]`` pairs.

    The former ``@jit`` decorator was removed: the loop works on a DataFrame,
    a tqdm iterator and a Python tagger object, which numba cannot compile.
    At best it fell back to plain object mode; newer numba releases raise
    instead.

    Args:
        rawdata: DataFrame whose column 0 holds the sentence and column 1 the
            label.
        tagger: Tagger forwarded to ``Tagging2``.
        stopwords: Stopword collection forwarded to ``Tagging2``.

    Returns:
        List with one ``[tokens, label]`` list per row, in index order. The
        elements are plain lists, not ``TaggedDocument`` objects.
    """
    outList = []
    for idx in tqdm(rawdata.index):
        tokens = Tagging2(rawdata.loc[idx, 0], tagger, stopwords)
        outList.append([tokens, rawdata.loc[idx, 1]])
    return outList

#### Tagging Twitter

In [None]:
from ckonlpy.tag import Twitter as ctwitter
ct = ctwitter()

In [None]:
%%time
# Register tqdm's progress_apply on pandas objects, then convert every row
# into a TaggedDocument with the ckonlpy Twitter tagger `ct`.
tqdm.pandas(desc="progress")
pre_data = rawdata.progress_apply(lambda x: Make_Pre_Data_Sub(x, ct, stopwords), axis = 1).tolist()

In [None]:
# Persist the tagged corpus; the context manager flushes and closes the file,
# which the bare open(...) argument never did explicitly.
with open('./data/pre_data/tagged_data/pre_data__for_word2vec_sentiment_by_ct.pickled', 'wb') as pickle_file:
    pickle.dump(pre_data, pickle_file)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the tagged documents for testing.
# NOTE(review): no random_state is passed, so the split is expected to differ
# from run to run — confirm whether a fixed seed is wanted.
train, test = train_test_split(pre_data, test_size = 0.1)

In [None]:
del pre_data

In [None]:
# Persist both partitions; the context managers flush and close each file,
# which the bare open(...) arguments never did explicitly.
with open('./data/pre_data/train_test_Data/pre_data_train_for_word2vec_sentiment_by_ct.pickled', 'wb') as train_file:
    pickle.dump(train, train_file)
with open('./data/pre_data/train_test_Data/pre_data_test_for_word2vec_sentiment_by_ct.pickled', 'wb') as test_file:
    pickle.dump(test, test_file)

In [None]:
# Reload both partitions from disk; the context managers close each file
# handle, which the bare open(...) arguments left open.
with open('./data/pre_data/train_test_Data/pre_data_train_for_word2vec_sentiment_by_ct.pickled', 'rb') as train_file:
    train = pickle.load(train_file)
with open('./data/pre_data/train_test_Data/pre_data_test_for_word2vec_sentiment_by_ct.pickled', 'rb') as test_file:
    test = pickle.load(test_file)

In [None]:
# Separate each document into its token list (`words`) and its label list
# (`tags`) for both partitions; tqdm only adds a progress bar.
x_train = [ x.words for x in tqdm(train)] 
y_train = [ x.tags for x in tqdm(train)] 
x_test = [ x.words for x in tqdm(test)] 
y_test = [ x.tags for x in tqdm(test)] 

In [None]:
del train
del test

In [None]:
%%time
# Twitter-tagged corpus, sg=0 with cbow_mean=1: per the gensim docs this is
# CBOW using the mean of the context vectors.
# NOTE(review): gensim documents word_ngrams only for the values 0 and 1;
# confirm that 3 is intended.
model = Make_FastText_Model(data=x_train, size=1000, epoch=20, 
                    sg=0, window=10, min_count=2, cbow_mean=1, workers=cores, 
                   negative = 7, word_ngrams = 3, hs = 0 , tagger = 'ct')


In [None]:
del model

In [None]:
%%time
# Twitter-tagged corpus, sg=0 with cbow_mean=0: per the gensim docs this is
# CBOW using the sum of the context vectors.
model = Make_FastText_Model(data=x_train, size=1000, epoch=20, 
                    sg=0, window=10, min_count=2, cbow_mean=0, workers=cores, 
                   negative = 7, word_ngrams = 3, hs = 0 , tagger = 'ct')


In [None]:
del model

In [None]:
%%time
# Twitter-tagged corpus, sg=1: per the gensim docs this selects skip-gram,
# where cbow_mean does not apply.
model = Make_FastText_Model(data=x_train, size=1000, epoch=20, 
                    sg=1, window=10, min_count=2, cbow_mean=0, workers=cores, 
                   negative = 7, word_ngrams = 3, hs = 0 , tagger = 'ct')


In [None]:
del model

In [None]:
del ct

#### Tagging Mecab

In [None]:
from konlpy.tag import Mecab
mecab = Mecab()

In [None]:
%%time
# Register tqdm's progress_apply on pandas objects, then convert every row
# into a TaggedDocument with the Mecab tagger.
tqdm.pandas(desc="progress-bar")
pre_data = rawdata.progress_apply(lambda x: Make_Pre_Data_Sub(x, mecab, stopwords), axis = 1).tolist()

In [None]:
%%time
# Alternative to the progress_apply cell: tokenize with the explicit loop.
# NOTE(review): this overwrites pre_data with plain [tokens, label] lists
# instead of TaggedDocument objects; code that reads `.words` / `.tags`
# attributes needs the TaggedDocument form, so run only one of the two cells.
pre_data = Make_Pre_Data(rawdata, mecab, stopwords)

In [None]:
# Persist the tagged corpus; the context manager flushes and closes the file,
# which the bare open(...) argument never did explicitly.
with open('./data/pre_data/tagged_data/pre_data__for_word2vec_sentiment_by_mecab.pickled', 'wb') as pickle_file:
    pickle.dump(pre_data, pickle_file)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the tagged documents for testing.
# NOTE(review): no random_state is passed, so the split is expected to differ
# from run to run — confirm whether a fixed seed is wanted.
train, test = train_test_split(pre_data, test_size = 0.1)

In [None]:
del pre_data

In [None]:
# Persist both partitions; the context managers flush and close each file,
# which the bare open(...) arguments never did explicitly.
with open('./data/pre_data/train_test_Data/pre_data_train_for_word2vec_sentiment_by_mecab.pickled', 'wb') as train_file:
    pickle.dump(train, train_file)
with open('./data/pre_data/train_test_Data/pre_data_test_for_word2vec_sentiment_by_mecab.pickled', 'wb') as test_file:
    pickle.dump(test, test_file)

In [None]:
# Reload both partitions from disk; the context managers close each file
# handle, which the bare open(...) arguments left open.
with open('./data/pre_data/train_test_Data/pre_data_train_for_word2vec_sentiment_by_mecab.pickled', 'rb') as train_file:
    train = pickle.load(train_file)
with open('./data/pre_data/train_test_Data/pre_data_test_for_word2vec_sentiment_by_mecab.pickled', 'rb') as test_file:
    test = pickle.load(test_file)

In [None]:
# Separate each document into tokens (position 0) and label (position 1).
# Positional access is used instead of `.words` / `.tags` because pre_data may
# have been built by either Mecab cell: TaggedDocument is a namedtuple
# (words, tags), while Make_Pre_Data yields plain [tokens, label] lists, which
# have no such attributes.
x_train = [doc[0] for doc in tqdm(train)]
y_train = [doc[1] for doc in tqdm(train)]
x_test = [doc[0] for doc in tqdm(test)]
y_test = [doc[1] for doc in tqdm(test)]

In [None]:
del train
del test

In [None]:
%%time
# Mecab-tagged corpus, sg=0 with cbow_mean=1: per the gensim docs this is
# CBOW using the mean of the context vectors.
# NOTE(review): gensim documents word_ngrams only for the values 0 and 1;
# confirm that 3 is intended.
model = Make_FastText_Model(data=x_train, size=1000, epoch=20, 
                    sg=0, window=10, min_count=2, cbow_mean=1, workers=cores, 
                   negative = 7, word_ngrams = 3, hs = 0 , tagger = 'mecab')


In [None]:
del model

In [None]:
%%time
# Mecab-tagged corpus, sg=0 with cbow_mean=0: per the gensim docs this is
# CBOW using the sum of the context vectors.
model = Make_FastText_Model(data=x_train, size=1000, epoch=20, 
                    sg=0, window=10, min_count=2, cbow_mean=0, workers=cores, 
                   negative = 7, word_ngrams = 3, hs = 0 , tagger = 'mecab')


In [None]:
del model

In [None]:
%%time
# Mecab-tagged corpus, sg=1: per the gensim docs this selects skip-gram,
# where cbow_mean does not apply.
model = Make_FastText_Model(data=x_train, size=1000, epoch=20, 
                    sg=1, window=10, min_count=2, cbow_mean=0, workers=cores, 
                   negative = 7, word_ngrams = 3, hs = 0 , tagger = 'mecab')


In [None]:
del model

In [None]:
del mecab