In [1]:
# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2

# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings emitted by pandas / numpy / the embedding backend.
import warnings
warnings.filterwarnings("ignore")

# Put the parent directory first on the import path -- presumably so that the
# project-local `definition` and `embedding` modules imported below resolve
# when the notebook is run from its own sub-directory (confirm repo layout).
import sys
sys.path.insert(0, '..')

import os
import definition
from embedding import Preprocessor, Embedding
import pandas as pd
import numpy as np

## Data Loading, Preprocessing, and Selection

In [2]:
# Load the raw review table from the path configured in the project's
# `definition` module.
df = pd.read_csv(definition.DATA_RAW_FILE)

# Keep only the first row seen for each review_id.
# `subset` and `keep` are passed by keyword: pandas 2.0 made `keep`
# keyword-only, so the old positional form raises TypeError there.
# reset_index(drop=True) renumbers the rows 0..n-1 directly instead of
# materialising the old index as an 'index' column and dropping it again.
df = df.drop_duplicates(subset='review_id', keep='first').reset_index(drop=True)

# Row count after de-duplication, followed by a preview of the frame.
print(len(df))
df.head()

144353


Unnamed: 0,review_id,review_text,review_source,review_score,category
0,73a965356fe43aa02b113b684f1a365c,Kotor berdebu. Saya tdk berhenti bersin ketika...,befc88fe53f24cb70ce7f9d2efd89858,5.5,kebersihan
1,35f506b353ff95aed86131fb01fb43bc,oke cuma air wastafel berwarna keruh mohon di...,4c5d06b02c97731aaa976179c62dcf76,3.0,fasilitas
2,e93324022b758ce67710708ac115cd18,kamar ada semutnya. kamar mandi bermasalah. bu...,4c5d06b02c97731aaa976179c62dcf76,2.0,kebersihan
3,dff13be030a1a10741848a2d9622cee4,"Kamar mandi bau, airnya bau",4c5d06b02c97731aaa976179c62dcf76,1.0,kebersihan
4,beeff65d3c2e32934261586493934f99,"kamarnya bersih dan nyaman, tetapi perlu di pe...",4c5d06b02c97731aaa976179c62dcf76,3.0,fasilitas


In [3]:
# Fraction of the de-duplicated reviews used to train the embedding.
sampling_percentage = 0.6
# Fixed seed so that Restart & Run All selects the same training subset
# (and therefore trains on the same corpus) on every run.
random_seed = 42

# Shuffle the row positions with a private, seeded generator -- the global
# NumPy random state is left untouched -- and keep the leading fraction.
index = np.arange(len(df))
np.random.RandomState(random_seed).shuffle(index)
index_train = index[:int(sampling_percentage * len(df))]

# Pull the selected review texts and coerce every entry to a string.
# NOTE(review): astype(str) produces a fixed-width unicode array (every slot
# is as wide as the longest review) and would turn any missing review into
# the literal text 'nan' -- confirm that is acceptable for this dataset.
corpus = df['review_text'].values[index_train]
corpus = corpus.astype(str)
corpus[:5]

array(['Memuaskan untuk harga yang sangat terjangkau tapi bisa dekat dengan segala keperluan, ada stasiun, minimarket, dan banyak tempat makan',
       'Variasi sarapan, kelengkapan sunrise meal, koneksi wifi. ',
       'Kamar sesuai di foto yang dipaparkan, semua karyawannya sangat ramah, proses check in sangat mudah, ',
       'sangat nyaman,', 'nyaman bngett'], dtype='<U2067')

In [4]:
# Run the project tokenizer over every sampled review; the result is one
# token list per review, in the same order as `corpus`.
prep_corpus = [Preprocessor.tokenize(text) for text in corpus]

# Number of tokenized reviews, then a preview of the first five.
print(len(prep_corpus))
prep_corpus[:5]

86611


[['memuaskan',
  'untuk',
  'harga',
  'yang',
  'sangat',
  'terjangkau',
  'tapi',
  'bisa',
  'dekat',
  'dengan',
  'segala',
  'keperluan',
  ',',
  'ada',
  'stasiun',
  ',',
  'minimarket',
  ',',
  'dan',
  'banyak',
  'tempat',
  'makan'],
 ['variasi',
  'sarapan',
  ',',
  'kelengkapan',
  'sunrise',
  'meal',
  ',',
  'koneksi',
  'wifi',
  '.'],
 ['kamar',
  'sesuai',
  'di',
  'foto',
  'yang',
  'dipaparkan',
  ',',
  'semua',
  'karyawannya',
  'sangat',
  'ramah',
  ',',
  'proses',
  'check',
  'in',
  'sangat',
  'mudah',
  ','],
 ['sangat', 'nyaman', ','],
 ['nyaman', 'bngett']]

## Embedding Construction

In [5]:
# Hyperparameters forwarded as keyword arguments to Embedding(...).
# Vector dimensionality; also used to name the saved model file.
size = 25
# Passed to Embedding as `iter` (the training log reports epochs out of 10).
iter_count = 10
# Presumably the minimum token frequency kept in the vocabulary --
# TODO confirm against embedding.Embedding.
min_count = 5
# Presumably the lower/upper character n-gram lengths (equal here, so only
# one n-gram length is used) -- TODO confirm against embedding.Embedding.
min_n = max_n = 3

In [6]:
# Gather the hyperparameters once so the constructor call stays short.
embedding_params = dict(
    min_count=min_count,
    size=size,
    min_n=min_n,
    max_n=max_n,
    iter=iter_count,
)
embedding = Embedding(**embedding_params)

# Build the vocabulary from the tokenized reviews, then train on the same
# corpus with progress output enabled.
embedding.build_vocab(prep_corpus)
embedding.train(prep_corpus, verbose=True)

Epoch #10/10...


In [7]:
# The file name encodes the vector dimensionality, so models trained with
# different `size` values do not overwrite one another.
embedding_filename = "fasttext_{}.bin".format(size)

# Persist the trained model inside the project's embedding-model directory.
embedding_path = os.path.join(definition.MODEL_EMBEDDING, embedding_filename)
embedding.save(embedding_path)

In [8]:
# Round-trip check: read the model that was just written into a fresh
# Embedding instance.
saved_model_path = os.path.join(definition.MODEL_EMBEDDING, embedding_filename)
emb = Embedding()
emb.load(saved_model_path)

In [9]:
# Sanity check on the reloaded model: look up the vector for a common
# Indonesian review word ("bagus" = "good").
probe_word = "bagus"
emb.get_vector_word(probe_word)

array([ 0.82963747,  2.3067234 ,  0.16771676,  2.150844  ,  0.890001  ,
        0.9580703 , -1.0738027 ,  3.7879589 , -1.9150782 ,  2.1498172 ,
        2.7978327 , -3.30842   ,  0.65501016,  2.2940295 , -1.166068  ,
        0.19644837,  1.7885561 ,  3.745247  ,  2.09786   ,  0.08136509,
       -1.3206955 , -0.18061733, -0.7735384 ,  0.65012354, -1.2130476 ],
      dtype=float32)