In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.insert(0, '..')

import os
import definition
from utility import Preprocessor, Embedding
import pandas as pd
import numpy as np

## Data Loading, Preprocessing, and Selection

In [2]:
# Load the raw review dump and keep only the first row seen for each review_id.
# `subset` and `keep` are passed by keyword: pandas deprecated passing `keep`
# positionally and pandas 2.0 rejects it with a TypeError.
# reset_index(drop=True) renumbers the rows 0..n-1 directly instead of first
# materialising the old index as an 'index' column and then dropping it.
df = pd.read_csv(definition.DATA_RAW_FILE)
df = df.drop_duplicates(subset='review_id', keep='first').reset_index(drop=True)
print(len(df))
df.head()

144353


Unnamed: 0,review_id,review_text,review_source,review_score,category
0,73a965356fe43aa02b113b684f1a365c,Kotor berdebu. Saya tdk berhenti bersin ketika...,befc88fe53f24cb70ce7f9d2efd89858,5.5,kebersihan
1,35f506b353ff95aed86131fb01fb43bc,oke cuma air wastafel berwarna keruh mohon di...,4c5d06b02c97731aaa976179c62dcf76,3.0,fasilitas
2,e93324022b758ce67710708ac115cd18,kamar ada semutnya. kamar mandi bermasalah. bu...,4c5d06b02c97731aaa976179c62dcf76,2.0,kebersihan
3,dff13be030a1a10741848a2d9622cee4,"Kamar mandi bau, airnya bau",4c5d06b02c97731aaa976179c62dcf76,1.0,kebersihan
4,beeff65d3c2e32934261586493934f99,"kamarnya bersih dan nyaman, tetapi perlu di pe...",4c5d06b02c97731aaa976179c62dcf76,3.0,fasilitas


In [3]:
# Fraction (0-1) of the de-duplicated reviews used to train the embedding.
sampling_percentage = 0.6
# Fixed seed so that Restart & Run All selects the same reviews every time.
sampling_seed = 42

# Draw a seeded random permutation of the row positions and keep the leading
# `sampling_percentage` share of it as the training subset. A local RandomState
# is used so the global NumPy random state is left untouched.
index = np.random.RandomState(sampling_seed).permutation(len(df))
index_train = index[:int(sampling_percentage * len(df))]

# astype(str) converts every selected entry to text.
# NOTE(review): a missing review (NaN) would become the literal string 'nan'
# here -- confirm that is acceptable for the embedding corpus.
corpus = df['review_text'].values[index_train]
corpus = corpus.astype(str)
corpus[:5]

array(['Biasa saja dr kamar dan kebersihan',
       'overall menyenangkan. hanya kekurangannya saat sy datang, kelengkapan hotel spt handuk dan toiletries harus diminta dlu baru diberikan, katanya OB nya lupa meletakkan yg baru saat bersih2 ruangan. sisanya sangat baik. ada lift dan kulkas. tv kabel koneksi kurang bagus tp ga masalah.',
       'Puas dan tidur nyaman sampai di tempat untuk breakfast puas sekali!',
       'horror and not comfortable place',
       'serba baru pelengkap kamar nya masih kurang beberapa'],
      dtype='<U1886')

In [4]:
# Run Preprocessor.tokenize over every sampled review, preserving corpus order;
# prep_corpus[i] is whatever tokenize returns for corpus[i].
prep_corpus = [Preprocessor.tokenize(text) for text in corpus]
print(len(prep_corpus))
prep_corpus[:5]

86611


[['biasa', 'saja', 'dr', 'kamar', 'dan', 'kebersihan'],
 ['overall',
  'menyenangkan',
  '.',
  'hanya',
  'kekurangannya',
  'saat',
  'sy',
  'datang',
  ',',
  'kelengkapan',
  'hotel',
  'spt',
  'handuk',
  'dan',
  'toiletries',
  'harus',
  'diminta',
  'dlu',
  'baru',
  'diberikan',
  ',',
  'katanya',
  'ob',
  'nya',
  'lupa',
  'meletakkan',
  'yg',
  'baru',
  'saat',
  'bersih2',
  'ruangan',
  '.',
  'sisanya',
  'sangat',
  'baik',
  '.',
  'ada',
  'lift',
  'dan',
  'kulkas',
  '.',
  'tv',
  'kabel',
  'koneksi',
  'kurang',
  'bagus',
  'tp',
  'ga',
  'masalah',
  '.'],
 ['puas',
  'dan',
  'tidur',
  'nyaman',
  'sampai',
  'di',
  'tempat',
  'untuk',
  'breakfast',
  'puas',
  'sekali',
  '!'],
 ['horror', 'and', 'not', 'comfortable', 'place'],
 ['serba', 'baru', 'pelengkap', 'kamar', 'nya', 'masih', 'kurang', 'beberapa']]

## Embedding Construction

In [5]:
# Hyper-parameters forwarded to the Embedding constructor in the next cell.

# Passed as `size`; also embedded in the saved model's file name. The vector
# looked up at the end of the notebook has 25 components.
size = 25
# Passed as `min_count` -- presumably the minimum token frequency required to
# enter the vocabulary; TODO confirm against utility.Embedding.
min_count = 5
# Passed as `min_n` / `max_n` -- presumably the shortest and longest character
# n-gram lengths; TODO confirm against utility.Embedding.
min_n = 3
max_n = 3
# Passed as `iter`; the training log below reports 10 epochs.
iter_count = 10

In [6]:
# Create the embedding model from the hyper-parameters defined in the previous
# cell, then call build_vocab and train on the tokenized reviews.
embedding = Embedding(
    size=size,
    min_count=min_count,
    min_n=min_n,
    max_n=max_n,
    iter=iter_count,
)
embedding.build_vocab(prep_corpus)
embedding.train(prep_corpus, verbose=True)

Epoch #10/10...


In [7]:
# The file name encodes the vector size; the model is written under
# definition.MODEL_UTILITY.
embedding_filename = "fasttext_{}.bin".format(size)
embedding_path = os.path.join(definition.MODEL_UTILITY, embedding_filename)

embedding.save(embedding_path)

In [8]:
# Load the file written by the save cell into a separate, freshly constructed
# Embedding instance.
saved_model_path = os.path.join(definition.MODEL_UTILITY, embedding_filename)
emb = Embedding()
emb.load(saved_model_path)

In [9]:
# Look up the vector of a single word from the reloaded model
# ("bagus" occurs in the tokenized sample shown earlier in the notebook).
emb.get_vector_word("bagus")

array([ 1.1372153 , -1.2164351 ,  2.200044  , -0.996471  , -1.0261704 ,
       -0.44203377,  1.9339594 ,  3.0238674 ,  1.0220982 , -0.1632101 ,
        0.48570085, -3.2179549 ,  1.1257932 ,  3.6297328 , -0.79633236,
        1.7778121 , -1.8365582 ,  0.7182886 ,  1.4898195 , -2.9580905 ,
        1.5089282 ,  1.4625043 ,  2.4580147 ,  1.6280187 , -0.7348065 ],
      dtype=float32)