In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.insert(0, '..')

import os
import definition
from utility import Preprocessor, Embedding
import pandas as pd
import numpy as np

## Data Loading, Preprocessing, and Selection

In [2]:
df = pd.read_csv(definition.DATA_RAW_FILE)
df = df.drop_duplicates('review_id', 'first').reset_index().drop('index', axis=1)
print(len(df))
df.head()

144353


Unnamed: 0,review_id,review_text,review_source,review_score,category
0,73a965356fe43aa02b113b684f1a365c,Kotor berdebu. Saya tdk berhenti bersin ketika...,befc88fe53f24cb70ce7f9d2efd89858,5.5,kebersihan
1,35f506b353ff95aed86131fb01fb43bc,oke cuma air wastafel berwarna keruh mohon di...,4c5d06b02c97731aaa976179c62dcf76,3.0,fasilitas
2,e93324022b758ce67710708ac115cd18,kamar ada semutnya. kamar mandi bermasalah. bu...,4c5d06b02c97731aaa976179c62dcf76,2.0,kebersihan
3,dff13be030a1a10741848a2d9622cee4,"Kamar mandi bau, airnya bau",4c5d06b02c97731aaa976179c62dcf76,1.0,kebersihan
4,beeff65d3c2e32934261586493934f99,"kamarnya bersih dan nyaman, tetapi perlu di pe...",4c5d06b02c97731aaa976179c62dcf76,3.0,fasilitas


In [3]:
sampling_percentage = 0.6

index = np.arange(len(df))
np.random.shuffle(index)
index_train = index[:int(sampling_percentage * len(df))]

corpus = df['review_text'].values[index_train]
corpus = corpus.astype(str)
corpus[:5]

array(['Harga dan kebutuhan sesuai keinginan',
       'pelayanan ramah, pasti akan nginap dsini lg klo k Banjar \n',
       'sangat memuaskan....thank you airy...',
       'd tengah kota dekat k mana2. alun2 jln kaki... pelayanan baik dan ramah bersih lg dan tentunya dapet diskon dr AIRY',
       'minta kamar plg bawah krn bw ortu ,dikasih lt 1 ,awalna lt 2,tol diperhatikan note dr pemesan'],
      dtype='<U2863')

In [4]:
prep_corpus = []
for review in corpus:
    prep_corpus.append(Preprocessor.tokenize(review))
print(len(prep_corpus))
prep_corpus[:5]

86611


[['harga', 'dan', 'kebutuhan', 'sesuai', 'keinginan'],
 ['pelayanan',
  'ramah',
  ',',
  'pasti',
  'akan',
  'nginap',
  'dsini',
  'lg',
  'klo',
  'k',
  'banjar'],
 ['sangat', 'memuaskan', '.', 'thank', 'you', 'airy', '.'],
 ['d',
  'tengah',
  'kota',
  'dekat',
  'k',
  'mana2',
  '.',
  'alun2',
  'jln',
  'kaki',
  '.',
  'pelayanan',
  'baik',
  'dan',
  'ramah',
  'bersih',
  'lg',
  'dan',
  'tentunya',
  'dapet',
  'diskon',
  'dr',
  'airy'],
 ['minta',
  'kamar',
  'plg',
  'bawah',
  'krn',
  'bw',
  'ortu',
  ',',
  'dikasih',
  'lt',
  '1',
  ',',
  'awalna',
  'lt',
  '2',
  ',',
  'tol',
  'diperhatikan',
  'note',
  'dr',
  'pemesan']]

## Embedding Construction

In [11]:
size = 25
min_count = 5
min_n = 3
max_n = 3
iter_count = 10

In [12]:
# embedding = Embedding(min_count=min_count, size=size, min_n=min_n, max_n=max_n, iter=iter_count) #FastText
embedding = Embedding(fast_text=False, min_count=min_count, size=size, iter=iter_count) #Word2Vec
embedding.build_vocab(prep_corpus)
embedding.train(prep_corpus, verbose=True)

Epoch #10/10...


In [13]:
embedding.model

<gensim.models.word2vec.Word2Vec at 0x1cbecc70a90>

In [14]:
embedding_filename = "word2vec_{}.bin".format(size)

embedding.save(os.path.join(definition.MODEL_UTILITY, embedding_filename))

In [15]:
emb = Embedding()
emb.load(os.path.join(definition.MODEL_UTILITY, embedding_filename))

In [16]:
emb.get_vector_word("srpan")

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)