In [None]:
# Mount Google Drive at /content/drive; the dataset path used later lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**List Library**

In [None]:
# Importing library
import numpy as np
import pandas as pd

# BeautifulSoup = hapus tag html
from bs4 import BeautifulSoup 
import re # regular expressions (regex)

# natural language tool kits
from nltk.corpus import stopwords
import nltk

# word2vec library
from gensim.models import word2vec
import itertools

# library untuk Random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Path of the IMDB reviews CSV on the mounted Drive
data_path = r'/content/drive/MyDrive/Text Dataset/IMDB Dataset.csv'
dataset = pd.read_csv(data_path, header=0, sep = ',')

# Positional train/test split: the first TRAIN_SIZE rows are used for training and
# every remaining row is held out for testing.
# Slice end bounds are exclusive, so the earlier [0:39999] / [40000:49999] slices
# silently dropped row 39999 and every row from index 49999 on.
TRAIN_SIZE = 40000
train = dataset.iloc[:TRAIN_SIZE]
test = dataset.iloc[TRAIN_SIZE:]

**Preprocessing Dataset**

In [None]:
# Text preprocessing

# Cache for the stop-word set so the NLTK corpus is read once instead of on every call.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def review_wordlist(review, remove_stopwords=False):
    """Turn one raw review (or a single sentence of it) into a list of lowercase words.

    Args:
        review: Text that may contain HTML markup.
        remove_stopwords: When True, drop every word found in NLTK's English
            stop-word list (the 'stopwords' corpus must already be downloaded).

    Returns:
        A list of lowercase, purely alphabetic word strings; it may be empty.
    """
    # Strip HTML tags. The parser is named explicitly so the result does not depend
    # on which optional parsers are installed, and the " " separator keeps the words
    # on either side of a tag (e.g. "good<br />bad") from being glued together.
    review_text = BeautifulSoup(review, "html.parser").get_text(" ")

    # Replace everything that is not an ASCII letter with a space
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # Lowercase and split on whitespace
    words = review_text.lower().split()

    # Optionally filter out stop words
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    return words

In [None]:
# Download the NLTK 'punkt' (sentence tokenizer) and 'stopwords' packages
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Word2Vec is trained on a list of sentences, each sentence being a list of words.
# The punkt tokenizer is used to split each review into sentences first.

import nltk.data

# Load the English punkt sentence tokenizer (requires the 'punkt' package downloaded above)
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Split a review into sentences
def review_sentences(review, tokenizer, remove_stopwords=False):
    """Break a review into sentences and convert each sentence to a word list.

    Args:
        review: Raw review text; surrounding whitespace is stripped before tokenizing.
        tokenizer: Object whose ``tokenize(text)`` method returns the sentences.
        remove_stopwords: Forwarded unchanged to ``review_wordlist``.

    Returns:
        A list of lists: one word list (from ``review_wordlist``) per non-empty
        sentence, in the order the tokenizer produced them.
    """
    stripped = review.strip()
    return [
        review_wordlist(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(stripped)
        if len(chunk) > 0  # skip empty sentences
    ]

In [None]:
# Gather the tokenized sentences of every training review into one flat list
sentences = []
print("Parsing sentences from training set")
for review in train["review"]:
    sentences.extend(review_sentences(review, tokenizer))

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [None]:
# Configure the built-in logging module: show INFO-level records as "<time> : <level> : <message>"
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Sanity check: `sentences` should be a list whose elements are word lists.
# Print the container types, one sample sentence and the total number of sentences.
print("List dari lists. Cek tipe data : ", type(sentences), " of ", type(sentences[0]))
print(sentences[10])
print(len(sentences))

List dari lists. Cek tipe data :  <class 'list'>  of  <class 'list'>
['a', 'wonderful', 'little', 'production']
428144


**SEARCH BEST COMBINATION PARAMETER**

In [None]:
# Module-level accumulators filled by cust_param_search: one accuracy and one
# parameter tuple are appended per evaluated combination. Re-run this cell to reset them.
best_score = []
best_parameter = []

In [None]:
# Parameters to optimize. Each value is a tuple of candidate values;
# cust_param_search evaluates every combination of them.
parm_dict = {
    'workers' : (4,),
    'size' : (300, 350, 400, 450, 500),
    'min_count' : (40,),
    'window' : (10,)
}

# Title template: doing X, with method Y, for Z
# Optimization of the word-embedding feature dimension (vector size) using grid search for sentiment analysis

def _new_word2vec(workers, size, min_count, window):
    """Create an untrained Word2Vec model with the given hyper-parameters.

    'sg' is left at its default, so the model is trained with CBOW.
    """
    try:
        # gensim >= 4.0 names the embedding dimension 'vector_size'.
        return word2vec.Word2Vec(workers=workers, vector_size=size,
                                 min_count=min_count, window=window)
    except TypeError:
        # gensim 3.x rejects 'vector_size' and expects the older 'size' keyword.
        return word2vec.Word2Vec(workers=workers, size=size,
                                 min_count=min_count, window=window)


def _mean_word_vector(words, keyed_vectors, num_features):
    """Average the embeddings of the words of one review that are in the vocabulary.

    Returns a float32 vector of length num_features. A review without any
    in-vocabulary word yields the zero vector instead of a NaN-filled one
    (dividing by a zero word count would otherwise poison the classifier input).
    """
    total = np.zeros(num_features, dtype="float32")
    nwords = 0
    for word in words:
        if word in keyed_vectors:
            total = np.add(total, keyed_vectors[word])
            nwords += 1
    if nwords == 0:
        return total
    return np.divide(total, nwords)


def _review_feature_matrix(reviews, keyed_vectors, num_features):
    """Stack the averaged word vector of every review into a (len(reviews), num_features) array."""
    features = np.zeros((len(reviews), num_features), dtype="float32")
    for counter, review in enumerate(reviews):
        # Print a status message every 1000th review
        if counter % 1000 == 0:
            print("Review %d of %d" % (counter, len(reviews)))
        features[counter] = _mean_word_vector(review, keyed_vectors, num_features)
    return features


def cust_param_search(parm_dict):
    """Grid-search Word2Vec hyper-parameters by downstream classification accuracy.

    For every combination of the candidate values, a Word2Vec model is trained on
    the module-level ``sentences``, each review in ``train``/``test`` is turned
    into the average of its word vectors, and ``classification_model`` scores the
    resulting features.

    Args:
        parm_dict: Mapping with the keys 'workers', 'size', 'min_count' and
            'window'; each value is an iterable of candidate values.

    Returns:
        None. Progress and results are printed. The accuracy and parameter tuple
        of every combination are appended to the module-level ``best_score`` and
        ``best_parameter`` lists.

    Raises:
        KeyError: If one of the four expected keys is missing from parm_dict.
    """
    bar = '=============================================================='

    # Look the candidate tuples up by name so the search does not depend on the
    # order in which the keys were written in the dict.
    workers, size, min_count, window = (
        parm_dict[key] for key in ('workers', 'size', 'min_count', 'window')
    )
    parm_combo = list(itertools.product(workers, size, min_count, window))  # all combinations

    print('\n' + bar)
    print('PARAMETER')
    print(bar)
    print(f'Total kombinasi parameter   : {len(parm_combo)}')
    print(f'Semua kombinasi parameter   : {parm_combo}')

    # The cleaned reviews do not depend on the Word2Vec parameters, so they are
    # prepared once here instead of once per combination.
    clean_train_reviews = [review_wordlist(review, remove_stopwords=True)
                           for review in train['review']]
    clean_test_reviews = [review_wordlist(review, remove_stopwords=True)
                          for review in test['review']]

    # Grid search
    score_best, parm_best = 0, ()
    for i, parms in enumerate(parm_combo, start=1):

        print('\n' + bar)
        print(f'{i}.\t Parameter : {parms}')
        print(bar)

        w, s, m, wi = parms

        # ---------------------------------------------------------------------
        # Word2Vec: build the vocabulary and train the embedding
        # ---------------------------------------------------------------------
        model = _new_word2vec(w, s, m, wi)
        model.build_vocab(sentences)

        print('\n' + bar)
        print("Training model Word2Vec ...")
        print(bar)
        # The corpus is passed positionally because its keyword name differs
        # between gensim 3.x ('sentences') and gensim 4.x ('corpus_iterable').
        model.train(sentences, total_examples=len(sentences), epochs=5)

        # Average word vectors per review; model.wv supports both membership
        # tests and vector lookup, unlike the deprecated model[word] access.
        trainDataVecs = _review_feature_matrix(clean_train_reviews, model.wv, s)
        testDataVecs = _review_feature_matrix(clean_test_reviews, model.wv, s)

        # ---------------------------------------------------------------------
        # Classification on the averaged vectors
        # ---------------------------------------------------------------------
        print('\n' + bar)
        print('Sentiment Analysis Process ...')
        print(bar)
        score, report = classification_model(trainDataVecs, testDataVecs,
                                             train["sentiment"], test["sentiment"])

        # Per-combination report
        print('\n' + bar)
        print(f'Hasil Test Parameter ke - {i}')
        print(f'Parameter  => {parms}')
        print(f'Accuracy   => {score}')
        print(f'Classification Report => \n {report}')
        print(bar)

        best_score.append(score)
        best_parameter.append(parms)

        if score > score_best:
            score_best = score
            parm_best = parms

    print('\n' + bar)
    print(f'Best Parameter  => {parm_best}')
    print(f'Accuracy   => {score_best}')
    print(bar)

In [None]:
def classification_model(train_feature_vec, test_feature_vec, train_sentiment, test_sentiment,
                         n_estimators=100, random_state=None):
    """Fit a random forest on the training features and score it on the test features.

    Any classifier could be used here; a random forest is the current choice.

    Args:
        train_feature_vec: Feature matrix used to fit the classifier.
        test_feature_vec: Feature matrix to predict on.
        train_sentiment: Labels matching train_feature_vec.
        test_sentiment: Labels matching test_feature_vec.
        n_estimators: Number of trees in the forest (defaults to the previously
            hard-coded 100).
        random_state: Optional seed forwarded to RandomForestClassifier. Pass a
            fixed value to make scores comparable across runs; the default None
            keeps the previous unseeded behaviour.

    Returns:
        Tuple ``(accuracy, report)``: the accuracy score on the test set and the
        text classification report formatted with 5 digits.
    """
    # Fit the random forest classifier on the training data
    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    forest.fit(train_feature_vec, train_sentiment)

    # Predict the sentiment of the test data
    predicted = forest.predict(test_feature_vec)

    # Accuracy and per-class report
    accuracy = accuracy_score(test_sentiment, predicted)
    report = classification_report(test_sentiment, predicted, digits=5)

    return accuracy, report

In [None]:
# Run the grid search (the author's estimated running time: 17 minutes)
cust_param_search(parm_dict)

2020-12-22 15:08:26,791 : INFO : collecting all words and their counts
2020-12-22 15:08:26,792 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-12-22 15:08:26,851 : INFO : PROGRESS: at sentence #10000, processed 224307 words, keeping 17319 word types
2020-12-22 15:08:26,908 : INFO : PROGRESS: at sentence #20000, processed 434605 words, keeping 24175 word types
2020-12-22 15:08:26,967 : INFO : PROGRESS: at sentence #30000, processed 655533 words, keeping 29419 word types



PARAMETER
Total kombinasi parameter   : 7
Semua kombinasi parameter   : [(4, 300, 40, 10), (4, 350, 40, 10), (4, 400, 40, 10), (4, 450, 40, 10), (4, 500, 40, 10), (4, 550, 40, 10), (4, 600, 40, 10)]

1.	 Parameter : (4, 300, 40, 10)


2020-12-22 15:08:27,029 : INFO : PROGRESS: at sentence #40000, processed 877306 words, keeping 33897 word types
2020-12-22 15:08:27,092 : INFO : PROGRESS: at sentence #50000, processed 1100000 words, keeping 37667 word types
2020-12-22 15:08:27,151 : INFO : PROGRESS: at sentence #60000, processed 1324580 words, keeping 40923 word types
2020-12-22 15:08:27,209 : INFO : PROGRESS: at sentence #70000, processed 1543432 words, keeping 43631 word types
2020-12-22 15:08:27,268 : INFO : PROGRESS: at sentence #80000, processed 1757050 words, keeping 46072 word types
2020-12-22 15:08:27,330 : INFO : PROGRESS: at sentence #90000, processed 1976526 words, keeping 48547 word types
2020-12-22 15:08:27,398 : INFO : PROGRESS: at sentence #100000, processed 2199610 words, keeping 50850 word types
2020-12-22 15:08:27,473 : INFO : PROGRESS: at sentence #110000, processed 2412028 words, keeping 52887 word types
2020-12-22 15:08:27,538 : INFO : PROGRESS: at sentence #120000, processed 2631483 words, keepin


Training model Word2Vec ...


2020-12-22 15:08:33,103 : INFO : EPOCH 1 - PROGRESS: at 5.24% examples, 332533 words/s, in_qsize 7, out_qsize 1
2020-12-22 15:08:34,134 : INFO : EPOCH 1 - PROGRESS: at 10.73% examples, 342093 words/s, in_qsize 7, out_qsize 1
2020-12-22 15:08:35,146 : INFO : EPOCH 1 - PROGRESS: at 16.20% examples, 347376 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:08:36,164 : INFO : EPOCH 1 - PROGRESS: at 21.67% examples, 347892 words/s, in_qsize 6, out_qsize 1
2020-12-22 15:08:37,175 : INFO : EPOCH 1 - PROGRESS: at 27.22% examples, 349822 words/s, in_qsize 6, out_qsize 1
2020-12-22 15:08:38,208 : INFO : EPOCH 1 - PROGRESS: at 32.69% examples, 349918 words/s, in_qsize 4, out_qsize 3
2020-12-22 15:08:39,209 : INFO : EPOCH 1 - PROGRESS: at 38.36% examples, 351776 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:08:40,247 : INFO : EPOCH 1 - PROGRESS: at 43.75% examples, 350597 words/s, in_qsize 5, out_qsize 2
2020-12-22 15:08:41,271 : INFO : EPOCH 1 - PROGRESS: at 49.33% examples, 351042 words/s, in_qsize

Review 0 of 39999




Review 1000 of 39999
Review 2000 of 39999
Review 3000 of 39999
Review 4000 of 39999
Review 5000 of 39999
Review 6000 of 39999
Review 7000 of 39999
Review 8000 of 39999
Review 9000 of 39999
Review 10000 of 39999
Review 11000 of 39999
Review 12000 of 39999
Review 13000 of 39999
Review 14000 of 39999
Review 15000 of 39999
Review 16000 of 39999
Review 17000 of 39999
Review 18000 of 39999
Review 19000 of 39999
Review 20000 of 39999
Review 21000 of 39999
Review 22000 of 39999
Review 23000 of 39999
Review 24000 of 39999
Review 25000 of 39999
Review 26000 of 39999
Review 27000 of 39999
Review 28000 of 39999
Review 29000 of 39999
Review 30000 of 39999
Review 31000 of 39999
Review 32000 of 39999
Review 33000 of 39999
Review 34000 of 39999
Review 35000 of 39999
Review 36000 of 39999
Review 37000 of 39999
Review 38000 of 39999
Review 39000 of 39999
Review 0 of 9999
Review 1000 of 9999
Review 2000 of 9999
Review 3000 of 9999
Review 4000 of 9999
Review 5000 of 9999
Review 6000 of 9999
Review 7000 of

2020-12-22 15:13:28,513 : INFO : collecting all words and their counts
2020-12-22 15:13:28,515 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-12-22 15:13:28,571 : INFO : PROGRESS: at sentence #10000, processed 224307 words, keeping 17319 word types
2020-12-22 15:13:28,620 : INFO : PROGRESS: at sentence #20000, processed 434605 words, keeping 24175 word types
2020-12-22 15:13:28,676 : INFO : PROGRESS: at sentence #30000, processed 655533 words, keeping 29419 word types



Hasil Test Parameter ke - 1
Parameter  => (4, 300, 40, 10)
Accuracy   => 0.8283828382838284
Classification Report => 
               precision    recall  f1-score   support

    negative    0.84239   0.80729   0.82447      4992
    positive    0.81553   0.84941   0.83213      5007

    accuracy                        0.82838      9999
   macro avg    0.82896   0.82835   0.82830      9999
weighted avg    0.82894   0.82838   0.82830      9999


2.	 Parameter : (4, 350, 40, 10)


2020-12-22 15:13:28,733 : INFO : PROGRESS: at sentence #40000, processed 877306 words, keeping 33897 word types
2020-12-22 15:13:28,791 : INFO : PROGRESS: at sentence #50000, processed 1100000 words, keeping 37667 word types
2020-12-22 15:13:28,853 : INFO : PROGRESS: at sentence #60000, processed 1324580 words, keeping 40923 word types
2020-12-22 15:13:28,907 : INFO : PROGRESS: at sentence #70000, processed 1543432 words, keeping 43631 word types
2020-12-22 15:13:28,966 : INFO : PROGRESS: at sentence #80000, processed 1757050 words, keeping 46072 word types
2020-12-22 15:13:29,024 : INFO : PROGRESS: at sentence #90000, processed 1976526 words, keeping 48547 word types
2020-12-22 15:13:29,080 : INFO : PROGRESS: at sentence #100000, processed 2199610 words, keeping 50850 word types
2020-12-22 15:13:29,132 : INFO : PROGRESS: at sentence #110000, processed 2412028 words, keeping 52887 word types
2020-12-22 15:13:29,186 : INFO : PROGRESS: at sentence #120000, processed 2631483 words, keepin


Training model Word2Vec ...


2020-12-22 15:13:34,556 : INFO : EPOCH 1 - PROGRESS: at 4.39% examples, 274269 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:13:35,575 : INFO : EPOCH 1 - PROGRESS: at 8.83% examples, 280839 words/s, in_qsize 8, out_qsize 0
2020-12-22 15:13:36,590 : INFO : EPOCH 1 - PROGRESS: at 13.22% examples, 284267 words/s, in_qsize 8, out_qsize 1
2020-12-22 15:13:37,605 : INFO : EPOCH 1 - PROGRESS: at 17.70% examples, 284485 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:13:38,667 : INFO : EPOCH 1 - PROGRESS: at 22.29% examples, 284045 words/s, in_qsize 6, out_qsize 1
2020-12-22 15:13:39,682 : INFO : EPOCH 1 - PROGRESS: at 27.01% examples, 286940 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:13:40,700 : INFO : EPOCH 1 - PROGRESS: at 31.33% examples, 286096 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:13:41,701 : INFO : EPOCH 1 - PROGRESS: at 35.51% examples, 284391 words/s, in_qsize 6, out_qsize 1
2020-12-22 15:13:42,755 : INFO : EPOCH 1 - PROGRESS: at 39.75% examples, 281407 words/s, in_qsize 

Review 0 of 39999




Review 1000 of 39999
Review 2000 of 39999
Review 3000 of 39999
Review 4000 of 39999
Review 5000 of 39999
Review 6000 of 39999
Review 7000 of 39999
Review 8000 of 39999
Review 9000 of 39999
Review 10000 of 39999
Review 11000 of 39999
Review 12000 of 39999
Review 13000 of 39999
Review 14000 of 39999
Review 15000 of 39999
Review 16000 of 39999
Review 17000 of 39999
Review 18000 of 39999
Review 19000 of 39999
Review 20000 of 39999
Review 21000 of 39999
Review 22000 of 39999
Review 23000 of 39999
Review 24000 of 39999
Review 25000 of 39999
Review 26000 of 39999
Review 27000 of 39999
Review 28000 of 39999
Review 29000 of 39999
Review 30000 of 39999
Review 31000 of 39999
Review 32000 of 39999
Review 33000 of 39999
Review 34000 of 39999
Review 35000 of 39999
Review 36000 of 39999
Review 37000 of 39999
Review 38000 of 39999
Review 39000 of 39999
Review 0 of 9999
Review 1000 of 9999
Review 2000 of 9999
Review 3000 of 9999
Review 4000 of 9999
Review 5000 of 9999
Review 6000 of 9999
Review 7000 of

2020-12-22 15:19:01,369 : INFO : collecting all words and their counts
2020-12-22 15:19:01,376 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-12-22 15:19:01,426 : INFO : PROGRESS: at sentence #10000, processed 224307 words, keeping 17319 word types
2020-12-22 15:19:01,476 : INFO : PROGRESS: at sentence #20000, processed 434605 words, keeping 24175 word types
2020-12-22 15:19:01,531 : INFO : PROGRESS: at sentence #30000, processed 655533 words, keeping 29419 word types



Hasil Test Parameter ke - 2
Parameter  => (4, 350, 40, 10)
Accuracy   => 0.8264826482648265
Classification Report => 
               precision    recall  f1-score   support

    negative    0.83977   0.80629   0.82269      4992
    positive    0.81425   0.84661   0.83012      5007

    accuracy                        0.82648      9999
   macro avg    0.82701   0.82645   0.82640      9999
weighted avg    0.82699   0.82648   0.82641      9999


3.	 Parameter : (4, 400, 40, 10)


2020-12-22 15:19:01,590 : INFO : PROGRESS: at sentence #40000, processed 877306 words, keeping 33897 word types
2020-12-22 15:19:01,654 : INFO : PROGRESS: at sentence #50000, processed 1100000 words, keeping 37667 word types
2020-12-22 15:19:01,712 : INFO : PROGRESS: at sentence #60000, processed 1324580 words, keeping 40923 word types
2020-12-22 15:19:01,768 : INFO : PROGRESS: at sentence #70000, processed 1543432 words, keeping 43631 word types
2020-12-22 15:19:01,826 : INFO : PROGRESS: at sentence #80000, processed 1757050 words, keeping 46072 word types
2020-12-22 15:19:01,883 : INFO : PROGRESS: at sentence #90000, processed 1976526 words, keeping 48547 word types
2020-12-22 15:19:01,939 : INFO : PROGRESS: at sentence #100000, processed 2199610 words, keeping 50850 word types
2020-12-22 15:19:01,993 : INFO : PROGRESS: at sentence #110000, processed 2412028 words, keeping 52887 word types
2020-12-22 15:19:02,047 : INFO : PROGRESS: at sentence #120000, processed 2631483 words, keepin


Training model Word2Vec ...


2020-12-22 15:19:07,465 : INFO : EPOCH 1 - PROGRESS: at 4.72% examples, 298818 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:19:08,500 : INFO : EPOCH 1 - PROGRESS: at 9.57% examples, 304459 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:19:09,530 : INFO : EPOCH 1 - PROGRESS: at 14.25% examples, 304367 words/s, in_qsize 6, out_qsize 1
2020-12-22 15:19:10,572 : INFO : EPOCH 1 - PROGRESS: at 19.32% examples, 307034 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:19:11,595 : INFO : EPOCH 1 - PROGRESS: at 24.08% examples, 307465 words/s, in_qsize 7, out_qsize 1
2020-12-22 15:19:12,587 : INFO : EPOCH 1 - PROGRESS: at 29.01% examples, 309480 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:19:13,631 : INFO : EPOCH 1 - PROGRESS: at 33.97% examples, 310149 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:19:14,635 : INFO : EPOCH 1 - PROGRESS: at 38.90% examples, 310464 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:19:15,640 : INFO : EPOCH 1 - PROGRESS: at 43.65% examples, 310677 words/s, in_qsize 

Review 0 of 39999




Review 1000 of 39999
Review 2000 of 39999
Review 3000 of 39999
Review 4000 of 39999
Review 5000 of 39999
Review 6000 of 39999
Review 7000 of 39999
Review 8000 of 39999
Review 9000 of 39999
Review 10000 of 39999
Review 11000 of 39999
Review 12000 of 39999
Review 13000 of 39999
Review 14000 of 39999
Review 15000 of 39999
Review 16000 of 39999
Review 17000 of 39999
Review 18000 of 39999
Review 19000 of 39999
Review 20000 of 39999
Review 21000 of 39999
Review 22000 of 39999
Review 23000 of 39999
Review 24000 of 39999
Review 25000 of 39999
Review 26000 of 39999
Review 27000 of 39999
Review 28000 of 39999
Review 29000 of 39999
Review 30000 of 39999
Review 31000 of 39999
Review 32000 of 39999
Review 33000 of 39999
Review 34000 of 39999
Review 35000 of 39999
Review 36000 of 39999
Review 37000 of 39999
Review 38000 of 39999
Review 39000 of 39999
Review 0 of 9999
Review 1000 of 9999
Review 2000 of 9999
Review 3000 of 9999
Review 4000 of 9999
Review 5000 of 9999
Review 6000 of 9999
Review 7000 of

2020-12-22 15:24:36,081 : INFO : collecting all words and their counts
2020-12-22 15:24:36,082 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-12-22 15:24:36,142 : INFO : PROGRESS: at sentence #10000, processed 224307 words, keeping 17319 word types
2020-12-22 15:24:36,200 : INFO : PROGRESS: at sentence #20000, processed 434605 words, keeping 24175 word types
2020-12-22 15:24:36,257 : INFO : PROGRESS: at sentence #30000, processed 655533 words, keeping 29419 word types



Hasil Test Parameter ke - 3
Parameter  => (4, 400, 40, 10)
Accuracy   => 0.8294829482948295
Classification Report => 
               precision    recall  f1-score   support

    negative    0.84275   0.80950   0.82579      4992
    positive    0.81726   0.84941   0.83302      5007

    accuracy                        0.82948      9999
   macro avg    0.83000   0.82945   0.82941      9999
weighted avg    0.82999   0.82948   0.82941      9999


4.	 Parameter : (4, 450, 40, 10)


2020-12-22 15:24:36,317 : INFO : PROGRESS: at sentence #40000, processed 877306 words, keeping 33897 word types
2020-12-22 15:24:36,380 : INFO : PROGRESS: at sentence #50000, processed 1100000 words, keeping 37667 word types
2020-12-22 15:24:36,436 : INFO : PROGRESS: at sentence #60000, processed 1324580 words, keeping 40923 word types
2020-12-22 15:24:36,492 : INFO : PROGRESS: at sentence #70000, processed 1543432 words, keeping 43631 word types
2020-12-22 15:24:36,549 : INFO : PROGRESS: at sentence #80000, processed 1757050 words, keeping 46072 word types
2020-12-22 15:24:36,605 : INFO : PROGRESS: at sentence #90000, processed 1976526 words, keeping 48547 word types
2020-12-22 15:24:36,661 : INFO : PROGRESS: at sentence #100000, processed 2199610 words, keeping 50850 word types
2020-12-22 15:24:36,716 : INFO : PROGRESS: at sentence #110000, processed 2412028 words, keeping 52887 word types
2020-12-22 15:24:36,771 : INFO : PROGRESS: at sentence #120000, processed 2631483 words, keepin


Training model Word2Vec ...


2020-12-22 15:24:42,169 : INFO : EPOCH 1 - PROGRESS: at 4.82% examples, 296489 words/s, in_qsize 7, out_qsize 2
2020-12-22 15:24:43,207 : INFO : EPOCH 1 - PROGRESS: at 9.88% examples, 309212 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:24:44,234 : INFO : EPOCH 1 - PROGRESS: at 14.87% examples, 314549 words/s, in_qsize 6, out_qsize 1
2020-12-22 15:24:45,247 : INFO : EPOCH 1 - PROGRESS: at 19.84% examples, 315083 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:24:46,260 : INFO : EPOCH 1 - PROGRESS: at 24.96% examples, 318095 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:24:47,266 : INFO : EPOCH 1 - PROGRESS: at 30.05% examples, 320376 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:24:48,274 : INFO : EPOCH 1 - PROGRESS: at 34.96% examples, 320203 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:24:49,289 : INFO : EPOCH 1 - PROGRESS: at 40.07% examples, 320488 words/s, in_qsize 7, out_qsize 1
2020-12-22 15:24:50,307 : INFO : EPOCH 1 - PROGRESS: at 45.16% examples, 321467 words/s, in_qsize 

Review 0 of 39999




Review 1000 of 39999
Review 2000 of 39999
Review 3000 of 39999
Review 4000 of 39999
Review 5000 of 39999
Review 6000 of 39999
Review 7000 of 39999
Review 8000 of 39999
Review 9000 of 39999
Review 10000 of 39999
Review 11000 of 39999
Review 12000 of 39999
Review 13000 of 39999
Review 14000 of 39999
Review 15000 of 39999
Review 16000 of 39999
Review 17000 of 39999
Review 18000 of 39999
Review 19000 of 39999
Review 20000 of 39999
Review 21000 of 39999
Review 22000 of 39999
Review 23000 of 39999
Review 24000 of 39999
Review 25000 of 39999
Review 26000 of 39999
Review 27000 of 39999
Review 28000 of 39999
Review 29000 of 39999
Review 30000 of 39999
Review 31000 of 39999
Review 32000 of 39999
Review 33000 of 39999
Review 34000 of 39999
Review 35000 of 39999
Review 36000 of 39999
Review 37000 of 39999
Review 38000 of 39999
Review 39000 of 39999
Review 0 of 9999
Review 1000 of 9999
Review 2000 of 9999
Review 3000 of 9999
Review 4000 of 9999
Review 5000 of 9999
Review 6000 of 9999
Review 7000 of

2020-12-22 15:30:09,137 : INFO : collecting all words and their counts
2020-12-22 15:30:09,138 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-12-22 15:30:09,196 : INFO : PROGRESS: at sentence #10000, processed 224307 words, keeping 17319 word types
2020-12-22 15:30:09,246 : INFO : PROGRESS: at sentence #20000, processed 434605 words, keeping 24175 word types
2020-12-22 15:30:09,302 : INFO : PROGRESS: at sentence #30000, processed 655533 words, keeping 29419 word types



Hasil Test Parameter ke - 4
Parameter  => (4, 450, 40, 10)
Accuracy   => 0.831983198319832
Classification Report => 
               precision    recall  f1-score   support

    negative    0.84328   0.81490   0.82885      4992
    positive    0.82145   0.84901   0.83500      5007

    accuracy                        0.83198      9999
   macro avg    0.83237   0.83196   0.83193      9999
weighted avg    0.83235   0.83198   0.83193      9999


5.	 Parameter : (4, 500, 40, 10)


2020-12-22 15:30:09,362 : INFO : PROGRESS: at sentence #40000, processed 877306 words, keeping 33897 word types
2020-12-22 15:30:09,419 : INFO : PROGRESS: at sentence #50000, processed 1100000 words, keeping 37667 word types
2020-12-22 15:30:09,473 : INFO : PROGRESS: at sentence #60000, processed 1324580 words, keeping 40923 word types
2020-12-22 15:30:09,530 : INFO : PROGRESS: at sentence #70000, processed 1543432 words, keeping 43631 word types
2020-12-22 15:30:09,599 : INFO : PROGRESS: at sentence #80000, processed 1757050 words, keeping 46072 word types
2020-12-22 15:30:09,664 : INFO : PROGRESS: at sentence #90000, processed 1976526 words, keeping 48547 word types
2020-12-22 15:30:09,729 : INFO : PROGRESS: at sentence #100000, processed 2199610 words, keeping 50850 word types
2020-12-22 15:30:09,789 : INFO : PROGRESS: at sentence #110000, processed 2412028 words, keeping 52887 word types
2020-12-22 15:30:09,850 : INFO : PROGRESS: at sentence #120000, processed 2631483 words, keepin


Training model Word2Vec ...


2020-12-22 15:30:15,367 : INFO : EPOCH 1 - PROGRESS: at 3.63% examples, 236024 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:30:16,391 : INFO : EPOCH 1 - PROGRESS: at 7.56% examples, 244104 words/s, in_qsize 6, out_qsize 1
2020-12-22 15:30:17,417 : INFO : EPOCH 1 - PROGRESS: at 11.66% examples, 251125 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:30:18,427 : INFO : EPOCH 1 - PROGRESS: at 15.41% examples, 250468 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:30:19,509 : INFO : EPOCH 1 - PROGRESS: at 19.63% examples, 250681 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:30:20,510 : INFO : EPOCH 1 - PROGRESS: at 23.63% examples, 252910 words/s, in_qsize 8, out_qsize 0
2020-12-22 15:30:21,510 : INFO : EPOCH 1 - PROGRESS: at 27.43% examples, 251627 words/s, in_qsize 7, out_qsize 1
2020-12-22 15:30:22,541 : INFO : EPOCH 1 - PROGRESS: at 31.53% examples, 253130 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:30:23,546 : INFO : EPOCH 1 - PROGRESS: at 35.50% examples, 253525 words/s, in_qsize 

Review 0 of 39999




Review 1000 of 39999
Review 2000 of 39999
Review 3000 of 39999
Review 4000 of 39999
Review 5000 of 39999
Review 6000 of 39999
Review 7000 of 39999
Review 8000 of 39999
Review 9000 of 39999
Review 10000 of 39999
Review 11000 of 39999
Review 12000 of 39999
Review 13000 of 39999
Review 14000 of 39999
Review 15000 of 39999
Review 16000 of 39999
Review 17000 of 39999
Review 18000 of 39999
Review 19000 of 39999
Review 20000 of 39999
Review 21000 of 39999
Review 22000 of 39999
Review 23000 of 39999
Review 24000 of 39999
Review 25000 of 39999
Review 26000 of 39999
Review 27000 of 39999
Review 28000 of 39999
Review 29000 of 39999
Review 30000 of 39999
Review 31000 of 39999
Review 32000 of 39999
Review 33000 of 39999
Review 34000 of 39999
Review 35000 of 39999
Review 36000 of 39999
Review 37000 of 39999
Review 38000 of 39999
Review 39000 of 39999
Review 0 of 9999
Review 1000 of 9999
Review 2000 of 9999
Review 3000 of 9999
Review 4000 of 9999
Review 5000 of 9999
Review 6000 of 9999
Review 7000 of

2020-12-22 15:36:17,209 : INFO : collecting all words and their counts
2020-12-22 15:36:17,212 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-12-22 15:36:17,267 : INFO : PROGRESS: at sentence #10000, processed 224307 words, keeping 17319 word types
2020-12-22 15:36:17,318 : INFO : PROGRESS: at sentence #20000, processed 434605 words, keeping 24175 word types
2020-12-22 15:36:17,372 : INFO : PROGRESS: at sentence #30000, processed 655533 words, keeping 29419 word types



Hasil Test Parameter ke - 5
Parameter  => (4, 500, 40, 10)
Accuracy   => 0.8310831083108311
Classification Report => 
               precision    recall  f1-score   support

    negative    0.84413   0.81150   0.82749      4992
    positive    0.81904   0.85061   0.83453      5007

    accuracy                        0.83108      9999
   macro avg    0.83159   0.83105   0.83101      9999
weighted avg    0.83157   0.83108   0.83102      9999


6.	 Parameter : (4, 550, 40, 10)


2020-12-22 15:36:17,430 : INFO : PROGRESS: at sentence #40000, processed 877306 words, keeping 33897 word types
2020-12-22 15:36:17,490 : INFO : PROGRESS: at sentence #50000, processed 1100000 words, keeping 37667 word types
2020-12-22 15:36:17,541 : INFO : PROGRESS: at sentence #60000, processed 1324580 words, keeping 40923 word types
2020-12-22 15:36:17,593 : INFO : PROGRESS: at sentence #70000, processed 1543432 words, keeping 43631 word types
2020-12-22 15:36:17,652 : INFO : PROGRESS: at sentence #80000, processed 1757050 words, keeping 46072 word types
2020-12-22 15:36:17,708 : INFO : PROGRESS: at sentence #90000, processed 1976526 words, keeping 48547 word types
2020-12-22 15:36:17,762 : INFO : PROGRESS: at sentence #100000, processed 2199610 words, keeping 50850 word types
2020-12-22 15:36:17,816 : INFO : PROGRESS: at sentence #110000, processed 2412028 words, keeping 52887 word types
2020-12-22 15:36:17,870 : INFO : PROGRESS: at sentence #120000, processed 2631483 words, keepin


Training model Word2Vec ...


2020-12-22 15:36:23,421 : INFO : EPOCH 1 - PROGRESS: at 3.75% examples, 239944 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:36:24,427 : INFO : EPOCH 1 - PROGRESS: at 7.96% examples, 258663 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:36:25,475 : INFO : EPOCH 1 - PROGRESS: at 12.19% examples, 261108 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:36:26,529 : INFO : EPOCH 1 - PROGRESS: at 16.51% examples, 263640 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:36:27,532 : INFO : EPOCH 1 - PROGRESS: at 20.68% examples, 265122 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:36:28,564 : INFO : EPOCH 1 - PROGRESS: at 24.85% examples, 264804 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:36:29,586 : INFO : EPOCH 1 - PROGRESS: at 29.02% examples, 264871 words/s, in_qsize 8, out_qsize 1
2020-12-22 15:36:30,629 : INFO : EPOCH 1 - PROGRESS: at 33.12% examples, 264354 words/s, in_qsize 8, out_qsize 0
2020-12-22 15:36:31,683 : INFO : EPOCH 1 - PROGRESS: at 37.47% examples, 264419 words/s, in_qsize 

Review 0 of 39999




Review 1000 of 39999
Review 2000 of 39999
Review 3000 of 39999
Review 4000 of 39999
Review 5000 of 39999
Review 6000 of 39999
Review 7000 of 39999
Review 8000 of 39999
Review 9000 of 39999
Review 10000 of 39999
Review 11000 of 39999
Review 12000 of 39999
Review 13000 of 39999
Review 14000 of 39999
Review 15000 of 39999
Review 16000 of 39999
Review 17000 of 39999
Review 18000 of 39999
Review 19000 of 39999
Review 20000 of 39999
Review 21000 of 39999
Review 22000 of 39999
Review 23000 of 39999
Review 24000 of 39999
Review 25000 of 39999
Review 26000 of 39999
Review 27000 of 39999
Review 28000 of 39999
Review 29000 of 39999
Review 30000 of 39999
Review 31000 of 39999
Review 32000 of 39999
Review 33000 of 39999
Review 34000 of 39999
Review 35000 of 39999
Review 36000 of 39999
Review 37000 of 39999
Review 38000 of 39999
Review 39000 of 39999
Review 0 of 9999
Review 1000 of 9999
Review 2000 of 9999
Review 3000 of 9999
Review 4000 of 9999
Review 5000 of 9999
Review 6000 of 9999
Review 7000 of

2020-12-22 15:42:24,454 : INFO : collecting all words and their counts
2020-12-22 15:42:24,455 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-12-22 15:42:24,510 : INFO : PROGRESS: at sentence #10000, processed 224307 words, keeping 17319 word types
2020-12-22 15:42:24,559 : INFO : PROGRESS: at sentence #20000, processed 434605 words, keeping 24175 word types
2020-12-22 15:42:24,614 : INFO : PROGRESS: at sentence #30000, processed 655533 words, keeping 29419 word types



Hasil Test Parameter ke - 6
Parameter  => (4, 550, 40, 10)
Accuracy   => 0.832083208320832
Classification Report => 
               precision    recall  f1-score   support

    negative    0.84446   0.81350   0.82869      4992
    positive    0.82062   0.85061   0.83534      5007

    accuracy                        0.83208      9999
   macro avg    0.83254   0.83206   0.83202      9999
weighted avg    0.83252   0.83208   0.83202      9999


7.	 Parameter : (4, 600, 40, 10)


2020-12-22 15:42:24,671 : INFO : PROGRESS: at sentence #40000, processed 877306 words, keeping 33897 word types
2020-12-22 15:42:24,728 : INFO : PROGRESS: at sentence #50000, processed 1100000 words, keeping 37667 word types
2020-12-22 15:42:24,784 : INFO : PROGRESS: at sentence #60000, processed 1324580 words, keeping 40923 word types
2020-12-22 15:42:24,837 : INFO : PROGRESS: at sentence #70000, processed 1543432 words, keeping 43631 word types
2020-12-22 15:42:24,900 : INFO : PROGRESS: at sentence #80000, processed 1757050 words, keeping 46072 word types
2020-12-22 15:42:24,963 : INFO : PROGRESS: at sentence #90000, processed 1976526 words, keeping 48547 word types
2020-12-22 15:42:25,018 : INFO : PROGRESS: at sentence #100000, processed 2199610 words, keeping 50850 word types
2020-12-22 15:42:25,071 : INFO : PROGRESS: at sentence #110000, processed 2412028 words, keeping 52887 word types
2020-12-22 15:42:25,125 : INFO : PROGRESS: at sentence #120000, processed 2631483 words, keepin


Training model Word2Vec ...


2020-12-22 15:42:30,467 : INFO : EPOCH 1 - PROGRESS: at 3.52% examples, 227832 words/s, in_qsize 8, out_qsize 1
2020-12-22 15:42:31,485 : INFO : EPOCH 1 - PROGRESS: at 7.46% examples, 240816 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:42:32,538 : INFO : EPOCH 1 - PROGRESS: at 11.05% examples, 235478 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:42:33,559 : INFO : EPOCH 1 - PROGRESS: at 14.55% examples, 234551 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:42:34,570 : INFO : EPOCH 1 - PROGRESS: at 18.46% examples, 237338 words/s, in_qsize 6, out_qsize 1
2020-12-22 15:42:35,656 : INFO : EPOCH 1 - PROGRESS: at 22.49% examples, 238502 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:42:36,704 : INFO : EPOCH 1 - PROGRESS: at 26.57% examples, 240466 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:42:37,707 : INFO : EPOCH 1 - PROGRESS: at 30.38% examples, 241630 words/s, in_qsize 7, out_qsize 0
2020-12-22 15:42:38,708 : INFO : EPOCH 1 - PROGRESS: at 34.09% examples, 241857 words/s, in_qsize 

Review 0 of 39999




Review 1000 of 39999
Review 2000 of 39999
Review 3000 of 39999
Review 4000 of 39999
Review 5000 of 39999
Review 6000 of 39999
Review 7000 of 39999
Review 8000 of 39999
Review 9000 of 39999
Review 10000 of 39999
Review 11000 of 39999
Review 12000 of 39999
Review 13000 of 39999
Review 14000 of 39999
Review 15000 of 39999
Review 16000 of 39999
Review 17000 of 39999
Review 18000 of 39999
Review 19000 of 39999
Review 20000 of 39999
Review 21000 of 39999
Review 22000 of 39999
Review 23000 of 39999
Review 24000 of 39999
Review 25000 of 39999
Review 26000 of 39999
Review 27000 of 39999
Review 28000 of 39999
Review 29000 of 39999
Review 30000 of 39999
Review 31000 of 39999
Review 32000 of 39999
Review 33000 of 39999
Review 34000 of 39999
Review 35000 of 39999
Review 36000 of 39999
Review 37000 of 39999
Review 38000 of 39999
Review 39000 of 39999
Review 0 of 9999
Review 1000 of 9999
Review 2000 of 9999
Review 3000 of 9999
Review 4000 of 9999
Review 5000 of 9999
Review 6000 of 9999
Review 7000 of

In [None]:
# Summary of the parameter sweep: first the list of accuracy values collected
# for each tested configuration, then the list of parameter tuples in the same
# order (index i of one list corresponds to index i of the other).
print(best_score, best_parameter, sep="\n")

[0.8283828382838284, 0.8264826482648265, 0.8294829482948295, 0.831983198319832, 0.8310831083108311, 0.832083208320832, 0.8323832383238324]
[(4, 300, 40, 10), (4, 350, 40, 10), (4, 400, 40, 10), (4, 450, 40, 10), (4, 500, 40, 10), (4, 550, 40, 10), (4, 600, 40, 10)]
