In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Importing library
import numpy as np
import pandas as pd

# BeautifulSoup = hapus tag html
from bs4 import BeautifulSoup 
import re # regular expressions (regex)

# natural language tool kits
from nltk.corpus import stopwords
import nltk

In [4]:
# path dataset
data_path = r'/content/drive/MyDrive/Text Dataset/IMDB Dataset.csv'

# Read data from files
dataset = pd.read_csv(data_path, header=0, sep = ',')

# splitting data
train = dataset[0:39999]
test = dataset[40000:49999]

In [5]:
# proprocessing data teks
def review_wordlist(review, remove_stopwords=False):
    
    # hapus tag html
    review_text = BeautifulSoup(review).get_text()
    
    # hapus simbol
    review_text = re.sub("[^a-zA-Z]"," ",review_text)
    
    # konversi ke huruf kecil dan dipisah perkata
    words = review_text.lower().split()

    # menghapus stopwords
    if remove_stopwords:
        stops = set(stopwords.words("english"))     
        words = [w for w in words if not w in stops]
    
    return(words)

In [6]:
# download file punctuation and stpwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# word2vec expects a list of lists.
# Using punkt tokenizer for better splitting of a paragraph into sentences.

import nltk.data
#nltk.download('popular')

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [8]:
# This function splits a review into sentences
def review_sentences(review, tokenizer, remove_stopwords=False):
    # melakukan tokenize dengan nltk tokenizer
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []

    # mengisi array sentences dengan masing - masing review
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentences.append(review_wordlist(raw_sentence,\
                                            remove_stopwords))

    # list dari list
    return sentences

In [9]:
sentences = []
print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_sentences(review, tokenizer)

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [10]:
# Importing the built-in logging module
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [11]:
print("List of lists. Let's confirm: ", type(sentences), " of ", type(sentences[0]))
print(sentences[10])
print(len(sentences))

List of lists. Let's confirm:  <class 'list'>  of  <class 'list'>
['a', 'wonderful', 'little', 'production']
428144


In [34]:
# membuat model dan mengatur nilai parameter
num_features = 300  # Word vector dimensionality
min_word_count = 40 # Minimum word count
num_workers = 4     # Number of parallel threads
context = 10        # Context window size
downsampling = 0.001 # (0.001) Downsample setting for frequent words

In [31]:


# Inisialisasi model
from gensim.models import word2vec
# model = word2vec.Word2Vec(workers = num_workers, 
#                           size = num_features, 
#                           min_count = min_word_count, 
#                           window = context, 
#                           sg = 1, # sg = 1 (skipgram), default/0 adalah cbow
#                           sample = downsampling)
model = word2vec.Word2Vec(size = 300)
model.build_vocab(sentences)

2021-01-04 03:37:07,972 : INFO : collecting all words and their counts
2021-01-04 03:37:07,975 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-01-04 03:37:08,036 : INFO : PROGRESS: at sentence #10000, processed 224307 words, keeping 17319 word types
2021-01-04 03:37:08,092 : INFO : PROGRESS: at sentence #20000, processed 434605 words, keeping 24175 word types
2021-01-04 03:37:08,157 : INFO : PROGRESS: at sentence #30000, processed 655533 words, keeping 29419 word types
2021-01-04 03:37:08,220 : INFO : PROGRESS: at sentence #40000, processed 877306 words, keeping 33897 word types
2021-01-04 03:37:08,278 : INFO : PROGRESS: at sentence #50000, processed 1100000 words, keeping 37667 word types
2021-01-04 03:37:08,343 : INFO : PROGRESS: at sentence #60000, processed 1324580 words, keeping 40923 word types
2021-01-04 03:37:08,404 : INFO : PROGRESS: at sentence #70000, processed 1543432 words, keeping 43631 word types
2021-01-04 03:37:08,468 : INFO : PROGRESS: 

In [32]:
print("Training model....")
# model.train(sentences = sentences, total_examples = len(sentences), epochs = model.iter)
model.train(sentences = sentences, total_examples = len(sentences), epochs = 2)

2021-01-04 03:37:30,017 : INFO : training model with 3 workers on 35644 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5


Training model....


2021-01-04 03:37:31,048 : INFO : EPOCH 1 - PROGRESS: at 4.39% examples, 299940 words/s, in_qsize 5, out_qsize 0
2021-01-04 03:37:32,053 : INFO : EPOCH 1 - PROGRESS: at 8.83% examples, 304504 words/s, in_qsize 5, out_qsize 0
2021-01-04 03:37:33,082 : INFO : EPOCH 1 - PROGRESS: at 13.33% examples, 305796 words/s, in_qsize 5, out_qsize 0
2021-01-04 03:37:34,094 : INFO : EPOCH 1 - PROGRESS: at 17.80% examples, 305912 words/s, in_qsize 5, out_qsize 0
2021-01-04 03:37:35,122 : INFO : EPOCH 1 - PROGRESS: at 22.38% examples, 306529 words/s, in_qsize 5, out_qsize 0
2021-01-04 03:37:36,141 : INFO : EPOCH 1 - PROGRESS: at 26.99% examples, 307230 words/s, in_qsize 5, out_qsize 0
2021-01-04 03:37:37,153 : INFO : EPOCH 1 - PROGRESS: at 31.64% examples, 309150 words/s, in_qsize 5, out_qsize 0
2021-01-04 03:37:38,158 : INFO : EPOCH 1 - PROGRESS: at 36.37% examples, 310923 words/s, in_qsize 5, out_qsize 0
2021-01-04 03:37:39,174 : INFO : EPOCH 1 - PROGRESS: at 41.03% examples, 311153 words/s, in_qsize 

(13840698, 18707968)

In [14]:
# menyimpan model untuk kebutuhan selanjutnya
# model_name = "300features_40minwords_10context"
# model.save(model_name)

In [15]:
# test : mencetak kata yang tidak ada hubunganya 
model.wv.doesnt_match("man woman dog child kitchen".split())

2021-01-04 03:29:14,718 : INFO : precomputing L2-norms of word weight vectors
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'kitchen'

In [16]:
model.wv.doesnt_match("france england germany berlin".split())

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'berlin'

In [17]:
# mencetak kata yang paling mirip di dalam model
model.wv.most_similar("man")

[('woman', 0.7549165487289429),
 ('boy', 0.708862841129303),
 ('guy', 0.7053457498550415),
 ('mans', 0.6648308634757996),
 ('monk', 0.650288999080658),
 ('lady', 0.6396517753601074),
 ('thug', 0.6378631591796875),
 ('girl', 0.6358193159103394),
 ('servant', 0.6319907903671265),
 ('pervert', 0.6261723041534424)]

In [18]:
model.wv.most_similar("awful")

[('terrible', 0.87895667552948),
 ('dreadful', 0.8556897044181824),
 ('horrible', 0.8545076251029968),
 ('atrocious', 0.8503308296203613),
 ('appalling', 0.8303557634353638),
 ('horrendous', 0.8302174806594849),
 ('abysmal', 0.7890251874923706),
 ('horrid', 0.7852514982223511),
 ('laughable', 0.7789475321769714),
 ('bad', 0.7666476368904114)]

In [19]:
# jumlah kata dalam vocab dari dataset 
model.wv.syn0.shape

  


(11183, 100)

In [20]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [21]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [35]:
# Calculating average feature vector for training set
clean_train_reviews = []
for review in train['review']:
    clean_train_reviews.append(review_wordlist(review, remove_stopwords=True))
    
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

Review 0 of 39999


  del sys.path[0]


Review 1000 of 39999
Review 2000 of 39999
Review 3000 of 39999
Review 4000 of 39999
Review 5000 of 39999
Review 6000 of 39999
Review 7000 of 39999
Review 8000 of 39999
Review 9000 of 39999
Review 10000 of 39999
Review 11000 of 39999
Review 12000 of 39999
Review 13000 of 39999
Review 14000 of 39999
Review 15000 of 39999
Review 16000 of 39999
Review 17000 of 39999
Review 18000 of 39999
Review 19000 of 39999
Review 20000 of 39999
Review 21000 of 39999
Review 22000 of 39999
Review 23000 of 39999
Review 24000 of 39999
Review 25000 of 39999
Review 26000 of 39999
Review 27000 of 39999
Review 28000 of 39999
Review 29000 of 39999
Review 30000 of 39999
Review 31000 of 39999
Review 32000 of 39999
Review 33000 of 39999
Review 34000 of 39999
Review 35000 of 39999
Review 36000 of 39999
Review 37000 of 39999
Review 38000 of 39999
Review 39000 of 39999


In [36]:
# Calculating average feature vactors for test set     
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(review_wordlist(review,remove_stopwords=True))
    
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Review 0 of 9999


  del sys.path[0]


Review 1000 of 9999
Review 2000 of 9999
Review 3000 of 9999
Review 4000 of 9999
Review 5000 of 9999
Review 6000 of 9999
Review 7000 of 9999
Review 8000 of 9999
Review 9000 of 9999


In [24]:
print(trainDataVecs.shape)
print(testDataVecs.shape)

(39999, 100)
(9999, 100)


In [25]:
print(trainDataVecs[10])

[ 0.00823498  0.03644501  0.03218948  0.22181776  0.02433273  0.25639123
  0.12238596  0.04065422  0.01177646 -0.01663784 -0.1770446   0.16924544
  0.12316813  0.0390509   0.07543782  0.15042903  0.34172463 -0.08437014
  0.06878469 -0.07333653 -0.12533984 -0.2788143   0.09931821  0.08015273
  0.07894555 -0.02600707  0.00558659  0.00903116 -0.00805558  0.13014889
  0.21544307 -0.05873488  0.00982271  0.12977971  0.0286414   0.02313107
 -0.14247485  0.31120658  0.04083855  0.03913445 -0.14548099 -0.15282984
 -0.22545317  0.14096549 -0.05313232  0.0425006  -0.06810469  0.0937503
  0.03497593 -0.0038024   0.1857445  -0.05436939 -0.06605195  0.0439129
  0.07789018 -0.13019702 -0.08460937 -0.05584453 -0.04149645 -0.12297469
 -0.08209901 -0.2558594  -0.26634115  0.2636817  -0.00911715 -0.00616273
  0.1237504   0.02108551  0.12432893 -0.35028625  0.13643944 -0.01332242
  0.09356418 -0.05550663 -0.11320368 -0.09755907 -0.15731546  0.06794147
 -0.08227494  0.18442886 -0.04363215 -0.02113006 -0.0

In [37]:
# Fitting a random forest classifier to the training data
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 100)
    
print("Fitting random forest to training data....")    
forest = forest.fit(trainDataVecs, train["sentiment"])

# Prediksi nilai sentiment untuk test data 
predicted = forest.predict(testDataVecs)

# akurasi
accuracy = accuracy_score(test["sentiment"], predicted)
report = classification_report(test["sentiment"], predicted, digits = 5)

Fitting random forest to training data....


In [38]:
print(report)

              precision    recall  f1-score   support

    negative    0.80953   0.78586   0.79752      4992
    positive    0.79255   0.81566   0.80394      5007

    accuracy                        0.80078      9999
   macro avg    0.80104   0.80076   0.80073      9999
weighted avg    0.80103   0.80078   0.80073      9999



In [27]:
# Predicting the sentiment values for test data and saving the results in a csv file 
# result = forest.predict(testDataVecs)
# output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
# output.to_csv( "output.csv", index=False, quoting=3 )