<a href="https://colab.research.google.com/github/aloy4646/CNN-for-Sentiment-Analysis/blob/main/1120053_Aloysius_Tugas_CNN_for_SA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import nltk
# Download the NLTK stop-word corpus; clean_doc reads the English list from it.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [71]:
from nltk.corpus import stopwords
import string
import re
from os import listdir
from collections import Counter
from keras.preprocessing.text import Tokenizer
from numpy import array
from keras.utils import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

## Loading and Cleaning Reviews

In [14]:
def load_doc(filename):
  """Read a text file and return its entire content as one string.

  Args:
    filename: Path of the file to open (read-only, text mode).

  Returns:
    The full content of the file as a str.
  """
  # The context manager closes the handle even when read() raises,
  # which the manual open/close sequence did not guarantee.
  with open(filename, 'r') as file:
    return file.read()

def clean_doc(doc):
  """Turn raw review text into a list of cleaned word tokens.

  The text is split on whitespace. Each piece has its punctuation removed and
  is kept only when it is purely alphabetic, is not an English stop word and
  is longer than one character.

  Args:
    doc: Raw document text.

  Returns:
    List of surviving tokens, in their original order.
  """
  # Character class matching every ASCII punctuation character.
  strip_punct = re.compile('[%s]' % re.escape(string.punctuation))
  english_stops = set(stopwords.words('english'))
  kept = []
  for raw in doc.split():
    word = strip_punct.sub('', raw)
    # Drop numbers and mixed tokens (also covers the empty string).
    if not word.isalpha():
      continue
    # Drop stop words.
    if word in english_stops:
      continue
    # Drop single-character tokens.
    if len(word) <= 1:
      continue
    kept.append(word)
  return kept

## Define Vocabulary

In [8]:
def add_doc_to_vocab(filename, vocab):
  """Count the cleaned tokens of one file into ``vocab``.

  The file is read with load_doc, tokenised with clean_doc, and the resulting
  tokens are passed to ``vocab.update``. Nothing is returned; ``vocab`` is
  modified in place.
  """
  vocab.update(clean_doc(load_doc(filename)))

def process_docs(directory, vocab):
  """Add every training file found in ``directory`` to ``vocab``.

  Files whose name starts with 'cv9' are skipped, so only cv000 to cv899 are
  used for training. Every other file is passed to add_doc_to_vocab.
  """
  training_names = (name for name in listdir(directory) if not name.startswith('cv9'))
  for name in training_names:
    add_doc_to_vocab(directory + '/' + name, vocab)

In [9]:
# Start from an empty counter and accumulate token counts from the
# positive and negative training reviews.
vocab = Counter()
process_docs('drive/MyDrive/review_polarity/txt_sentoken/pos', vocab)
process_docs('drive/MyDrive/review_polarity/txt_sentoken/neg', vocab)
# Number of distinct tokens collected.
print(len(vocab))

44276


In [10]:
# Show the 50 most frequent tokens together with their counts.
print(vocab.most_common(50))

[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844), ('much', 1824), ('also', 1757), ('characters', 1735), ('get', 1724), ('character', 1703), ('two', 1643), ('first', 1588), ('see', 1557), ('way', 1515), ('well', 1511), ('make', 1418), ('really', 1407), ('little', 1351), ('life', 1334), ('plot', 1288), ('people', 1269), ('could', 1248), ('bad', 1248), ('scene', 1241), ('movies', 1238), ('never', 1201), ('best', 1179), ('new', 1140), ('scenes', 1135), ('man', 1131), ('many', 1130), ('doesnt', 1118), ('know', 1092), ('dont', 1086), ('hes', 1024), ('great', 1014), ('another', 992), ('action', 985), ('love', 977), ('us', 967), ('go', 952), ('director', 948), ('end', 946), ('something', 945), ('still', 936)]


In [15]:
# Keep only tokens that occur at least min_occurrence times (currently 2).
min_occurrence = 2
tokens = [k for k,c in vocab.items() if c >= min_occurrence]
# Size of the filtered vocabulary.
print(len(tokens))

25767


In [12]:
def save_list(lines, filename):
  """Write ``lines`` to ``filename``, one item per line.

  Args:
    lines: Iterable of strings; they are joined with a newline and no
      trailing newline is added.
    filename: Path of the file to create or overwrite.
  """
  # Join before opening so a bad item does not leave a truncated file behind.
  data = '\n'.join(lines)
  # The context manager closes the handle even when write() raises.
  with open(filename, 'w') as file:
    file.write(data)

In [16]:
# Persist the filtered vocabulary; the training section reloads this file.
save_list(tokens, 'drive/MyDrive/review_polarity/vocab.txt')

# Train CNN with Embedding Layer

In [60]:
def clean_doc_train(doc, vocab):
  """Clean one document and return it as a single space-separated string.

  The text is split on whitespace, ASCII punctuation is removed from every
  piece, and only pieces contained in ``vocab`` are kept.

  Args:
    doc: Raw document text.
    vocab: Container of allowed words (membership is tested with ``in``).

  Returns:
    The kept words joined by single spaces.
  """
  # Translation table that deletes every character in string.punctuation.
  drop_punct = str.maketrans('', '', string.punctuation)
  stripped = (piece.translate(drop_punct) for piece in doc.split())
  return ' '.join(word for word in stripped if word in vocab)

def process_docs_train(directory, vocab, is_train):
  """Load and clean the reviews of one directory for training or testing.

  Args:
    directory: Folder containing the review files.
    vocab: Allowed words, forwarded to clean_doc_train.
    is_train: When truthy, files starting with 'cv9' are skipped (training
      split); otherwise only files starting with 'cv9' are used (test split).

  Returns:
    List with one cleaned, space-joined string per selected file.
  """
  want_training = bool(is_train)
  documents = []
  for name in listdir(directory):
    held_out = name.startswith('cv9')
    # Training uses the non-cv9 files, testing uses only the cv9 files.
    if held_out == want_training:
      continue
    documents.append(clean_doc_train(load_doc(directory + '/' + name), vocab))
  return documents

def load_clean_dataset(vocab, is_train):
  """Load the cleaned negative and positive reviews with their labels.

  Args:
    vocab: Allowed words, forwarded to process_docs_train.
    is_train: Selects the training split when truthy, the test split otherwise.

  Returns:
    Tuple ``(docs, labels)``: ``docs`` lists the negative documents followed
    by the positive ones; ``labels`` is an array holding 0 for each negative
    and 1 for each positive document, in the same order.
  """
  neg = process_docs_train('drive/MyDrive/review_polarity/txt_sentoken/neg', vocab, is_train)
  pos = process_docs_train('drive/MyDrive/review_polarity/txt_sentoken/pos', vocab, is_train)
  # Labels follow the concatenation order: negatives first, then positives.
  labels = array([0] * len(neg) + [1] * len(pos))
  return neg + pos, labels

### Encode kata ke integer

In [24]:
def create_tokenizer(lines):
  """Create a Tokenizer and fit it on ``lines``.

  Args:
    lines: Texts handed to ``Tokenizer.fit_on_texts``.

  Returns:
    The fitted Tokenizer instance.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(lines)
  return fitted

# Encode words as integers.
def encode_docs(tokenizer, max_length, docs):
  """Convert documents to integer sequences padded to ``max_length``.

  Args:
    tokenizer: Object providing ``texts_to_sequences``.
    max_length: Target length passed to pad_sequences as ``maxlen``.
    docs: Documents to encode.

  Returns:
    The result of pad_sequences with ``padding='post'``.
  """
  return pad_sequences(
    tokenizer.texts_to_sequences(docs),
    maxlen=max_length,
    padding='post',
  )

### Define model

In [50]:
def define_model(vocab_size, max_length):
  """Build and compile the CNN sentiment classifier.

  The stack is: Embedding (100 dimensions) -> Conv1D (32 filters, kernel
  size 8, ReLU) -> MaxPooling1D (pool size 2) -> Flatten -> Dense(10, ReLU)
  -> Dense(1, sigmoid). It is compiled with binary cross-entropy, the Adam
  optimizer and the accuracy metric.

  Side effects: prints the model summary and writes a diagram of the model to
  'drive/MyDrive/review_polarity/model.png'.

  Args:
    vocab_size: First argument of the Embedding layer.
    max_length: ``input_length`` of the Embedding layer.

  Returns:
    The compiled Sequential model.
  """
  layers = [
    Embedding(vocab_size, 100, input_length=max_length),
    Conv1D(filters=32, kernel_size=8, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
  ]
  model = Sequential()
  for layer in layers:
    model.add(layer)
  # compile
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
  plot_model(model, to_file='drive/MyDrive/review_polarity/model.png', show_shapes=True)
  return model

### Load training data

In [61]:
# Reload the saved vocabulary file and turn it into a set of words.
vocab = load_doc('drive/MyDrive/review_polarity/vocab.txt')
vocab = set(vocab.split())
# Load the cleaned training reviews and their labels (is_train=True).
train_docs, ytrain = load_clean_dataset(vocab, True)

### Training data

In [62]:
# Fit the tokenizer on the training documents.
tokenizer = create_tokenizer(train_docs)
# One more than the number of indexed words; presumably index 0 is reserved
# for padding — TODO confirm against the Keras Tokenizer documentation.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)

Vocabulary size: 25768


In [63]:
# Length, in whitespace-separated tokens, of the longest training document.
max_length = max([len(s.split()) for s in train_docs])
print('Maximum length: %d' % max_length)

Maximum length: 1317


In [64]:
# Encode the training documents as padded integer sequences of max_length.
Xtrain = encode_docs(tokenizer, max_length, train_docs)
# Build the model; this also prints its summary and writes the model plot.
model = define_model(vocab_size, max_length)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1317, 100)         2576800   
                                                                 
 conv1d_2 (Conv1D)           (None, 1310, 32)          25632     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 655, 32)          0         
 1D)                                                             
                                                                 
 flatten_2 (Flatten)         (None, 20960)             0         
                                                                 
 dense_4 (Dense)             (None, 10)                209610    
                                                                 
 dense_5 (Dense)             (None, 1)                 11        
                                                      

In [66]:
# Train for 10 epochs on the encoded training data.
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

Epoch 1/10
57/57 - 16s - loss: 0.6828 - accuracy: 0.5783 - 16s/epoch - 288ms/step
Epoch 2/10
57/57 - 17s - loss: 0.4799 - accuracy: 0.7844 - 17s/epoch - 307ms/step
Epoch 3/10
57/57 - 18s - loss: 0.0815 - accuracy: 0.9789 - 18s/epoch - 309ms/step
Epoch 4/10
57/57 - 17s - loss: 0.0072 - accuracy: 1.0000 - 17s/epoch - 301ms/step
Epoch 5/10
57/57 - 16s - loss: 0.0020 - accuracy: 1.0000 - 16s/epoch - 288ms/step
Epoch 6/10
57/57 - 16s - loss: 0.0011 - accuracy: 1.0000 - 16s/epoch - 288ms/step
Epoch 7/10
57/57 - 16s - loss: 6.8291e-04 - accuracy: 1.0000 - 16s/epoch - 285ms/step
Epoch 8/10
57/57 - 16s - loss: 4.8170e-04 - accuracy: 1.0000 - 16s/epoch - 288ms/step
Epoch 9/10
57/57 - 17s - loss: 3.5670e-04 - accuracy: 1.0000 - 17s/epoch - 306ms/step
Epoch 10/10
57/57 - 18s - loss: 2.7855e-04 - accuracy: 1.0000 - 18s/epoch - 309ms/step


<keras.callbacks.History at 0x7f3f2b0594f0>

In [67]:
# Save the trained model; the evaluation section reloads it from this path.
model.save('drive/MyDrive/review_polarity/model.h5')

## Evaluate the model

#### load dan encode test data

In [69]:
# Load the test split (is_train=False selects the files starting with 'cv9').
test_docs, ytest = load_clean_dataset(vocab, False)
# Encode with the tokenizer and max_length derived from the training data.
Xtest = encode_docs(tokenizer, max_length, test_docs)

In [72]:
# Reload the saved model and report accuracy on the training and test sets.
model = load_model('drive/MyDrive/review_polarity/model.h5')
# The first value returned by evaluate is discarded (presumably the loss —
# TODO confirm); the second is printed as a percentage.
_, acc = model.evaluate(Xtrain, ytrain, verbose=0)
print('Train Accuracy: %f' % (acc*100))
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Train Accuracy: 100.000000
Test Accuracy: 87.000000


## Klasifikasi review sebagai negatif atau positif

In [80]:
# def predict_sentiment(review, vocab, tokenizer, max_length, model):
#   line = clean_doc_train(review, vocab)
#   padded = encode_docs(tokenizer, max_length, [line])
#   yhat = model.predict(padded, verbose=0)
#   print('Percent pos: %f' % percent_pos)
#   percent_pos = yhat[0,0]
#   if round(percent_pos) == 0:
#     return (1-percent_pos), 'NEGATIVE'
#   return percent_pos, 'POSITIVE'

def predict_sentiment(review, vocab, tokenizer, max_length, model):
  """Classify one review as 'NEGATIVE' or 'POSITIVE'.

  Args:
    review: Raw review text.
    vocab: Allowed words, forwarded to clean_doc_train.
    tokenizer: Tokenizer forwarded to encode_docs.
    max_length: Padded sequence length forwarded to encode_docs.
    model: Object whose ``predict`` result is indexed at ``[0,0]``.

  Returns:
    Tuple ``(score, label)``. For 'POSITIVE' the score is the predicted
    value itself; for 'NEGATIVE' it is one minus the predicted value.
  """
  cleaned = clean_doc_train(review, vocab)
  # A one-element batch, since encode_docs works on a list of documents.
  batch = encode_docs(tokenizer, max_length, [cleaned])
  percent_pos = model.predict(batch, verbose=0)[0,0]
  if round(percent_pos) != 0:
    return percent_pos, 'POSITIVE'
  return (1-percent_pos), 'NEGATIVE'

### Test pada positif review

In [86]:
# Short hand-written positive review used as a smoke test.
text = 'Everyone will enjoy this film. I love it, recommended!'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
# Print the review, the predicted label and the score as a percentage.
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Review: [Everyone will enjoy this film. I love it, recommended!]
Sentiment: NEGATIVE (95.118%)


In [87]:
text = """glory--starring matthew broderick , denzel washington , and morgan freeman--is the true story of the 54th regiment of massachusetts , the first black fighting unit recruited by the north during the civil war . 
broderick plays robert gould shaw , the young white officer who led the black soldiers into battle . 
shaw , the son of well-to-do abolitionists , hailed from boston high society . 
the letters he wrote home to his parents during the war are on display at harvard , and were , evidently , the inspiration for glory . 
as the film begins in 1862 , shaw is a captain in the northern forces . 
like private eriksson ( michael j . fox ) in casualties of war , shaw initially is naive and idealistic about the war--that is , until his company is attacked by enemy forces . 
shaw experiences first hand the horror and chaos of battle , witnessing mass slaughter and receiving a minor wound himself when a bullet grazes his neck . 
soon after his recovery , shaw is promoted to colonel and assigned to enlist and train blacks in the war effort . 
glory is the story not only of colonel shaw , but also of the black soldiers who laid down their lives to free their brothers from slavery . 
the film periodically jumps between shaw's point of view and the perspective of the black soldiers . 
the movie introduces us to a handful of black recruits , and we follow them from their enlistment through basic training and finally into action . 
the large black cast is uniformly outstanding , especially washington who is electrifying as a runaway slave with a big mouth . 
he is brash and pushy , always getting into trouble and always looking for a fight . 
his bitter , tough guy facade is really just a mask for his loneliness and vulnerability . 
washington provides much of the film's intensity and emotional power . 
in one heartbreaking scene , he is whipped for allegedly deserting the army . 
when he removes his shirt to receive the punishment , you cringe at the sight of his back , which is riddled with ugly scars from his days as a slave . 
it makes your blood boil . 
the humiliation of the beating is far more traumatic than the actual physical pain it brings ; a tear rolls down washington's cheek--and will probably run down your cheek as well . 
the episode becomes even more tragic when we learn that washington wasn't deserting the army at all ; he left camp to look for shoes because his feet were covered with oozing sores . 
freeman is , as usual , a strong presence , even in a small supporting role . 
he plays a grave digger who has buried more white soldiers than he cares to remember . 
he quickly becomes a leader among the black soldiers , holding the group together and serving as a liaison to the white officers . 
colonel shaw recognizes freeman's leadership ability and promotes him to sergeant major , making him the first black officer in the army . 
andre braugher makes an impressive film debut in the role of thomas searles , a free black who is one of shaw's close childhood friends . 
searles is educated and refined , like a white man , prompting washington to nickname him " snow flake . " 
the burning question is whether searles is tough enough to survive basic training and to kill in combat . 
the road from marching drills to battle action is a bumpy one for the black regiment . 
the soldiers suffer innumerable hardships , but somehow they never lose their morale . 
the army treats the black soldiers like second class citizens , subjecting them to racism and discrimination . 
they are paid only $10 a month , whereas their white counterparts earn thirteen , and , for a long time , they have to go without shoes , guns , or uniforms . 
to make matters worse , the white military hierarchy is extremely reluctant to allow the blacks into action , preferring instead to use them for manual labor . 
eventually , however , the regiment receives its boots , uniforms , rifles , and right to fight , thanks to the stubborn resolve of colonel shaw . 
shaw has absolute faith in his soldiers , and he fights tooth and nail to get them what they deserve , even if it means threatening a general with blackmail . 
broderick , in fact , is most convincing in the scenes where shaw stands up for the regiment . 
unfortunately , however , broderick's uneven performance is , in many respects , the weak link in the movie . 
in an effort to look more mature , broderick sports a mustache and a goatee , and throughout the film he slips in and out of a phony boston accent . 
he is never altogether convincing as shaw since much of the time his emotions seem forced . 
the film places too much weight on broderick's character and not enough on the black soldiers , who are more intriguing . 
glory regains lost ground with its harrowing depiction of war . 
the movie shows the devastation of war without resorting to the unnecessarily graphic gore which marred born on the fourth of july . 
glory does not try to rattle you with nauseating blood and guts . 
except for a few bullet wounds and one exploding head , the film , for the most part , leaves the gore to your imagination , which is not to say that the battle scenes in glory are timid . 
to the contrary , they are chaotic and horrifying ; it's just that director edward zwick ( the co-creator of " thirtysomething " ) films them with far more subtlety and restraint than oliver stone could ever muster . 
the key to glory is the group dynamic among the black soldiers . 
the movie depicts some of ( but not enough of ) their customs and rituals . 
in one scene , for example , the soldiers motivate themselves by singing prayers around the campfire . 
each man has a chance to relay a few words of inspiration . 
a couple of the movie's most touching moments involve young black children looking up to the black soldiers with awe , disbelief and pride . 
the regiment's greatest triumph comes when the soldiers distinguish themselves in battle , thereby earning the respect of their white peers and earning the honor of leading the climactic assault on fort wagner . 
like any war film , glory has its share of gloom and despair , but ultimately it proves to be a truly uplifting experience and an important history lesson , a valuable reminder that despite what the history books say ( or , more precisely , what they do not say ) , blacks played a critically important role in the north's victory over the south--forever changing the evolution of america . """
# Classify the long review held in `text` and print the label with its score.
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Review: [glory--starring matthew broderick , denzel washington , and morgan freeman--is the true story of the 54th regiment of massachusetts , the first black fighting unit recruited by the north during the civil war . 
broderick plays robert gould shaw , the young white officer who led the black soldiers into battle . 
shaw , the son of well-to-do abolitionists , hailed from boston high society . 
the letters he wrote home to his parents during the war are on display at harvard , and were , evidently , the inspiration for glory . 
as the film begins in 1862 , shaw is a captain in the northern forces . 
like private eriksson ( michael j . fox ) in casualties of war , shaw initially is naive and idealistic about the war--that is , until his company is attacked by enemy forces . 
shaw experiences first hand the horror and chaos of battle , witnessing mass slaughter and receiving a minor wound himself when a bullet grazes his neck . 
soon after his recovery , shaw is promoted to colonel a

### Test pada negatif review

In [77]:
# Short hand-written negative review used as a smoke test.
text = 'This is a bad movie. Do not watch it. It sucks.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
# Print the review, the predicted label and the score as a percentage.
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Percent pos: 0.040507
Review: [This is a bad movie. Do not watch it. It sucks.]
Sentiment: NEGATIVE (95.949%)


In [88]:
text = """john boorman's " zardoz " is a goofy cinematic debacle so fundamentally misconceived and laughably executed that it takes on a bizarre enjoyment quality all its own . 
not since the rampant bumblings of one edward d . wood jr . has a movie been so silly and so serious at the same time . 
of course , wood's career can be explained by two things : he had no money and he had no talent . 
boorman , on the other hand , cannot court such excuses to explain " zardoz " ( or his follow-up film , the equally awful " exorcist ii : the heretic " ) . 
boorman obviously had a sizable budget , a matinee idol movie star ( sean connery ) in the lead role , and although you wouldn't know it from this film , boorman does indeed have talent . 
this is the man who made the slick modern masterpiece " deliverance " ( 1972 ) , as well as the autobiographical world war ii drama " hope and glory " ( 1987 ) , the slightly over-conceived arthurian epic " excalibur " ( 1981 ) and the father-son jungle adventure " the emerald forest " ( 1985 ) . 
his films all show that boorman is never lacking in imagination , but sometimes that comes at the cost of coherence and taste . 
if boorman is anything , he's ambitious , and when he succeeds , it's in grand fashion . 
unfortunately , the bigger they are , the harder they fall , and when boorman falls , the resounding impact can be heard for miles around . 
 " zardoz " is meant to takes its place among the grandest of mystical movies , an obsession of boorman's . 
his screenplay tries to elicit the same mythological connotations of the arthurian legends or even " the wizard of oz , " a book which figures into the movie's plot . 
but , despite all this reaching , the resulting movie is more unintentionally funny than intentionally enigmatical or compelling . 
the events take place in the distant year 2293 , but there is little of the typical futuristic movie-ness to be found . 
in fact , things seems to have moved backwards , with people riding horses , shooting old-style guns , and living in large victorian mansions . 
it's more middle ages than space age . 
the world of " zardoz " is divided into two distinct hemispheres : the outlands , where all the poor , pathetic people live , and the vortex , where a select group of wealthy intellectuals live in comfort and everlasting life . 
these immortals never grow old , they never engage in sexual activity , they possess psychological powers , and they live in a sort of quasi-utopian marxist society where everyone is equal , and everyone contributes equally to the society . 
however , if one breaks the rules , that person is punished by being aged so many years . 
if someone breaks the rules enough , he or she is aged to the point of senility , and imprisoned to an eternal existence in a geriatric home with others aged criminals . 
one of the immortals , arthur frayn ( niall buggy ) , a squirmy man with a mustache and goatee tattooed on his face , is charged with keeping order in the outlands and forcing the residents to farm so the immortals can be fed . 
like " the wizard of oz , " he adopts a god-like status among the people by flying in to their part of the world in a giant stone carved like a menacing head . 
 ( this flying head is one of the movie's opening images , and it's a dead giveaway of the lunacy to come . ) 
calling himself zardoz , frayn gathers a bunch of outlanders and makes them into a group called the exterminators , whose purpose is to kill most of the other outlanders so they can't procreate and take up more resources . 
from inside his giant , stone head , zardoz bellows seriously laugh-inducting statements like , " the gun is good . 
the penis is evil . " 
that line alone is worth the movie's cult following . 
one day , an exterminator named zed ( sean connery ) , sneaks into zardoz's flying stone , pushes frayn out , and goes back to the vortex . 
once there , the immortals label him a " brutal " and study him like a lab rat , taking great , perverse care in exploring his sexuality , which is a mystery to them . 
they seem especially interested in his ability to gain an erection , and there is one downright hilarious sequence where a bunch of scantily-clad female scientists show zed erotic footage on a video screen in an attempt to determine what gets him worked up . 
i say " hilarious " because that is exactly what " zardoz " is . 
it is obvious that boorman did not intend it to be so ; he made this film with the straightest of faces , although i have a hard time believing that as production moved forward , he didn't get even the slightest inkling of how patently ridiculous it was becoming . 
just looking at connery is enough to give one the giggles - he spends most of the film running around in a red loin cloth that resembles a diaper , a mane of hair braided halfway down his back , a wyatt earp-style handlebar mustache , and a pair of thigh-high patent leather boots that would look more appropriate on a cheap hollywood hooker . 
boorman made the film right after the critical and financial success of " deliverance , " which is the only reason i can imagine a studio would green-light this effort . 
he attracted some rich talent on both sides of the camera , including cinematographer geoffrey unsworth ( " 2001 " ) , whose striking visuals are about the only good thing in " zardoz " besides the inadvertent humor . 
sean connery had made his last james bond film in 1971 , and perhaps he was looking for a change in pace . 
he got exactly that in " zardoz , " and it's a wonder it didn't end his career . 
i'm sure boorman intended for this movie to make some grand statements . 
is it a treatise about the infallibility of eternal life ? 
is it a condemnation of those who consider growing old to be a bad thing ? 
or is it a social statement , something about the inherent negativity of class distinctions and the violence it creates ? 
karl marx might like it if he were more like timothy leary . 
come to think of it , maybe boorman made it as an extended lsd trip . 
people high on illicit substances are the only ones i can imagine enjoying this asinine silliness as anything more than a completely unintentional comedy . 
"""
# Classify the long review held in `text` and print the label with its score.
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Review: [john boorman's " zardoz " is a goofy cinematic debacle so fundamentally misconceived and laughably executed that it takes on a bizarre enjoyment quality all its own . 
not since the rampant bumblings of one edward d . wood jr . has a movie been so silly and so serious at the same time . 
of course , wood's career can be explained by two things : he had no money and he had no talent . 
boorman , on the other hand , cannot court such excuses to explain " zardoz " ( or his follow-up film , the equally awful " exorcist ii : the heretic " ) . 
boorman obviously had a sizable budget , a matinee idol movie star ( sean connery ) in the lead role , and although you wouldn't know it from this film , boorman does indeed have talent . 
this is the man who made the slick modern masterpiece " deliverance " ( 1972 ) , as well as the autobiographical world war ii drama " hope and glory " ( 1987 ) , the slightly over-conceived arthurian epic " excalibur " ( 1981 ) and the father-son jungle adv