In [18]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import stopwords
import json

In [19]:
import re

def remove_tags(string):
    removelist = ""  # Add any characters you'd like to keep
    # Remove HTML tags
    result = re.sub(r'<[^>]+>', '', string)
    # Remove URLs
    result = re.sub(r'https?://\S+', '', result)
    # Remove non-alphanumeric characters (except for those in the removelist)
    result = re.sub(r'[^a-zA-Z0-9' + removelist + r'\s]', ' ', result)
    # Convert to lowercase
    result = result.lower()
    return result

In [20]:
df = pd.read_csv("IMDB Dataset.csv")
df['review'] = df['review'].apply(remove_tags)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there s a family where a little boy ...,0
4,petter mattei s love in the time of money is...,1


In [21]:
stop_word = set(stopwords.words('english'))
 
df['review'] = df['review'].apply(lambda x:' '.join([word for word in x.split() if word not in (stop_word)]))
 
# Melakukan split dataset
review = df['review'].values
sentiment = df['sentiment'].values
 
review_train, review_test, sentiment_train, sentiment_test = train_test_split(review, sentiment, test_size=0.1, shuffle=False)
 
# Membuat tokenisasi
filt = '!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ' # Filter untuk menghilangkan symbols

vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>", filters=filt)
 
tokenizer.fit_on_texts(review_train)

In [22]:
import numpy as np

# Hitung panjang setiap review di data latih
review_lengths = [len(x) for x in df['review'].apply(lambda x: x.split())]

# Cek persentil 90 atau 95
p90 = int(np.percentile(review_lengths, 90))
p95 = int(np.percentile(review_lengths, 95))

print(f"Mean: {np.mean(review_lengths)}")
print(f"90% review length: {p90}")
print(f"95% review length: {p95}")

Mean: 119.5824
90% review length: 236
95% review length: 309


In [23]:

# Menyimpan word_index kedalam sebuah file json
filtered_word_index = {word: index for word, index in tokenizer.word_index.items() if index < vocab_size}
 
with open('word_index.json', 'w') as fp:
    json.dump(filtered_word_index, fp)
 
# Membuat sequences dan melakukan padding
train_sekuens = tokenizer.texts_to_sequences(review_train)
test_sekuens = tokenizer.texts_to_sequences(review_test)
 
train_padded = pad_sequences(train_sekuens,
                             maxlen=p90,
                             padding='pre', truncating='pre')
test_padded = pad_sequences(test_sekuens,
                            maxlen=p90,
                            padding='pre', truncating='pre')

In [24]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

In [25]:
tf.keras.backend.clear_session()
# Membuat model
model = tf.keras.Sequential([
    Embedding(vocab_size, 64, input_length=p90),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    
    Dense(1, activation='sigmoid')

])
 
# Compile model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
 
# Train model
num_epochs = 30
history = model.fit(train_padded, sentiment_train,
                    epochs=num_epochs,
                    validation_data=(test_padded, sentiment_test),
                    verbose=1,
                    callbacks=[early_stop])


Epoch 1/30




[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - accuracy: 0.7873 - loss: 0.4294 - val_accuracy: 0.8852 - val_loss: 0.2819
Epoch 2/30
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - accuracy: 0.8938 - loss: 0.2638 - val_accuracy: 0.8842 - val_loss: 0.2751
Epoch 3/30
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - accuracy: 0.9083 - loss: 0.2350 - val_accuracy: 0.8634 - val_loss: 0.3218
Epoch 4/30
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - accuracy: 0.9134 - loss: 0.2203 - val_accuracy: 0.8958 - val_loss: 0.2593
Epoch 5/30
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - accuracy: 0.9205 - loss: 0.2045 - val_accuracy: 0.8482 - val_loss: 0.3977
Epoch 6/30
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - accuracy: 0.9247 - loss: 0.1945 - val_accuracy: 0.8676 - val_loss: 0.3407
Epoch 7/30
[1m14

In [26]:
model.evaluate(test_padded, sentiment_test)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8958 - loss: 0.2593


[0.2593337893486023, 0.895799994468689]

In [27]:
tf.saved_model.save(model, 'tf_model')

INFO:tensorflow:Assets written to: tf_model\assets


INFO:tensorflow:Assets written to: tf_model\assets


In [28]:
def predict_sentiment(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=p90, padding='post', truncating='post')
    prediction = model.predict(padded)
    return f"positive (confidence :{prediction[0][0]:.4f})" if prediction[0][0] >= 0.5 else f"negative (confidence :{1 - prediction[0][0]:.4f})" 

In [31]:
predict_sentiment("""let's get this out of the way first: I REALLY liked Monster! But, I wouldn't watch it again...

Monster is a down-to-Earth psychological crime drama. At first it might appear to have supernatural elements, but I won't spoil it for you. After watching the entire thing, I would say Monster would've been great as a live-action TV series, which couldn't be said for any other anime I watched so far (and I watched quite a lot). It has everything a great series needs, a cast of characters that the audience can care about, action-packed scenes to liven up the pretty in-depth crime drama, great character development, a good story full of twists and turn that will keep you coming back for more and an astonishing and realistic art-style to wrap it all together. During it's 74 episodes it brings up many hard questions about the human psyche, morality and human connections, relationships. How far are we willing to go to accomplish our goals? How much of your humanity are you willing to trade in for them? What is "humanity" anyway? Some of the episodes are frighteningly realistic in describing the human condition and it doesn't back down from touching really hard social and historical taboos either. No, it grabs you by the hair and slams your face in them saying "Look! That's what you are!". I can safely say Monster was one of the most unique and thought-provoking experiences I've ever had.

But, (and yes, here comes the "but") Monster is anything but perfect. While it's action-packed and suspenseful story would stand great on it's own, it's sadly spread too thin and too long. The story needlessly drags on for 74 episodes and the ending feels more like a coup de grace than closure. The cast of main characters is huge, and while they are really well made and fleshed out, we are continuously introduced to a slew of new side-characters that have barely any relevance to the main story (if at all). The writers regularly go off on tangents just to demonstrate a small plot point or tidbits of (mostly irrelevant) character backstory, bringing in and taking out characters on a whim after they "served their purpose". In the end, they were seemingly just struggling to give enough individual screen-time to their monstrously bloated cast (pun intended), and it only makes the audience lose interest in them and lose count on who is who why they are even there.

The story is mostly delivered in (sometimes painfully dragged out) exposition. The story lurches forward in needlessly detailed investigation sequences flooded with meaningless trivia and extra character backstory that have little to no bearing on the main story itself. The story regularly branches off into dead-ends and meaningless side-plots that fill entire episodes yet don't bring anything new or interesting to the table. The sheer amount of dialogue and narration in Monster would fill entire volumes of books. Even the exposition itself is riddled by double-takes, needlessly repeated "remembering" segments and a ton of redundant, rephrased information. You can seriously skip entire dozens of episodes and still understand everything since the characters and the exposition keep repeating themselves over and over. The whole series could've been distilled down to a neat 30-35 episodes without losing any of the story.

My third (entirely personal) beef is with the setting. Unlike most anime, the entire story of Monster takes place in 80's and 90's Europe (mostly Germany and the former Czechoslovakia). See, I was born and still living in Europe, I lived in the time and place the show takes place. Monster being a work of fiction, I chalked up most of the factual, cultural and historical errors to "writer's freedom" and "suspension of disbelief" and such. While the creators of the anime obviously did their homework and got most of the big things right, there were some things that bugged me more than they should've. Little, insignificant things that most people from other parts of the world would miss, were just screaming at me from the screen. Getting used to the obviously Japanese mannerisms, phrases and behaviors forced upon the allegedly European characters is one thing. But small details like choice of words, type of foods/drinks, fashion, architecture or even music in some places were just flat our wrong and felt so out of place that it shoved me right out of the immersion. I know it sounds lame, but since the creators obviously tried to recreate the setting realistically, I just couldn't help it.

I know I spent most of this review pandering on what's wrong with Monster, but the truth is, I really liked it, and I stand by my score of 8/10. It's really worth watching for everyone who desires something other than the run-of-mill anime, something unique. If you can overlook the droll exposition and sometimes aimlessly branching and dragged out storytelling, you'll find a really suspenseful and interesting story of crime and punishment, dark secrets, interesting characters, huge plot twists, thrilling psychological expeditions into the human mind and soul and much more.""")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step


'positive (confidence :0.9680)'

In [30]:
model.build(input_shape=(None, 236))
model.save('model_fixed.h5')

