In [1]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK corpora at import time. 'stopwords' backs the
# stopwords.words('english') lookup used during preprocessing.
nltk.download('stopwords')
# NOTE(review): 'wordnet' and 'omw-1.4' are not referenced by any cell
# visible in this notebook -- presumably left over from a lemmatization
# experiment; confirm before removing.
nltk.download('wordnet')
nltk.download('omw-1.4')
import json

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Fast\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Fast\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Fast\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Load the tab-separated review file; column names are supplied explicitly
# as 'sentence' (review text) and 'label'.
df = pd.read_csv(
    'yelp_labelled.txt',
    sep='\t',
    names=['sentence', 'label'],
)
# Display the loaded frame as the cell output.
df

Unnamed: 0,sentence,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


## Preprocessing

In [4]:
# Preprocessing settings, named once so they are not repeated as bare literals.
VOCAB_SIZE = 2000    # num_words passed to the tokenizer
MAX_LEN = 20         # every sequence is padded/truncated to this length
OOV_TOKEN = "<OOV>"  # placeholder token for out-of-vocabulary words


def remove_stopwords(text, stop_words):
    """Return `text` with every whitespace-separated word found in `stop_words` dropped.

    Words are compared exactly as produced by `str.split()`, so a word with
    punctuation still attached (e.g. "it.") is not treated as a stopword.
    """
    return ' '.join(word for word in text.split() if word not in stop_words)


def to_padded(sequences):
    """Pad/truncate integer sequences at the end to exactly MAX_LEN entries."""
    return pad_sequences(sequences,
                         maxlen=MAX_LEN,
                         padding='post',
                         truncating='post')


# Convert all text to lowercase
df['sentence'] = df['sentence'].str.lower()

# Remove stopwords
# NOTE(review): the NLTK English stopword list includes negations such as
# "not"/"no", so "crust is not good." becomes "crust good." -- consider
# keeping negations for sentiment classification; confirm intent.
stop_word = set(stopwords.words('english'))

df['sentence'] = df['sentence'].apply(lambda x: remove_stopwords(x, stop_word))

# Split the dataset (no shuffling: the last 20% of rows become the test set)
sentence = df['sentence'].values
label = df['label'].values

sentence_train, sentence_test, label_train, label_test = train_test_split(
    sentence, label, test_size=0.2, shuffle=False)

# Build the tokenizer
# Characters stripped by the tokenizer. Written as a raw string so that the
# backslash before ']' is a literal backslash rather than an invalid escape
# sequence (which triggers a warning on recent Python versions); the
# resulting string value is unchanged.
filt = r'!"#$%&()*+.,-/:;=?@[\]^_`{|}~ '

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN, filters=filt)

# The vocabulary is fitted on the training sentences only.
tokenizer.fit_on_texts(sentence_train)

# Save word_index to a JSON file
word_index = tokenizer.word_index

with open('word_index.json', 'w') as fp:
    json.dump(word_index, fp)

# Create the integer sequences and pad them
train_sekuens = tokenizer.texts_to_sequences(sentence_train)
test_sekuens = tokenizer.texts_to_sequences(sentence_test)

train_padded = to_padded(train_sekuens)
test_padded = to_padded(test_sekuens)

In [5]:
# Fix TensorFlow's global seed so that weight initialisation (and therefore
# the training curve) is repeatable across Restart & Run All.
tf.random.set_seed(42)

# Derive the input dimensions from the preprocessing artefacts instead of
# repeating the literals, so the model cannot drift out of sync with them.
vocab_size = tokenizer.num_words          # size of the embedding lookup table
sequence_length = train_padded.shape[1]   # padded length of every input row
embedding_dim = 20

# Build the model: embedding -> average over the sequence -> two hidden dense
# layers -> single sigmoid unit for the binary label.
model = tf.keras.Sequential([
    Embedding(vocab_size, embedding_dim, input_length=sequence_length),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
# NOTE(review): the held-out test split doubles as validation data here, so
# the reported validation accuracy is not an independent test estimate.
num_epochs = 30
history = model.fit(train_padded, label_train,
                    epochs=num_epochs,
                    validation_data=(test_padded, label_test),
                    verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [6]:
# Persist the trained model to disk (file name carries the .h5 extension).
saved_model_path = "model.h5"
model.save(saved_model_path)

In [7]:
# Install tensorflowjs
!pip install tensorflowjs
 
# Convert model.h5 to model
!tensorflowjs_converter --input_format=keras model.h5 tfjs_model

Collecting tensorflowjs
  Using cached tensorflowjs-4.1.0-py3-none-any.whl (84 kB)
  Using cached tensorflowjs-4.0.0-py3-none-any.whl (83 kB)
  Using cached tensorflowjs-3.21.0-py3-none-any.whl (81 kB)
Collecting jax>=0.3.16
  Using cached jax-0.3.25.tar.gz (1.1 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting flax>=0.5.3
  Using cached flax-0.6.2-py3-none-any.whl (189 kB)
Collecting protobuf<3.20,>=3.9.2
  Using cached protobuf-3.19.6-cp39-cp39-win_amd64.whl (895 kB)
Collecting importlib_resources>=5.9.0
  Using cached importlib_resources-5.10.0-py3-none-any.whl (34 kB)
Collecting optax
  Using cached optax-0.1.4-py3-none-any.whl (154 kB)
Collecting msgpack
  Using cached msgpack-1.0.4-cp39-cp39-win_amd64.whl (62 kB)
Collecting tensorstore
  Using cached tensorstore-0.1.28-cp39-cp39-win_amd64.whl (6.5 MB)
Collecting optax
  Using cached optax-0.1.3-py3-none-any.whl (145 kB)
  Using cached optax-0.1.2-py3-none-any.whl 

2022-12-01 19:03:58.691943: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-12-01 19:03:58.692483: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
