In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import requests

from sklearn.model_selection import train_test_split
from pathlib import Path
from zipfile import ZipFile

In [4]:
MODEL_ID = "bert"

MODEL_DIR = Path("./models")
LOG_DIR = Path("./log")
DOWNLOAD_DIR = Path("./downloads")

TRAIN_PATH = Path("./data/train.csv")
TEST_PATH = Path("./data/test.csv")

EMBEDDING_DIMS = 300

SEED = 1
BATCH_SIZE = 64

In [7]:
MODEL_DIR.mkdir(parents = True, exist_ok = True)
LOG_DIR.mkdir(parents = True, exist_ok = True)
DOWNLOAD_DIR.mkdir(parents = True, exist_ok = True)

In [5]:
checkpoint_path = MODEL_DIR / (MODEL_ID + ".{epoch:03d}-{val_accuracy:.4f}.h5")
log_path = (LOG_DIR / MODEL_ID).with_suffix(".csv")

print(checkpoint_path)
print(log_path)

models/bert.{epoch:03d}-{val_accuracy:.4f}.h5
log/bert.csv


In [6]:
# Set global seed for reproducible results
tf.random.set_seed(SEED)

In [11]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## Preprocessing

In [3]:
df = pd.read_csv(TRAIN_PATH)
df.head()

Unnamed: 0,review_id,review,rating
0,0,Ga disappointed neat products .. Meletot Hilsn...,1
1,1,"Rdtanya replace broken glass, broken chargernya",1
2,2,Nyesel bngt dsni shopping antecedent photo mes...,1
3,3,Sent a light blue suit goods ga want a refund,1
4,4,Pendants came with dents and scratches on its ...,1


In [28]:
df[df['review'] == 'Good quality']

3.8181818181818183

In [10]:
df.shape

(146811, 3)

In [33]:
review_rating = dict()

for review, group in df[df.duplicated(subset=['review'], keep = False)].groupby("review"):
    review_rating[review] = group["rating"].mean()

review_rating

{'  Excellent product quality': 4.666666666666667,
 '  Free shipping product price good good good seller Response': 3.0,
 '  Good product quality': 3.3333333333333335,
 '  Product quality standards.': 2.0,
 ' Acceptable price': 2.0,
 ' Accommodating seller': 3.0,
 ' Accommodating seller Good quality': 3.0,
 ' Accommodating seller Item shipped quickly': 3.0,
 ' Accommodating seller Well-packaged': 3.0,
 ' Accommodating seller Well-packaged Item shipped quickly': 3.0,
 ' Are not worth the money': 1.0,
 ' Awesome awesome merchandise quality merchandise quality': 4.464788732394366,
 ' Awesome awesome merchandise quality merchandise quality merchandise awesome awesome quality goods quality': 4.0,
 ' Awesome awesome merchandise quality merchandise quality merchandise awesome quality': 4.5,
 ' Awesome awesome product quality CP CP value of the value of awesome awesome service': 4.5,
 ' Awesome awesome quality goods CP value awesome service': 4.457831325301205,
 ' Awesome awesome quality goods

In [16]:
def remove_non_ascii(text):
    return ''.join(i for i in text if ord(i) < 128)

def process_text(text):
    text = remove_non_ascii(text)                # remove non-ascii
    text = text.lower()                          # lowercase
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)   # remove all non-alphanum chars
    text = text.strip()                          # remove trailing whitespaces
    text = re.sub(r'\s+', ' ', text)             # remove multiple whitespaces
    return text

In [17]:
def remove_stopwords(text, stop_words):
    return list(filter(lambda word: word not in stop_words, text))

In [18]:
def get_wordnet_pos(word):
    # Map POS tag to first character lemmatize() accepts
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                "N": nltk.corpus.wordnet.NOUN,
                "V": nltk.corpus.wordnet.VERB,
                "R": nltk.corpus.wordnet.ADV}

    return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)

def lemmatize_tokens(tokens, lemmatizer):
    lemmatized = []
    for token in tokens:
        lemmatized.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return " ".join(lemmatized)

In [20]:
def process_column(col):
    stop_words = set(nltk.corpus.stopwords.words("english"))
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

    col = col.apply(lambda x: process_text(x))
    col = col.apply(lambda x: nltk.tokenize.word_tokenize(x))
    col = col.apply(lambda x: remove_stopwords(x, stop_words))
    col = col.apply(lambda x: lemmatize_tokens(x, wordnet_lemmatizer))

    return col

if not CLEAN_PATH.exists():
    df['review'] = process_column(df['review'])
    df = df.dropna()
    df = df[df['review'].map(lambda x: len(str(x)) > 3)]
    df.to_csv(CLEAN_PATH, index = False)
    print("Output to {}".format(CLEAN_PATH))

Output to data/clean.csv


In [21]:
df.head()

Unnamed: 0,review_id,review,rating
0,0,ga disappointed neat product meletot hilsnyaa ...,1
1,1,rdtanya replace broken glass broken chargernya,1
2,2,nyesel bngt dsni shopping antecedent photo mes...,1
3,3,sent light blue suit good ga want refund,1
4,4,pendant come dent scratch surface coat look li...,1


In [22]:
df.shape

(146624, 3)

## Preparing Data

In [23]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['review'])
vocab_size = len(tokenizer.word_index) + 1

print("Top 10 Words")
print(list(tokenizer.word_index.items())[:10])
print()
print("Bottom 10 Words")
print(list(tokenizer.word_index.items())[-10:])

Top 10 Words
[('good', 1), ('product', 2), ('quality', 3), ('delivery', 4), ('seller', 5), ('price', 6), ('speed', 7), ('excellent', 8), ('awesome', 9), ('fast', 10)]

Bottom 10 Words
[('kemvaliannya', 66587), ('paketanmakasih', 66588), ('kaklancar', 66589), ('jualannyasemakin', 66590), ('nyaharganyas', 66591), ('dikntong', 66592), ('wid19', 66593), ('brosny', 66594), ('supercuteee', 66595), ('baguuuuuuuuuuuus', 66596)]


In [24]:
sequences = tokenizer.texts_to_sequences(df['review'])
max_seq_len = 0
for seq in sequences:
    if len(seq) > max_seq_len:
        max_seq_len = len(seq)

padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen = max_seq_len, padding='post')

print(max_seq_len)
print(padded_sequences.shape)

174
(146624, 174)


In [25]:
df['zero_idx_rating'] = df['rating'] - 1
one_hot_ratings = tf.keras.utils.to_categorical(df['zero_idx_rating'], num_classes = 5)
one_hot_ratings.shape

(146624, 5)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    one_hot_ratings,
    test_size = 0.2,
    random_state = SEED
)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(117299, 174) (29325, 174)
(117299, 5) (29325, 5)


In [27]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)) \
                        .cache() \
                        .shuffle(BATCH_SIZE * 40, seed = SEED) \
                        .repeat() \
                        .batch(BATCH_SIZE) \
                        .prefetch(tf.data.experimental.AUTOTUNE)

In [28]:
validation_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)) \
                        .batch(BATCH_SIZE) \
                        .prefetch(tf.data.experimental.AUTOTUNE)

## Training Model

In [30]:
model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIMS, input_length = max_seq_len))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(EMBEDDING_DIMS)))
model.add(tf.keras.layers.Dense(EMBEDDING_DIMS, activation = 'relu'))
model.add(tf.keras.layers.Dense(5, activation='softmax'))

model.compile(
    optimizer = tf.keras.optimizers.Adam(),
    loss = tf.keras.losses.CategoricalCrossentropy(),
    metrics = ["accuracy"]
)

In [31]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 174, 300)          19979100  
_________________________________________________________________
bidirectional (Bidirectional (None, 600)               1442400   
_________________________________________________________________
dense (Dense)                (None, 300)               180300    
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 1505      
Total params: 21,603,305
Trainable params: 1,624,205
Non-trainable params: 19,979,100
_________________________________________________________________


In [32]:
history = model.fit(
    train_ds,
    epochs = 50,
    steps_per_epoch = len(X_train) // BATCH_SIZE,
    validation_data = validation_ds,
    validation_steps = len(X_test) // BATCH_SIZE,
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            patience = 3,
            restore_best_weights = True,
            verbose = 1,
            monitor = "val_accuracy",
            mode = "max"
        ),
        tf.keras.callbacks.CSVLogger(str(log_path), append = True),
        tf.keras.callbacks.ModelCheckpoint(
            str(checkpoint_path),
            monitor = "val_accuracy",
            mode = "max",
            save_best_only = True
        )
    ]
)

Epoch 1/10
Epoch 2/10
  29/1832 [..............................] - ETA: 3:47 - loss: 1.1267 - accuracy: 0.4467

KeyboardInterrupt: ignored

In [33]:
acc = history.history["accuracy"]
loss = history.history["loss"]
validation_acc = history.history["val_accuracy"]
validation_loss = history.history["val_loss"]

epochs_range = range(len(history.history["accuracy"]))

plt.figure(figsize = (8, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label = "Training Accuracy")
plt.plot(epochs_range, validation_acc, label = "Validation Accuracy")
plt.legend(loc = "lower right")
plt.title("Training and Validation Accuracy")

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label = "Training Loss")
plt.plot(epochs_range, validation_loss, label = "Validation Loss")
plt.legend(loc = "lower right")
plt.title("Training and Validation Loss")
plt.show()

NameError: ignored

In [35]:
test_df = pd.read_csv(TEST_PATH)
test_df['review'] = process_column(test_df['review'])
test_df.head()

Unnamed: 0,review_id,review
0,1,great danger cool motif cantik2 jg model deliv...
1,2,one shade dont fit well
2,3,comfortable
3,4,fast delivery product expiry dec 2022 product ...
4,5,sooooo cute like play glitter well browsing ph...


In [38]:
test_sequences = tokenizer.texts_to_sequences(test_df['review'])
test_padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen = max_seq_len, padding='post')
test_padded_sequences

array([[  77,  916,  123, ...,    0,    0,    0],
       [  50, 1566,  549, ...,    0,    0,    0],
       [ 201,    0,    0, ...,    0,    0,    0],
       ...,
       [   2,    3,    8, ...,    0,    0,    0],
       [ 596,   29,   20, ...,    0,    0,    0],
       [ 862,   13,  972, ...,    0,    0,    0]], dtype=int32)

In [44]:
predicted_ratings = model.predict(test_padded_sequences)
predicted_ratings_cat = np.argmax(predicted_ratings, axis = 1) + 1
predicted_ratings_cat

array([3, 3, 4, ..., 4, 4, 4])

In [45]:
output_df = test_df.copy()
output_df['rating'] = predicted_ratings_cat
output_df.head()

Unnamed: 0,review_id,review,rating
0,1,great danger cool motif cantik2 jg model deliv...,3
1,2,one shade dont fit well,3
2,3,comfortable,4
3,4,fast delivery product expiry dec 2022 product ...,4
4,5,sooooo cute like play glitter well browsing ph...,4


In [51]:
output_df = output_df.drop('review', axis = 1)
output_df.to_csv("./data/predictions.csv", index = False)