# Food & Drink Description Generator in Bahasa using LSTM and Fine-tuned GPT2
By: Rizky Adi

## Preparation

### Import Library

In [70]:
import sys
import nltk
import re
import string
import pickle
import pandas as pd
import numpy as np

from keras.utils import pad_sequences, np_utils
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Dropout, Embedding
from keras.optimizers import Adam
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import GPT2TokenizerFast, AutoModelForCausalLM, GPT2LMHeadModel, TrainingArguments, Trainer, set_seed

### Set Transformers Seed

In [4]:
set_seed(99)

## Datasets
In this experiment the datasets that will be used is [Indonesia food delivery Gofood product list](https://www.kaggle.com/datasets/ariqsyahalam/indonesia-food-delivery-gofood-product-list) by Reyhan Ariq Syahalam

### Load Dataset

In [5]:
df = pd.read_csv('datasets/gofood_dataset.csv')
df.head()

Unnamed: 0,merchant_name,merchant_area,category,display,product,price,discount_price,isDiscount,description
0,"330 Kopi, Ciledug",jakarta,Kopi/Minuman/Roti,Signature,Hot Almara Kopi (kopi Susu Gula Aren),20000.0,,0,Sajian Kopi Susu Gula Aren Yang Berbeda Dari K...
1,"330 Kopi, Ciledug",jakarta,Kopi/Minuman/Roti,Signature,Ice Almara Kopi (kopi Susu Gula Aren),22000.0,,0,Sajian Kopi Susu Gula Aren Yang Berbeda Dari K...
2,"330 Kopi, Ciledug",jakarta,Kopi/Minuman/Roti,Signature,Hot Millsis,20000.0,,0,Sajian Susu Coklat Milo Dengan Racikan Khas 3 ...
3,"330 Kopi, Ciledug",jakarta,Kopi/Minuman/Roti,Signature,Ice Millsis,20000.0,,0,Sajian Susu Coklat Milo Dengan Racikan Khas 3 ...
4,"330 Kopi, Ciledug",jakarta,Kopi/Minuman/Roti,Signature,Hot Millbro,22000.0,,0,Sajian Susu Coklat Milo Plus Espresso Dengan R...


In [6]:
df = df.description
df.head()

0    Sajian Kopi Susu Gula Aren Yang Berbeda Dari K...
1    Sajian Kopi Susu Gula Aren Yang Berbeda Dari K...
2    Sajian Susu Coklat Milo Dengan Racikan Khas 3 ...
3    Sajian Susu Coklat Milo Dengan Racikan Khas 3 ...
4    Sajian Susu Coklat Milo Plus Espresso Dengan R...
Name: description, dtype: object

### Drop na value

In [7]:
df.isna().sum()

23475

In [8]:
df = df.dropna()
df.isna().sum()

0

## LSTM

### Preprocessing

#### Cleaning Data

In [9]:
def clean(data):
    cleaned_data = []
    unique_words = []

    for description in data:
        description = re.sub('[^a-zA-Z]', ' ', description)
        description = re.sub(r'\b\w{0,1}\b', ' ', description)
        description = description.lower().strip()
        description = description.split()
        cleaned_data.append(description)
        unique = list(set(description))
        unique_words.extend(unique)

    
    unique_words = set(unique_words)

    return cleaned_data, unique_words, len(unique_words)

In [10]:
cleaned_data, unique_words, len_unique_words = clean(df)

In [11]:
print(f'Cleaned data example:\n{cleaned_data[0]}\n')
print(f'Number of unique words: {len_unique_words}')

Cleaned data example:
['sajian', 'kopi', 'susu', 'gula', 'aren', 'yang', 'berbeda', 'dari', 'khalayak', 'umum']

Number of unique words: 6073


#### Building Word to Index and Index to Word Dictionary

In [13]:
def word_and_index_dict(unique_words):
    word_to_idx = {}
    idx_to_word = {}
    for i, word in enumerate(unique_words):
        word_to_idx[word] = i
        idx_to_word[i] = word
    return word_to_idx, idx_to_word

In [14]:
word_to_idx, idx_to_word = word_and_index_dict(unique_words)

In [None]:
with open('word_to_idx.pkl', 'wb') as f:
    pickle.dump(word_to_idx, f)

with open('idx_to_word.pkl', 'wb') as f:
    pickle.dump(idx_to_word, f)

#### Building Input Sequences

In [15]:
def prepare_corpus(data, word_to_idx):
    sequences = []
    for line in data:
        tokens = line
        for i in range(1, len(tokens)):
            i_gram_sequence = tokens[:i+1]
            i_gram_sequence_ids = []
            
            for j, token in enumerate(i_gram_sequence):
                i_gram_sequence_ids.append(word_to_idx[token])
                
            sequences.append(i_gram_sequence_ids)
    return sequences

In [16]:
sequences = prepare_corpus(cleaned_data, word_to_idx)

In [17]:
max_sequence_len = max([len(x) for x in sequences])
max_sequence_len

40

#### Creating x and y Input

In [18]:
sequences = np.array(pad_sequences(sequences, maxlen = max_sequence_len, padding = 'pre'))
x = sequences[:,:-1]
y = sequences[:,-1]
y = np_utils.to_categorical(y, len_unique_words)

In [19]:
print(x.shape)
print(y.shape)

(153383, 39)
(153383, 6073)


### Building and Training Model

In [20]:
def create_model(max_sequence_len, len_unique_words):
    model = Sequential()
    model.add(Embedding(len_unique_words, 10, input_length=max_sequence_len - 1))
    model.add(LSTM(128))
    model.add(Dropout(0.2))
    adam = Adam(learning_rate=0.01)
    model.add(Dense(len_unique_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [21]:
model = create_model(max_sequence_len, len_unique_words)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 39, 10)            60730     
                                                                 
 lstm (LSTM)                 (None, 128)               71168     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 6073)              783417    
                                                                 
Total params: 915,315
Trainable params: 915,315
Non-trainable params: 0
_________________________________________________________________


In [23]:
model.fit(x, y, batch_size = 512, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f5b70c5a990>

### Save Trained Model

In [25]:
model.save('models/lstm')



### Model Inference

In [80]:
model = load_model('models/lstm')

In [81]:
def generate_text(seed_text, next_words, model, max_seq_len):
    for _ in range(next_words):
        cleaned_data = clean([seed_text])
        sequences= prepare_corpus(cleaned_data[0], word_to_idx)
        sequences = pad_sequences([sequences[-1]], maxlen=max_seq_len-1, padding='pre')
        predict_x=model.predict(sequences, verbose=0) 
        classes_x=np.argmax(predict_x,axis=1)
        output_word = ''
        output_word = idx_to_word[classes_x[0]]            
        seed_text = seed_text + " " + output_word
    
    return seed_text.title()

In [82]:
print(generate_text("Kopi hitam", 20, model, max_sequence_len))

Kopi Hitam Seduh Dengan Rasa Avocado Yang Yummy Dan Sangat Menyegarkan Jika Gelas Oz Suweger Sangat Nikmat Dan Disajikan Dalam Kemasan Ziplock


## Fine-tuned GPT2
Pretrained GPT2 model that will be fine-tuned in this experiment is [gpt2-small-indonesian-522M](https://huggingface.co/cahya/gpt2-small-indonesian-522M) by Cahya Wirawan

### Preprocessing

#### Cleaning Data

In [41]:
df[df.str.contains("Item Oden")]

6686    Item Oden : Fish Stick, Tahu Baso Ikan, Chikuw...
6687    Item Oden : Lobster Ball, Chikuwa Jumbo, Sioma...
6688    Item Oden : Odeng, Spicy Eomuk. Item RTD: Ice ...
6689    Item Oden : Tteokbokki. Item RTD: Ice Honey Yu...
6690    Item Oden : Cheesebokki.  RTD : Ice Honey Yuzu...
6869    Item Oden : Fish Stick, Tahu Baso Ikan, Chikuw...
6870    Item Oden : Lobster Ball, Chikuwa Jumbo, Sioma...
6871    Item Oden : Odeng, Spicy Eomuk. Item RTD: Ice ...
6872    Item Oden : Tteokbokki. Item RTD: Ice Honey Yu...
6873    Item Oden : Cheesebokki.  RTD : Ice Honey Yuzu...
7052    Item Oden : Fish Stick, Tahu Baso Ikan, Chikuw...
7053    Item Oden : Lobster Ball, Chikuwa Jumbo, Sioma...
7054    Item Oden : Odeng, Spicy Eomuk. Item RTD: Ice ...
7055    Item Oden : Tteokbokki. Item RTD: Ice Honey Yu...
7056    Item Oden : Cheesebokki.  RTD : Ice Honey Yuzu...
7235    Item Oden : Fish Stick, Tahu Baso Ikan, Chikuw...
7236    Item Oden : Lobster Ball, Chikuwa Jumbo, Sioma...
7237    Item O

In [42]:
dataset = df[df.str.contains("Item Oden") == False]

In [43]:
print(len(dataset[dataset.str.contains("Item Oden")]))

0


#### Splitting Dataset

In [44]:
x_train, x_validation = train_test_split(dataset, test_size=0.2, random_state=99)

#### Save Splitted Dataset

In [45]:
x_train.to_csv('datasets/train.csv', index=False)
x_validation.to_csv('datasets/validation.csv', index=False)

#### Building Transformers Datasets

In [46]:
datasets = load_dataset("csv", data_files={"train": "datasets/train.csv", "validation": "datasets/validation.csv"})



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-290dba31c37f12ca/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

  

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-290dba31c37f12ca/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

#### Tokenization

In [47]:
tokenizer = GPT2TokenizerFast.from_pretrained('cahya/gpt2-small-indonesian-522M')

Downloading:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/894k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/452k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/357 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/573 [00:00<?, ?B/s]

In [48]:
def tokenize_function(examples):
    return tokenizer(examples['description'])

In [49]:
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["description"])

       

#0:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ba/s]

#2:   0%|          | 0/5 [00:00<?, ?ba/s]

#3:   0%|          | 0/5 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

In [58]:
tokenized_datasets["train"][8]

{'input_ids': [34, 452, 8969, 16221, 12, 25153, 474, 3420, 12, 33633, 14],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [60]:
tokenizer.decode(tokenized_datasets["train"][8]["input_ids"])

'Bubble Gum, Cocopandan, Susu.'

#### Prepare Input 
(Concatenate all text together and then split in small chunks of a certain `block_size`)

In [61]:
# block_size = tokenizer.model_max_length
block_size = 128

In [62]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [63]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

        

#2:   0%|          | 0/5 [00:00<?, ?ba/s]

#3:   0%|          | 0/5 [00:00<?, ?ba/s]

#0:   0%|          | 0/5 [00:00<?, ?ba/s]

#1:   0%|          | 0/5 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

In [67]:
len(tokenized_datasets["train"][0]["input_ids"])

17

In [66]:
len(lm_datasets["train"][0]["input_ids"])

128

### Fine-tuning Model

In [68]:
model = AutoModelForCausalLM.from_pretrained('cahya/gpt2-small-indonesian-522M')

Downloading:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [71]:
training_args = TrainingArguments(
    "models/gpt2-small-indonesian-522M-finetuned-gofood",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10
)

In [72]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [73]:
trainer.train()

***** Running training *****
  Num examples = 2073
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2600
  Number of trainable parameters = 124439808


Epoch,Training Loss,Validation Loss
1,No log,3.528789
2,3.907600,3.047115
3,3.907600,2.785153
4,2.683100,2.627034
5,2.683100,2.526018
6,2.209600,2.454141
7,2.209600,2.409188
8,1.950900,2.376409
9,1.950900,2.359908
10,1.803400,2.355932


***** Running Evaluation *****
  Num examples = 515
  Batch size = 8
Saving model checkpoint to models/gpt2-small-indonesian-522M-finetuned-gofood/checkpoint-500
Configuration saved in models/gpt2-small-indonesian-522M-finetuned-gofood/checkpoint-500/config.json
Model weights saved in models/gpt2-small-indonesian-522M-finetuned-gofood/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 515
  Batch size = 8
***** Running Evaluation *****
  Num examples = 515
  Batch size = 8
Saving model checkpoint to models/gpt2-small-indonesian-522M-finetuned-gofood/checkpoint-1000
Configuration saved in models/gpt2-small-indonesian-522M-finetuned-gofood/checkpoint-1000/config.json
Model weights saved in models/gpt2-small-indonesian-522M-finetuned-gofood/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 515
  Batch size = 8
***** Running Evaluation *****
  Num examples = 515
  Batch size = 8
Saving model checkpoint to models/gpt2-small-indon

TrainOutput(global_step=2600, training_loss=2.482543135422927, metrics={'train_runtime': 1003.587, 'train_samples_per_second': 20.656, 'train_steps_per_second': 2.591, 'total_flos': 1354145955840000.0, 'train_loss': 2.482543135422927, 'epoch': 10.0})

### Save Fine-tuned Model

In [74]:
trainer.save_model('models/finetuned-gpt2')

Saving model checkpoint to models/finetuned-gpt2
Configuration saved in models/finetuned-gpt2/config.json
Model weights saved in models/finetuned-gpt2/pytorch_model.bin


### Model Inference

In [83]:
model = GPT2LMHeadModel.from_pretrained('models/finetuned-gpt2', pad_token_id=tokenizer.eos_token_id)
tokenizer = GPT2TokenizerFast.from_pretrained('cahya/gpt2-small-indonesian-522M')

loading configuration file models/finetuned-gpt2/config.json
Model config GPT2Config {
  "_name_or_path": "cahya/gpt2-small-indonesian-522M",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 0,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_size": 50257
}

loading weights file models/finetuned-gpt2/pytorch_model.bin
All model

In [76]:
def generate_text_gpt2(text, tokenizer, model, max_len):
    encoded_input = tokenizer.encode(text, return_tensors='pt')

    beam_outputs = model.generate(
        encoded_input, 
        max_length=max_len, 
        num_beams=5,  
        num_return_sequences=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

    value = []
    for i, beam_output in enumerate(beam_outputs):
        value.append(tokenizer.decode(beam_output, skip_special_tokens=True))

    return value

In [84]:
results = generate_text_gpt2("Kopi hitam", tokenizer, model, 20)

print("Output:\n" + 100 * '-')
for i, result in enumerate(results):
    print(f'{i+1}: {result}')

Output:
----------------------------------------------------------------------------------------------------
1: Kopi hitam dipadukan dengan gula aren murni, tapioca pearl dan homemade macchiato
2: Kopi hitam dipadukan dengan susu murni, hokkaido milk pudding dan biscuit bis
3: Kopi hitam dipadukan dengan susu murni, hokkaido milk pudding dan homemade
4: Kopi hitam dipadukan dengan gula aren, tapioca pearl dan homemade macchiato br
5: Kopi hitam dipadukan dengan gula aren murni, tapioca pearl, susu murni dan homemade
