In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import random
from sklearn.model_selection import KFold

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F

In [2]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
df = pd.read_csv('dataset/informal_formal.csv', delimiter=';',)
df

Unnamed: 0,informal,formal
0,alhamdulillah stlh libur xxxnumberxxx hari onb...,alhamdulillah setelah libur xxxnumberxxx hari ...
1,selamat sore min . saya mau pesan tiket ka via...,selamat sore admin . saya mau pesan tiket ka v...
2,iya kak terimakasih . tapi tadi sudah datang k...,iya kak terima kasih . tetapi tadi sudah datan...
3,malam min xxxuserxxx xxxuserxxx situs kalian e...,"malam admin xxxuserxxx xxxuserxxx , apakah sit..."
4,"min pembelian token pln apa ada kendala , ini ...","admin , pembelian token pln apa ada kendala ? ..."
...,...,...
1917,"halo bni , ini maintenance m banking nya seles...","halo bni , ini maintenance m - bankingnya sele..."
1918,"malam , mau komplain . paket statusnya sudah s...","malam , saya mau komplain . paket statusnya su..."
1919,sepertinya merchant emang sengaja pada gak mau...,sepertinya merchant memang sengaja tidak mau m...
1920,yaallah kaka kelas sma gua viral anjerr,ya allah kakak kelas sma saya viral !


In [4]:
df["set"] = df["informal"] + " <TWITHONBOT> " + df["formal"]
df['set'][122]

'min xxxuserxxx tolong respon dm saya dong <TWITHONBOT> admin xxxuserxxx tolong respon dm saya .'

In [5]:
len(df)

1922

In [6]:
#Create a very small test set to compare generated text with the reality
test_set = df.sample(n = 250)
df = df.loc[~df.index.isin(test_set.index)]

#Reset the indexes
test_set = test_set.reset_index()
df = df.reset_index()

In [7]:
test_set

Unnamed: 0,index,informal,formal,set
0,283,"halo maaf mau tanya , hape saya tiba tiba kart...","halo maaf mau tanya , handphone saya tiba - ti...","halo maaf mau tanya , hape saya tiba tiba kart..."
1,566,ternuata memang sinyal anda ga bisa masuk kama...,ternyata memang sinyal anda tidak bisa masuk k...,ternuata memang sinyal anda ga bisa masuk kama...
2,1089,"ga jadi min , tadi belum klaim voucher free on...","tidak jadi min , tadi belum klaim voucher grat...","ga jadi min , tadi belum klaim voucher free on..."
3,653,aku baru ngeh sekarang ovo tak kira oppo,"aku baru sadar sekarang ovo , kukira oppo .",aku baru ngeh sekarang ovo tak kira oppo <TWIT...
4,709,"tlg kalo ngasih voucher yg jelas donk , ini xx...",tolong kalau kasih voucher yang jelas dong . i...,"tlg kalo ngasih voucher yg jelas donk , ini xx..."
...,...,...,...,...
245,1108,xxxnumberxxx xxxnumberxxx rb ikhlasin aja sih ...,xxxnumberxxx ribu ikhlaskan saja kak hehe . mu...,xxxnumberxxx xxxnumberxxx rb ikhlasin aja sih ...
246,1789,tapi tadi kok masih mendlap mendlep ya sinyaln...,tetapi kenapa tadi sinyalnya masih tersendat -...,tapi tadi kok masih mendlap mendlep ya sinyaln...
247,595,invite donk biar bisa pm . terima kasih,"tolong undang , agar dapat mengirim pm . terim...",invite donk biar bisa pm . terima kasih <TWITH...
248,1294,"cek saha ma , so tau kamu mah","periksa apa , sok tahu kamu .","cek saha ma , so tau kamu mah <TWITHONBOT> per..."


In [8]:
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('flax-community/gpt2-small-indonesian')
model = GPT2Model.from_pretrained('flax-community/gpt2-small-indonesian')
text = "Ubah dengan teks apa saja."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

Some weights of the model checkpoint at flax-community/gpt2-small-indonesian were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, #maximum number of words
    top_p=0.8,
    temperature=1.,
):

    model.eval()

    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False

            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:

                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text)
                    break
            
            if not entry_finished:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}<|endoftext|>" 
              generated_list.append(output_text)
                
    return generated_list

In [10]:
test_text = "halo ini adalah bahasa indonesia"

In [12]:
from transformers import GPT2Tokenizer, TFGPT2Model

model_name='cahya/gpt2-small-indonesian-522M'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = TFGPT2Model.from_pretrained(model_name)
text = "Silakan diganti dengan text apa saja."
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)

All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at cahya/gpt2-small-indonesian-522M.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


In [None]:
generated_lyrics = text_generation(test_set)

In [16]:
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('flax-community/gpt2-small-indonesian')
model = GPT2Model.from_pretrained('flax-community/gpt2-small-indonesian')

Some weights of the model checkpoint at flax-community/gpt2-small-indonesian were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('flax-community/gpt2-small-indonesian')
model = GPT2Model.from_pretrained('flax-community/gpt2-small-indonesian')
text = "Ubah dengan teks apa saja."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

print(output)

Some weights of the model checkpoint at flax-community/gpt2-small-indonesian were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[-0.0243, -0.0605,  0.0373,  ...,  0.0103,  0.0534, -0.0128],
         [ 0.2732, -0.0324, -0.0040,  ..., -0.0562,  0.0267,  0.0755],
         [ 0.0856, -0.2399,  0.0405,  ..., -0.0944,  0.0725, -0.1918],
         ...,
         [ 0.1245, -0.0515, -0.0060,  ...,  0.0040, -0.0769, -0.0193],
         [ 0.1410,  0.0267, -0.0384,  ...,  0.0701, -0.0104, -0.1252],
         [-0.1113,  0.1198, -0.0190,  ..., -0.0569,  0.0683, -0.0039]]],
       grad_fn=<ViewBackward0>), past_key_values=((tensor([[[[-0.9720, -2.1799,  4.9807,  ..., -4.3905,  0.1080,  2.0316],
          [ 0.9425, -2.4704,  4.4142,  ..., -2.8004, -2.5450,  1.1033],
          [-0.5385, -1.9001,  3.7573,  ..., -4.0379, -3.0310,  2.3813],
          ...,
          [-0.4384, -0.1449,  4.5597,  ..., -4.9888, -2.1237,  1.9565],
          [ 1.3440, -1.6947,  5.0947,  ..., -1.4272, -4.7468,  0.4334],
          [-0.4054, -1.5495,  4.0458,  ..., -3.1271, -2.3324,  2.7667]],