## Import Modules

In [1]:
import os
import pandas as pd
import pathlib
import json
import numpy as np
import json
import struct
import subprocess
import torch
import re
import unicodedata
import nltk
import spacy

from datasets import load_dataset
from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModel,
    AutoModelForSequenceClassification,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
    TextDataset,
    EvalPrediction,
    DataCollatorWithPadding,
    GenerationConfig,
    BitsAndBytesConfig,
    MarianMTModel,
    MarianTokenizer,
    T5Tokenizer,
    T5ForConditionalGeneration
)

from nltk.corpus import stopwords
from contractions import fix
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from textblob import TextBlob
from spellchecker import SpellChecker

if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

  from .autonotebook import tqdm as notebook_tqdm



GPU is available!


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
MAIN_PATH = str(pathlib.Path().resolve())
DATASET_PATH = MAIN_PATH + '\\datasets'
MODEL_PATH = MAIN_PATH + '\\models'
MODELS = 'D:\\AI\\LLM\\models'

## Set Up Model Path

In [4]:
models = os.listdir(MODEL_PATH)
models

['bart-large-cnn',
 'bert-base-cased',
 'bert-base-multilingual-cased',
 'bert-base-uncased',
 'bert-large-cased',
 'bert-large-uncased',
 'flan-t5-base',
 'flan-t5-large',
 'flan-t5-small',
 'gpt2',
 'gpt2-large',
 'gpt2-medium',
 'opus-mt-en-id',
 'opus-mt-id-en',
 't5-base',
 't5-large']

## Select Dataset

In [5]:
counter = 0
n = 20

path_list = []

for dirpath, dirnames, filenames in os.walk(DATASET_PATH):
    for filename in filenames:
        print(dirpath + '/' + filename)
        path_list.append(dirpath + '/' + filename)
    counter += 1
    if counter >= n:
        break

D:\Python\LLM_Environment\datasets/Australian_Legal_Corpus.jsonl
D:\Python\LLM_Environment\datasets/cached_lm_GPT2Tokenizer_128_Shakespeare_Dataset.txt
D:\Python\LLM_Environment\datasets/Customer.csv
D:\Python\LLM_Environment\datasets/Html.csv
D:\Python\LLM_Environment\datasets/Hukum_Online_Klinik.xlsx
D:\Python\LLM_Environment\datasets/Recipes.csv
D:\Python\LLM_Environment\datasets/Recipes_1000.csv
D:\Python\LLM_Environment\datasets/Shakespeare_Dataset.txt
D:\Python\LLM_Environment\datasets/Taylor_Swift_Lyrics.csv
D:\Python\LLM_Environment\datasets/Twitter.csv


In [6]:
file_format = '.xlsx'

file_paths = []

for root, dirs, files in os.walk(DATASET_PATH):
    for file in files:
        if file.endswith(file_format):
            file_paths.append(os.path.join(root, file))
        if len(file_paths) >= n:
            break
    if len(file_paths) >= n:
        break
file_paths[0]

'D:\\Python\\LLM_Environment\\datasets\\Hukum_Online_Klinik.xlsx'

In [7]:
file_path = file_paths[0]
file_path

'D:\\Python\\LLM_Environment\\datasets\\Hukum_Online_Klinik.xlsx'

## Read Dataset

### Read as CSV

In [8]:
try:
    df = pd.read_csv(file_path, encoding = 'ISO-8859-1')#, sep = ';')
    print("DataFrame shape:", df.shape)
    display(df.head())
except:
    pass

### Read as Excel

In [9]:
try:
    df = pd.read_excel(file_path)
    print("DataFrame shape:", df.shape)
    display(df.head())
except:
    pass

DataFrame shape: (8371, 5)


Unnamed: 0,Title,Link,Question,Summary,Answer
0,PengertianOvermachtdanUltimum Remediumdalam Hu...,https://www.hukumonline.com/klinik/a/pengertia...,Bagaimana ketentuanovermachtdalam hukum pidana...,Daya paksa atauovermachtdalam hukum pidana mer...,KLINIK TERKAITDaya Paksa dan Pembelaan Terpaks...
1,Ini 6 Perbedaan CV dan PT yang Wajib Diketahui,https://www.hukumonline.com/klinik/a/perbedaan...,Saya punya tiga pertanyaan. Apa perbedaan PT d...,PT adalah singkatan dari perseroan terbatas ya...,Terima kasih atas pertanyaan Anda.Artikel di b...
2,"KDRT Hingga Meninggal, Penganiayaan atau Pembu...",https://www.hukumonline.com/klinik/a/kdrt-hing...,"Belakangan ini, viral kasus seorang suami di S...",Pasal KDRT atau kekerasan dalam rumah tangga y...,Terima kasih atas pertanyaan Anda.Artikel di b...
3,7 Cara Memperoleh Wilayah dalam Hukum Internas...,https://www.hukumonline.com/klinik/a/7-cara-me...,Konflik Palestina-Israel masih berlarut-larut ...,"Pada dasarnya, terdapat 7 cara memperoleh wila...",Terima kasih atas pertanyaan Anda.Seluruh info...
4,"Pacar Tidak Mau Bertanggung Jawab, Kapan Maksi...",https://www.hukumonline.com/klinik/a/pacar-tid...,Saya ingin menanyakan bila laki-laki dan perem...,Jika terjadi persetubuhan antara sepasang keka...,Terima kasih atas pertanyaan Anda.Artikel di b...


### Read as JSON

In [10]:
counter = 0
row = 20

try:
    with open(file_path, 'r') as file:
        for line in file:
            data = json.loads(line)
            print(data)
            counter += 1

            if counter >= row:
                break
except:
    pass

In [11]:
n = 50
chunk_size = 1000
total_rows_processed = 0

data_frames = []

try:
    with open(file_path, 'r') as file:
        counter = 0

        while counter < n:
            data_list = []

            for _ in range(chunk_size):
                line = file.readline()
                if not line:
                    break

                data = json.loads(line)
                data_list.append(data)

            if data_list:
                df_chunk = pd.DataFrame(data_list)
                data_frames.append(df_chunk)

                print(f'Processed chunk with {len(df_chunk)} rows.')
                total_rows_processed += len(df_chunk)
                counter += 1

            else:
                break

    print(f'Total rows processed: {total_rows_processed}')

    # (Optional) Concatenate all DataFrames if needed
    if data_frames:  # Check if there are any DataFrames to concatenate
        df = pd.concat(data_frames, ignore_index = True)
        print('Concatenated DataFrame shape:', df.shape)
        display(df.head())
except:
    pass

### Read as Parquet

In [12]:
try:
    df = pd.read_parquet(file_path)
    print("DataFrame shape:", df.shape)
    display(df.head())
except:
    pass

## 04. Translation

In [13]:
dfa = df.iloc[:10]
dfa.shape

(10, 5)

In [14]:
models

['bart-large-cnn',
 'bert-base-cased',
 'bert-base-multilingual-cased',
 'bert-base-uncased',
 'bert-large-cased',
 'bert-large-uncased',
 'flan-t5-base',
 'flan-t5-large',
 'flan-t5-small',
 'gpt2',
 'gpt2-large',
 'gpt2-medium',
 'opus-mt-en-id',
 'opus-mt-id-en',
 't5-base',
 't5-large']

In [15]:
model_path = MODEL_PATH + '\\' + models[8]
model_path

'D:\\Python\\LLM_Environment\\models\\flan-t5-small'

In [16]:
model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype = torch.float16).to('cuda')
tokenizer = T5Tokenizer.from_pretrained(model_path)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


### Count Token Length

In [17]:
text = dfa.iloc[0][4]
text

  text = dfa.iloc[0][4]


'KLINIK TERKAITDaya Paksa dan Pembelaan Terpaksa sebagai Alasan Penghapus Pidana11 Jan, 2023Arti Ultimum Remedium sebagai Sanksi Pamungkas15 Jul, 2022Litigasi dan Alternatif Penyelesaian Sengketa di Luar Pengadilan10 Jul, 2020Bisakah Dipidana Karena Menembak Anjing yang Menyerang Orang Lain?22 Okt, 2012Terima kasih atas pertanyaan Anda.Artikel di bawah ini adalah pemutakhiran dari artikel dengan judulTentang\xa0Overmacht\xa0dan Hukum Pidana sebagai\xa0Ultimum Remediumyang dibuat olehAnandito Utomo, S.H.dan dipublikasikan pada 19 Agustus 2016.Belajar Hukum Secara Online dari Pengajar Berkompeten Dengan Biaya TerjangkauMulai DariRp. 149.000Lihat Semua KelasArtikel ini dibuat berdasarkan KUHP lama dan UU 1/2023 tentang KUHP yang diundangkan pada tanggal 2 Januari 2023.Seluruh informasi hukum yang ada di Klinik hukumonline.com disiapkan semata – mata untuk tujuan pendidikan dan bersifat umum (lihatPernyataan Penyangkalanselengkapnya). Untuk mendapatkan nasihat hukum spesifik terhadap kasus

In [18]:
def count_tokens(text):
    tokens = tokenizer(text, return_tensors = "pt", truncation = False).to('cuda')
    return len(tokens['input_ids'][0])

token_length = count_tokens(text)
print(f"Token length: {token_length}")

Token indices sequence length is longer than the specified maximum sequence length for this model (6648 > 512). Running this sequence through the model will result in indexing errors


Token length: 6648


### Clean Text

In [19]:
def clean_text(text):
    text = text.replace('\xa0', ' ').strip()
    text = text.replace('.', '. ')
    text = text.replace('(', ' (')
    text = text.replace(')', ') ')
    text = text.replace('[', ' [')
    text = text.replace(']', '] ')
    text = text.replace(',', ', ')
    text = text.replace(':', ': ')
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    #text = re.sub(r'([A-Za-z])([A-Z]{2,})', r'\1 \2', text)
    text = re.sub(r'(\d+)([A-Za-z])', r'\1 \2', text)
    text = re.sub(r'(\d{1,2})([A-Z][a-z]+),(\s*\d{4})', r'\1 \2 \3', text)
    text = re.sub(r'Mulai DariRp\..*', '', text)
    text = re.sub(r'\d+\.\d+', '', text)

    cleaned_text = re.sub(r'\s+', ' ', text).strip()
    
    return cleaned_text

cleaned_text = clean_text(text)
print(cleaned_text)

KLINIK TERKAITDaya Paksa dan Pembelaan Terpaksa sebagai Alasan Penghapus Pidana11 Jan, 2023 Arti Ultimum Remedium sebagai Sanksi Pamungkas15 Jul, 2022 Litigasi dan Alternatif Penyelesaian Sengketa di Luar Pengadilan10 Jul, 2020 Bisakah Dipidana Karena Menembak Anjing yang Menyerang Orang Lain?22 Okt, 2012 Terima kasih atas pertanyaan Anda. Artikel di bawah ini adalah pemutakhiran dari artikel dengan judul Tentang Overmacht dan Hukum Pidana sebagai Ultimum Remediumyang dibuat oleh Anandito Utomo, S. H. dan dipublikasikan pada 19 Agustus 2016. Belajar Hukum Secara Online dari Pengajar Berkompeten Dengan Biaya Terjangkau Mulai Dari Rp. 149. 000 Lihat Semua Kelas Artikel ini dibuat berdasarkan KUHP lama dan UU 1/2023 tentang KUHP yang diundangkan pada tanggal 2 Januari 2023. Seluruh informasi hukum yang ada di Klinik hukumonline. com disiapkan semata – mata untuk tujuan pendidikan dan bersifat umum (lihat Pernyataan Penyangkalanselengkapnya) . Untuk mendapatkan nasihat hukum spesifik terha

### Translate

In [20]:
text = cleaned_text

1st method

In [21]:
torch.cuda.empty_cache()

In [22]:
model_path = MODEL_PATH + '\\' + models[13]
model_path

'D:\\Python\\LLM_Environment\\models\\opus-mt-id-en'

In [23]:
translator = pipeline("translation", model = model_path, device = 0)

In [24]:
tokenizer = MarianTokenizer.from_pretrained(model_path)

def split_text_into_chunks(text, max_tokens = 400):
    tokens = tokenizer(text, return_tensors = 'pt', truncation = False).to('cuda')['input_ids'][0]
    chunks = [tokenizer.decode(tokens[i:i+max_tokens], skip_special_tokens = True) for i in range(0, len(tokens), max_tokens)]
    return chunks


In [25]:
def translate_large_text(text):
    chunks = split_text_into_chunks(text)
    translated_chunks = [translator(chunk)[0]['translation_text'] for chunk in chunks]
    return ' '.join(translated_chunks)

translated_text = translate_large_text(text)
print(translated_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (2943 > 512). Running this sequence through the model will result in indexing errors


All thanks to the question under this and under this question of the article about Overmah and the Law of the Final Final Resupation created by the Uhtium of the July 11 11 11 11 11 11 11 1, 2023 '23 '23, 2023 2023 '23 '23 '23 '2023 ' cause remove cause removes Ul2222 and removes '20 to remove the Peni under this, 2022 under this, 2022, 2022, 2022, 2022, 2022 of the cause, 2022 lition with an article on the finals with the title of Overstah, 20hhhmah and the final completion as the final completion of the final completion made by the Umah of the Utah of the Utah and  by the Utomo.h.h and the 19. and the 20. .. and the final law learn of the law of fire from the ground of the ground of the ground of the ground of the ground of the ground. God is All-mighty, All- all- powerful. They will come to you, but they will not be able to save themselves. It is not for a soul to be punished by a man, and it is no sin for a man to do what it is necessary (to do that), and if he had been forced to d

In [26]:
token_length = count_tokens(translated_text)
print(f"Token length: {token_length}")

Token length: 858


In [27]:
def split_text_into_chunks(text, max_tokens = 400):
    tokens = tokenizer(text, return_tensors = 'pt', truncation = False)['input_ids'][0]
    token_length = len(tokens)

    if token_length <= max_tokens:
        return [text]

    chunks = []
    for i in range(0, token_length, max_tokens):
        chunk_tokens = tokens[i:i+max_tokens]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens = True)
        chunks.append(chunk_text)
    
    return chunks

In [28]:
def translate_large_text(text):
    chunks = split_text_into_chunks(text)
    translated_chunks = [translator(chunk)[0]['translation_text'] for chunk in chunks]
    return ' '.join(translated_chunks)

translated_text = translate_large_text(text)
print(translated_text)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


All thanks to the question under this and under this question of the article about Overmah and the Law of the Final Final Resupation created by the Uhtium of the July 11 11 11 11 11 11 11 1, 2023 '23 '23, 2023 2023 '23 '23 '23 '2023 ' cause remove cause removes Ul2222 and removes '20 to remove the Peni under this, 2022 under this, 2022, 2022, 2022, 2022, 2022 of the cause, 2022 lition with an article on the finals with the title of Overstah, 20hhhmah and the final completion as the final completion of the final completion made by the Umah of the Utah of the Utah and  by the Utomo.h.h and the 19. and the 20. .. and the final law learn of the law of fire from the ground of the ground of the ground of the ground of the ground of the ground. God is All-mighty, All- all- powerful. They will come to you, but they will not be able to save themselves. It is not for a soul to be punished by a man, and it is no sin for a man to do what it is necessary (to do that), and if he had been forced to d

In [29]:
token_length = count_tokens(translated_text)
print(f"Token length: {token_length}")

Token length: 858


In [30]:
def translate_large_text(text):
    chunks = split_text_into_chunks(text)
    translated_chunks = [translator(chunk)[0]['translation_text'] for chunk in chunks]
    return ' '.join(translated_chunks)

translated_text = translate_large_text(text)
print(translated_text)

All thanks to the question under this and under this question of the article about Overmah and the Law of the Final Final Resupation created by the Uhtium of the July 11 11 11 11 11 11 11 1, 2023 '23 '23, 2023 2023 '23 '23 '23 '2023 ' cause remove cause removes Ul2222 and removes '20 to remove the Peni under this, 2022 under this, 2022, 2022, 2022, 2022, 2022 of the cause, 2022 lition with an article on the finals with the title of Overstah, 20hhhmah and the final completion as the final completion of the final completion made by the Umah of the Utah of the Utah and  by the Utomo.h.h and the 19. and the 20. .. and the final law learn of the law of fire from the ground of the ground of the ground of the ground of the ground of the ground. God is All-mighty, All- all- powerful. They will come to you, but they will not be able to save themselves. It is not for a soul to be punished by a man, and it is no sin for a man to do what it is necessary (to do that), and if he had been forced to d

In [31]:
token_length = count_tokens(translated_text)
print(f"Token length: {token_length}")

Token length: 858


2nd method

In [32]:
torch.cuda.empty_cache()

In [33]:
model_path = MODEL_PATH + '\\' + models[7]
model_path

'D:\\Python\\LLM_Environment\\models\\flan-t5-large'

In [34]:
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

translator = pipeline("translation", model=model, tokenizer=tokenizer, device=0)

In [35]:
def split_text_into_chunks(text, max_tokens=512):
    tokens = tokenizer(text, return_tensors='pt', truncation=False)['input_ids'][0]
    token_length = len(tokens)

    if token_length <= max_tokens:
        return [text]

    chunks = []
    for i in range(0, token_length, max_tokens):
        chunk_tokens = tokens[i:min(i + max_tokens, token_length)]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)

    return chunks

def translate_large_text(text):
    chunks = split_text_into_chunks(text)
    translated_chunks = [translator(chunk, max_length=512)[0]['translation_text'] for chunk in chunks]
    return ' '.join(translated_chunks)

translated_text = translate_large_text(text)
print(translated_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (6574 > 512). Running this sequence through the model will result in indexing errors
Your input_length: 513 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 514 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 513 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 513 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 514 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 514 is bigger than

KUHP yang diundangkan pad tanggal 2 Januari 2023. 62. kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada kekuatan yang tidak ada A tidak ada kemudian tidak ada kemudian tidak ada kemudian tidak ada kemudian tidak ada kemudian tidak ada kemudian tidak ada kemudian tidak ada kemudian tidak ada kemudian tidak ada kemudian tidak ada kemudian tidak ada kemud

In [36]:
token_length = count_tokens(translated_text)
print(f"Token length: {token_length}")

Token length: 3946


In [37]:
translator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0)

In [38]:
def split_text_into_chunks(text, max_tokens=512):
    tokens = tokenizer(text, return_tensors='pt', truncation=False)['input_ids'][0]
    token_length = len(tokens)

    if token_length <= max_tokens:
        return [text]

    chunks = []
    for i in range(0, token_length, max_tokens):
        chunk_tokens = tokens[i:min(i + max_tokens, token_length)]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)

    return chunks

def translate_large_text(text):
    chunks = split_text_into_chunks(text)
    formatted_chunks = [f"translate Indonesian to English: {chunk}" for chunk in chunks]
    translated_chunks = [translator(chunk, max_length=512)[0]['generated_text'] for chunk in formatted_chunks]
    return ' '.join(translated_chunks)


translated_text = translate_large_text(text)
print(translated_text)

KLINIK TERTAITDaya Paksa and Preparation for the Exam11 Jan, 2023 Arti Ultimum Remedium as Sanksi Pamungkas15 Jul, 2022 Literature and Alternatives for the Selection of Sengketa in the Luar of Selection10 Jul, 2020 Death of the Person Who Was Trying to Kill Others?22 Okt, 2012 Thank you for your question. Article below is a summary of the article with the jurisprudence of the Overmacht and the Law School as the Ultimum Remedium that was written by Anandito Utomo, S. H. and published on 19 August 2016. Online Law School from the Competition Competition with the Market from Rp. 149. 000 All the books are available in the Klinik hukumonline. com. To get the specific legal information about your case, consult with the Consultor Mitra Justika. uanovermachtduring the hukum pidana, so we pay attention to the most common overmacht. Alfitradalam bulunan Hapusnya Hak Menuntut dan Menjalankan Pidana (hal. 63) explains that in Memorie van Toelichting (“Wv T”), daya paksa (overmacht) dijelaskan seb

In [39]:
token_length = count_tokens(translated_text)
print(f"Token length: {token_length}")

Token length: 4912
