In [1]:
import pandas as pd
import glob

# Path where your CSV files are stored
path = "NLP Dataset/"

# Get all CSV files in the folder
all_files = glob.glob(path + "/*.csv")

# Read and concatenate all CSV files
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

# Save to a single CSV
df.to_csv("merged Dataset.csv", index=False)

In [2]:
df.drop(columns=["Pelajaran", "tipe_kesalahan"])

Unnamed: 0,kalimat_awal,kalimat_salah
0,Setelah mempelajari bab ini siswa harus mampu ...,Setelah mempelajari bab ini siswa harus mampu ...
1,Guru membacakan pengantar lalu mengumpulkan ja...,Gurun membacakan pengantar lalu mengumpulkan j...
2,Guru memberi contoh menyanyikan lagu di atas d...,Guru memberi contoh menyanyikan lagu di atas d...
3,Selanjutnya guru meminta siswa menirukan dan m...,Selanjutnya guru meminta siswa menirukan dana ...
4,Guru membacakan permintaan dan memberi contoh.,Guru membacakan permintaan dana memberi contoh.
...,...,...
322495,Mereka menduduki beberapa hotel di medan.,Mereka menduduki beberapa hootel di medan.
322496,Pasukan inggris bertugas untuk membebaskan ten...,Pasukan inggris bertugas untuk membebaskan ten...
322497,"Para tawanan dari daerah rantau prapat, pemata...","Para tawanan dari daerah rantau prapat, pemata..."
322498,Ternyata kelompok tawanan itu dibentuk menjadi...,Ternyata kelompok tawanan itu dibentuk menjadi...


In [3]:
import pandas as pd
import random

# Example: load your dataframe
# df = pd.read_csv("your_dataset.csv")

# Function to inject artificial misspacing
def inject_misspace(sentence, prob=0.3):
    words = sentence.split()
    new_words = []
    
    i = 0
    while i < len(words):
        if i < len(words) - 1 and random.random() < prob:
            # Merge current word with the next one
            new_words.append(words[i] + words[i+1])
            i += 2
        else:
            new_words.append(words[i])
            i += 1

    # Random split inside a word (less frequent)
    for j in range(len(new_words)):
        if len(new_words[j]) > 4 and random.random() < 0.1:
            pos = random.randint(1, len(new_words[j]) - 2)
            new_words[j] = new_words[j][:pos] + " " + new_words[j][pos:]
    
    return " ".join(new_words)

# Create augmented data
augmented = df["kalimat_salah"].apply(inject_misspace)

# Append to dataframe with label for clean vs augmented
df_aug = df.copy()
df_aug["kalimat_salah"] = augmented
df_aug["augmented"] = True

df["augmented"] = False

# Combine original + augmented
df_final = pd.concat([df, df_aug], ignore_index=True)

# Example: save
# df_final.to_csv("dataset_with_misspaces.csv", index=False)

print(df_final.head(10))

                                        kalimat_awal  \
0  Setelah mempelajari bab ini siswa harus mampu ...   
1  Guru membacakan pengantar lalu mengumpulkan ja...   
2  Guru memberi contoh menyanyikan lagu di atas d...   
3  Selanjutnya guru meminta siswa menirukan dan m...   
4     Guru membacakan permintaan dan memberi contoh.   
5  Lalu guru menuntun siswa menuliskan nomor pili...   
6  Siswa menebak warna, rasa, dan bau tanpa melih...   
7  Guru menjelaskan hal-hal yang boleh dilakukan ...   
8  Misalnya meraba, mencium, atau mencicipi (maka...   
9  Setelah kegiatan guru mengajukan berbagai pert...   

                                       kalimat_salah Pelajaran  \
0  Setelah mempelajari bab ini siswa harus mampu ...       IPA   
1  Gurun membacakan pengantar lalu mengumpulkan j...       IPA   
2  Guru memberi contoh menyanyikan lagu di atas d...       IPA   
3  Selanjutnya guru meminta siswa menirukan dana ...       IPA   
4    Guru membacakan permintaan dana memberi contoh. 

In [4]:
df_final = df_final.drop(columns=["Pelajaran", "tipe_kesalahan", "augmented"])

In [5]:
df_final.to_csv("df_final.csv", index=False)

In [6]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    EncoderDecoderModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

# =========================
# 1. Load and Prepare Data
# =========================
# CSV must have "kalimat_salah" and "kalimat_benar"
df = pd.read_csv("df_final.csv")
df = df.rename(columns={"kalimat_awal": "kalimat_benar"})

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# ---- Split into 70/20/10 ----
dataset_split = dataset.train_test_split(test_size=0.3, seed=42)
temp_split = dataset_split["test"].train_test_split(test_size=1/3, seed=42)

dataset_final = {
    "train": dataset_split["train"],
    "validation": temp_split["train"],
    "test": temp_split["test"]
}

print(dataset_final)

# =========================
# 2. Tokenizer and Model
# =========================
model_name = "indobenchmark/indobert-base-p1"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encoder-decoder setup (BERT2BERT)
model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)

# ---- IMPORTANT: set special tokens ----
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

# =========================
# 3. Preprocess Function
# =========================
max_length = 64

def preprocess(batch):
    inputs = tokenizer(
        batch["kalimat_salah"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    outputs = tokenizer(
        batch["kalimat_benar"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    inputs["labels"] = outputs["input_ids"]
    return inputs

tokenized_dataset = {}
for split in ["train", "validation", "test"]:
    tokenized_dataset[split] = dataset_final[split].map(
        preprocess,
        batched=True,
        remove_columns=list(df.columns)
    )

# =========================
# 4. Training Setup
# =========================
batch_size = 16

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

try:
    training_args = Seq2SeqTrainingArguments(
        output_dir="./indoBERT-corrector",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=3,
        predict_with_generate=True,
        logging_dir="./logs",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        remove_unused_columns=True
    )
except TypeError:
    # fallback for older transformers versions
    training_args = Seq2SeqTrainingArguments(output_dir="./indoBERT-corrector")
    training_args.evaluation_strategy = "epoch"
    training_args.save_strategy = "epoch"
    training_args.learning_rate = 5e-5
    training_args.per_device_train_batch_size = batch_size
    training_args.per_device_eval_batch_size = batch_size
    training_args.weight_decay = 0.01
    training_args.save_total_limit = 2
    training_args.num_train_epochs = 3
    training_args.predict_with_generate = True
    training_args.logging_dir = "./logs"
    training_args.load_best_model_at_end = True
    training_args.metric_for_best_model = "eval_loss"
    training_args.greater_is_better = False
    training_args.remove_unused_columns = True

# =========================
# 5. Trainer
# =========================
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# =========================
# 6. Train Model
# =========================
trainer.train()

# =========================
# 7. Evaluate on Test Set
# =========================
results = trainer.evaluate(tokenized_dataset["test"])
print("Test results:", results)

# =========================
# 8. Save Final Best Model
# =========================
trainer.save_model("./indoBERT-best-corrector")
tokenizer.save_pretrained("./indoBERT-best-corrector")


{'train': Dataset({
    features: ['kalimat_benar', 'kalimat_salah'],
    num_rows: 451500
}), 'validation': Dataset({
    features: ['kalimat_benar', 'kalimat_salah'],
    num_rows: 129000
}), 'test': Dataset({
    features: ['kalimat_benar', 'kalimat_salah'],
    num_rows: 64500
})}


Some weights of BertLMHeadModel were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.b

Map:   0%|          | 0/451500 [00:00<?, ? examples/s]

Map:   0%|          | 0/129000 [00:00<?, ? examples/s]

Map:   0%|          | 0/64500 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
500,1.0626
1000,0.4387
1500,0.2923
2000,0.2347
2500,0.1849
3000,0.1653
3500,0.1464
4000,0.1277
4500,0.1178
5000,0.1133




Test results: {'eval_loss': 0.006391606293618679, 'eval_runtime': 231.3523, 'eval_samples_per_second': 278.796, 'eval_steps_per_second': 17.428, 'epoch': 3.0}


('./indoBERT-best-corrector/tokenizer_config.json',
 './indoBERT-best-corrector/special_tokens_map.json',
 './indoBERT-best-corrector/vocab.txt',
 './indoBERT-best-corrector/added_tokens.json',
 './indoBERT-best-corrector/tokenizer.json')