In [None]:
import pandas as pd
import glob

# Folder that holds the individual CSV files to merge.
path = "NLP Dataset/"

# Collect every CSV in the folder.
# - Trailing separators are stripped so the pattern is "NLP Dataset/*.csv"
#   rather than "NLP Dataset//*.csv".
# - glob returns matches in arbitrary order; sorting makes the row order of the
#   merged file the same on every run and every machine.
all_files = sorted(glob.glob(path.rstrip("/\\") + "/*.csv"))

# Fail with an explicit message instead of pandas' "No objects to concatenate".
if not all_files:
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Read each file and stack them into one frame with a fresh 0..n-1 index.
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

# Write the merged data to a single CSV (the index is not written).
df.to_csv("merged Dataset.csv", index=False)

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
     ---------------------------------------- 0.0/42.2 kB ? eta -:--:--
     --------------------------- ---------- 30.7/42.2 kB 445.2 kB/s eta 0:00:01
     -------------------------------------- 42.2/42.2 kB 409.4 kB/s eta 0:00:00
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metad


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\ZerX\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
# Show the frame without the "Pelajaran" and "tipe_kesalahan" columns.
# The result is only displayed as the cell output; `df` itself is not modified.
df.drop(labels=["Pelajaran", "tipe_kesalahan"], axis="columns")

In [None]:
import pandas as pd
import random

# Example: load your dataframe
# df = pd.read_csv("your_dataset.csv")

# Function to inject artificial misspacing
def inject_misspace(sentence, prob=0.3, split_prob=0.1, rng=None):
    """Return ``sentence`` with artificial spacing errors injected.

    Two kinds of noise are applied, in this order:

    1. Merge: walking left to right, every word that still has a successor is
       glued to that successor with probability ``prob``. The merged pair is
       then skipped, so a word takes part in at most one merge.
    2. Split: every resulting token longer than 4 characters gets one space
       inserted at a random inner position with probability ``split_prob``.

    Args:
        sentence: Text to corrupt. It is tokenised with ``str.split()``, so
            runs of whitespace collapse to single spaces in the result.
        prob: Per-word probability of merging with the following word.
        split_prob: Per-token probability of inserting a space inside it.
        rng: Optional object exposing ``random()`` and ``randint(a, b)``
            (e.g. ``random.Random(seed)``) for reproducible noise. When
            omitted, the module-level ``random`` generator is used.

    Returns:
        The corrupted sentence. Non-string input (e.g. a NaN cell coming from
        a DataFrame) is returned unchanged instead of raising.
    """
    # Missing values show up as float NaN / None and have no .split().
    if not isinstance(sentence, str):
        return sentence

    rand = random if rng is None else rng
    words = sentence.split()
    noisy = []

    idx = 0
    last = len(words) - 1
    while idx < len(words):
        # Only draw a random number when there is a next word to merge with.
        if idx < last and rand.random() < prob:
            noisy.append(words[idx] + words[idx + 1])
            idx += 2
        else:
            noisy.append(words[idx])
            idx += 1

    # Random split inside a token (less frequent than merging by default).
    for j, token in enumerate(noisy):
        if len(token) > 4 and rand.random() < split_prob:
            # pos is in [1, len - 2]: the left piece keeps at least one
            # character and the right piece at least two.
            pos = rand.randint(1, len(token) - 2)
            noisy[j] = token[:pos] + " " + token[pos:]

    return " ".join(noisy)

# Seed the module-level generator so the augmented sentences are identical on
# every run (inject_misspace draws from `random` by default).
random.seed(42)

# Create augmented data: a noisy variant of every sentence in "kalimat_salah".
augmented = df["kalimat_salah"].apply(inject_misspace)

# Copy of the data whose input sentence is the noisy variant, flagged as
# augmented so the two sources can be told apart after concatenation.
df_aug = df.copy()
df_aug["kalimat_salah"] = augmented
df_aug["augmented"] = True

# Flag the untouched rows.
df["augmented"] = False

# Combine original + augmented rows under a fresh 0..n-1 index.
df_final = pd.concat([df, df_aug], ignore_index=True)

# Example: save
# df_final.to_csv("dataset_with_misspaces.csv", index=False)

# Last expression of the cell -> rendered as a table by the notebook.
df_final.head(10)

In [None]:
# Show only the remaining columns of the combined frame.
# The result is displayed but not assigned, so `df_final` keeps every column.
df_final.drop(labels=["Pelajaran", "tipe_kesalahan", "augmented"], axis="columns")

In [None]:
# Persist the combined original + augmented rows; the index is not written.
df_final.to_csv(path_or_buf="df_final.csv", index=False)

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    EncoderDecoderModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

# =========================
# 1. Load and Prepare Data
# =========================
# CSV must have "kalimat_salah" and "kalimat_benar"
# Read the file written by the augmentation step earlier in this notebook.
# The previous name, "dataset_with_missspaces.csv", is not produced by any cell.
df = pd.read_csv("df_final.csv")

# Fail early, with the offending names, if the expected columns are absent.
required_columns = ["kalimat_salah", "kalimat_benar"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Missing required column(s): {missing_columns}")

# Rows with an empty source or target sentence cannot be tokenized, so drop
# them and rebuild a clean 0..n-1 index.
df = df.dropna(subset=required_columns).reset_index(drop=True)

# Convert to Hugging Face Dataset (the pandas index is not kept as a column).
dataset = Dataset.from_pandas(df, preserve_index=False)

# ---- Split into 70/20/10 ----
# First hold out 30 %, then split that hold-out 2:1 into validation and test.
dataset_split = dataset.train_test_split(test_size=0.3, seed=42)
temp_split = dataset_split["test"].train_test_split(test_size=1/3, seed=42)

dataset_final = {
    "train": dataset_split["train"],
    "validation": temp_split["train"],
    "test": temp_split["test"]
}

print(dataset_final)

# =========================
# 2. Tokenizer and Model
# =========================
model_name = "indobenchmark/indobert-base-p1"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encoder-decoder setup: the same checkpoint initialises both halves.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)

# An EncoderDecoderModel built from two encoder-only checkpoints does not know
# which token starts decoding or which one is padding. Both are needed to
# derive decoder inputs from the labels during training, so set them
# explicitly from the tokenizer's special tokens.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.sep_token_id

# Mirror the same ids on the generation config (when the model has one) so
# generate() uses the same special tokens.
if getattr(model, "generation_config", None) is not None:
    model.generation_config.decoder_start_token_id = tokenizer.cls_token_id
    model.generation_config.pad_token_id = tokenizer.pad_token_id
    model.generation_config.eos_token_id = tokenizer.sep_token_id

# =========================
# 3. Preprocess Function
# =========================
max_length = 64

def preprocess(batch):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        batch: Mapping with the keys "kalimat_salah" (model input) and
            "kalimat_benar" (target), each holding a list of strings.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry that
        holds the target token ids. Padding positions in the labels are set to
        -100 so they are ignored by the loss.
    """
    inputs = tokenizer(
        batch["kalimat_salah"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    targets = tokenizer(
        batch["kalimat_benar"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Targets are padded to max_length; without masking, the model would be
    # trained to predict the padding as well. -100 is the label value the
    # cross-entropy loss skips.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in targets["input_ids"]
    ]
    return inputs

# Tokenize every split, keeping only the columns produced by `preprocess`.
tokenized_dataset = {}
for split in ["train", "validation", "test"]:
    split_dataset = dataset_final[split]
    tokenized_dataset[split] = split_dataset.map(
        preprocess,
        batched=True,
        # Use the split's own column list (a list of str) rather than the
        # pandas Index of the source frame, so every raw column actually
        # present in the dataset is removed.
        remove_columns=split_dataset.column_names,
    )

# =========================
# 4. Training Setup
# =========================
batch_size = 16

# Collator that assembles the tokenized examples into batches for the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./indoBERT-corrector",
    # `evaluation_strategy` was renamed to `eval_strategy`; recent transformers
    # releases (including the 4.56.x installed for this notebook) only accept
    # the new name.
    eval_strategy="epoch",
    save_strategy="epoch",       # must match eval_strategy for load_best_model_at_end
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=2,          # keep at most 2 checkpoints on disk
    num_train_epochs=5,
    predict_with_generate=True,
    logging_dir="./logs",
    load_best_model_at_end=True, # reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",
    greater_is_better=False      # lower eval_loss is better
)

# =========================
# 5. Trainer
# =========================
# Wire model, arguments, data and collator together. Validation data is used
# for the per-epoch evaluation configured in `training_args`.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated in current transformers releases;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
)

# =========================
# 6. Train Model
# =========================
# Run the full training loop configured above.
trainer.train()

# =========================
# 7. Evaluate on Test Set
# =========================
# Score the held-out test split with the trained model.
results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print("Test results:", results)

# =========================
# 8. Save Final Best Model
# =========================
# Write model and tokenizer to the same folder so they can be loaded together.
trainer.save_model(output_dir="./indoBERT-best-corrector")
tokenizer.save_pretrained(save_directory="./indoBERT-best-corrector")

KeyboardInterrupt: 