In [1]:
import sys
import numpy as np
import pandas as pd
import os
sys.path.append('..')
from model.bert.bert_textdataset import BertTextDataset
from model.bert.bert_classfier import BertFCClassifier
from model.bert.bert_embedder import BertEmbedder
from model.bert.bert_evaluator import BertFCEvaluator
from model.bert.bert_trainer import BertFCTrainer
from nlp_pipeline.preprocess_text import TextPreprocessor
from sklearn.model_selection import train_test_split
from nlp_pipeline.back_translator import BackTranslationAugmentor
from nlp_pipeline.feature_extraction import FeatureExtractor
from nlp_pipeline.embedding_oversample import EmbeddingOversampler

In [2]:
df = df = pd.read_csv('../data/feature/combined_augmented_asd.csv', encoding='latin1')
text_preprocessor = TextPreprocessor()
df['clean_text'] = df['transcription'].apply(text_preprocessor.preprocess)
df[['transcription', 'clean_text']].head()

Unnamed: 0,transcription,clean_text
0,Tunjuk kaka coba,tunjuk kaka coba
1,Inii!,ini
2,Siapa namamu?,siapa nama
3,Iyaaaa?,
4,Kenalan duluu!,kenal duluu


In [3]:
extractor = FeatureExtractor()
y = (df['label'].str.upper().str.strip() == 'ASD').astype(int).values
X = extractor.encode_series_bert(df['clean_text'][:len(df['label'])])
unique, counts = np.unique(y, return_counts=True)
print(dict(zip(unique, counts)))

ling_cols = extractor.linguistic_cols
X_ling = df[ling_cols].values[:len(y)]

oversampler = EmbeddingOversampler()
X_embed_bal, X_ling_bal, y_bal = oversampler.oversample(X, X_ling, y)

print(X_embed_bal.shape, y_bal.shape)

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

{0: 4648, 1: 3732}
Before Oversampling: Counter({0: 4648, 1: 3732})
After Oversampling : Counter({0: 4648, 1: 4648})
(9296, 768) (9296,)


In [5]:
X_train, X_val, y_train, y_val = train_test_split(X_embed_bal, y_bal, test_size=0.2, stratify=y_bal, random_state=42)
model = BertFCClassifier(input_dim=768, hidden_dim=128, dropout=0.3)

trainer = BertFCTrainer(
    model=model,
    lr=2e-5,
    batch_size=16,
    epochs=20,
    patience=3
)
print (X_train.shape, y_train.shape)

(7436, 768) (7436,)


In [6]:
trainer.train(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val
)
trainer.evaluate(
    X_test=X_val,
    y_test=y_val
)

sudah di update
Epoch 1/20, Train Loss: 281.8739, Val Loss: 0.5478
Epoch 2/20, Train Loss: 238.3306, Val Loss: 0.4957
Epoch 3/20, Train Loss: 217.6688, Val Loss: 0.4651
Epoch 4/20, Train Loss: 207.1474, Val Loss: 0.4477
Epoch 5/20, Train Loss: 200.2890, Val Loss: 0.4358
Epoch 6/20, Train Loss: 193.7885, Val Loss: 0.4257
Epoch 7/20, Train Loss: 189.9683, Val Loss: 0.4190
Epoch 8/20, Train Loss: 186.8127, Val Loss: 0.4142
Epoch 9/20, Train Loss: 183.3509, Val Loss: 0.4093
Epoch 10/20, Train Loss: 180.3725, Val Loss: 0.4053
Epoch 11/20, Train Loss: 178.9800, Val Loss: 0.4031
Epoch 12/20, Train Loss: 176.8720, Val Loss: 0.3978
Epoch 13/20, Train Loss: 175.4824, Val Loss: 0.3958
Epoch 14/20, Train Loss: 173.2166, Val Loss: 0.3916
Epoch 15/20, Train Loss: 171.7455, Val Loss: 0.3877
Epoch 16/20, Train Loss: 169.7259, Val Loss: 0.3863
Epoch 17/20, Train Loss: 168.4914, Val Loss: 0.3827
Epoch 18/20, Train Loss: 167.2040, Val Loss: 0.3820
Epoch 19/20, Train Loss: 165.9503, Val Loss: 0.3783
Epoch