# Preprocessing

In [11]:
from pathlib import Path

import pandas as pd

# Directory holding the per-category CSV files. Change this one value to run
# the notebook against a different location.
DATA_DIR = Path("E:/Coding/persona-ai/datasets")

# Category labels, in the order the frames are concatenated. Each category is
# read from DATA_DIR / "<lowercased label>_dataset.csv".
CATEGORIES = ["Guts", "Health", "Kindness", "Knowledge", "Proficiency", "Charm"]


def load_category_frame(category):
    """Read one category's CSV and tag every row with the category label.

    Args:
        category: Category label such as "Guts". The file that is read is
            DATA_DIR / "<category lowercased>_dataset.csv".

    Returns:
        A DataFrame with the CSV's columns plus a "category" column that
        holds ``category`` on every row.
    """
    frame = pd.read_csv(DATA_DIR / f"{category.lower()}_dataset.csv")
    frame["category"] = category
    return frame


# Load every dataset once, keyed by its category label.
category_frames = {category: load_category_frame(category) for category in CATEGORIES}

# Keep the per-category names available for any cell that refers to them.
guts_df = category_frames["Guts"]
health_df = category_frames["Health"]
kindness_df = category_frames["Kindness"]
knowledge_df = category_frames["Knowledge"]
proficiency_df = category_frames["Proficiency"]
charm_df = category_frames["Charm"]

# Combine all datasets into a single frame with a fresh 0..n-1 index.
# NOTE(review): the preview below shows an "Unnamed: 0" column and an existing
# "kategori" column that matches "category" in the displayed rows -- confirm
# whether the CSVs carry a saved index and whether "category" is redundant.
all_data = pd.concat(list(category_frames.values()), ignore_index=True)

# Preview the first five rows.
all_data.head()

Unnamed: 0,text_aktivitas,kategori,tingkat_aktivitas,category
0,Saya keluar dari zona nyaman saya dan meningga...,Guts,Menengah,Guts
1,Saya berani mengubah hidup saya dengan berpind...,Guts,Berat,Guts
2,Saya mengambil risiko besar dalam hidup saya k...,Guts,Ringan,Guts
3,Saya percaya bahwa langkah besar saya akan mem...,Guts,Ringan,Guts
4,Saya melangkah maju meskipun saya merasa takut...,Guts,Menengah,Guts


## Encode Category Labels

In [12]:
from sklearn.preprocessing import LabelEncoder

# Learn the category -> integer id mapping first, then apply it. The fitted
# encoder is kept so its classes_ can be read by later cells.
label_encoder = LabelEncoder().fit(all_data["category"])
all_data["label"] = label_encoder.transform(all_data["category"])

## Split Data

In [13]:
from sklearn.model_selection import train_test_split

# Plain Python lists of the activity texts and their integer labels.
all_texts = all_data["text_aktivitas"].tolist()
all_labels = all_data["label"].tolist()

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=all_data["label"],
)

# Tokenize with BERT

In [14]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Same settings for both splits: cut sequences at 128 tokens and pad the rest.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)

train_encodings = tokenizer(train_texts, **tokenize_kwargs)
test_encodings = tokenizer(test_texts, **tokenize_kwargs)

# Train Model

In [15]:
import tensorflow as tf

def convert_to_tf_dataset(encodings, labels):
    """Pair tokenizer output with labels as a ``tf.data.Dataset``.

    Args:
        encodings: Object indexable by "input_ids" and "attention_mask";
            only those two entries are read.
        labels: The labels to slice alongside the encodings.

    Returns:
        An unbatched dataset built with ``from_tensor_slices`` whose elements
        are ``(features, label)`` pairs, where ``features`` is a dict with
        the keys "input_ids" and "attention_mask".
    """
    features = {key: encodings[key] for key in ("input_ids", "attention_mask")}
    return tf.data.Dataset.from_tensor_slices((features, labels))

# Examples per batch, shared by the training and evaluation pipelines.
BATCH_SIZE = 16

# Size the shuffle buffer to the whole training set. A buffer smaller than the
# dataset only shuffles within a sliding window, so a fixed 1000 would give a
# partial shuffle once the training set grows past that. max(..., 1) keeps the
# buffer size valid even for an empty split.
train_dataset = (
    convert_to_tf_dataset(train_encodings, train_labels)
    .shuffle(buffer_size=max(len(train_labels), 1))
    .batch(BATCH_SIZE)
)

# The evaluation split is batched without shuffling.
test_dataset = convert_to_tf_dataset(test_encodings, test_labels).batch(BATCH_SIZE)

In [16]:
from transformers import TFBertForSequenceClassification

# One output class per category known to the fitted label encoder.
num_labels = len(label_encoder.classes_)

# Load the pretrained checkpoint with a classification head of that size.
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# from_logits=True: the loss treats the model outputs as unnormalized logits.
# The "sparse" loss and metric take integer class ids rather than one-hot rows.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.metrics.SparseCategoricalAccuracy("accuracy")

# Adam with the learning rate used for this fine-tuning run.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

In [10]:
# Training settings: validate on the held-out split after each epoch.
# The epoch count can be adjusted.
fit_kwargs = {"validation_data": test_dataset, "epochs": 5}

history = model.fit(train_dataset, **fit_kwargs)

Epoch 1/5

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
# Write the model and the tokenizer into the same directory.
save_dir = "./persona_classifier"

model.save_pretrained(save_dir)
# Kept as the last expression so the cell displays the tokenizer files written.
tokenizer.save_pretrained(save_dir)

('./persona_classifier\\tokenizer_config.json',
 './persona_classifier\\special_tokens_map.json',
 './persona_classifier\\vocab.txt',
 './persona_classifier\\added_tokens.json')