In [1]:
import pandas as pd

df = pd.read_csv("data/tiktok_comments_final.csv")
emoji_translator_df = pd.read_csv("data/emoji_translator.csv")

df = df[df['text'].str.replace(r'[^\w\s]', '', regex=True).str.strip() != '']
df.reset_index(drop=True, inplace=True)

emoji_dict = dict(zip(emoji_translator_df["emoji"], emoji_translator_df["aliases_all"]))
def translate_emojis(text):
    for emoji, aliases in emoji_dict.items():
        text = text.replace(emoji, f" ({aliases})")
    return text

df.dropna(subset=["text"], inplace=True)
df["text"] = df["text"].apply(translate_emojis)
df

Unnamed: 0,text,createTime
0,gibran:,1764933105
1,ini emg harusnya ganti rakyat sih. bgnian emg ...,1764928295
2,wapres aja gak pakek rompi anti peluru,1764922201
3,di indo doang presiden n wapres di hujat wkwk,1764925978
4,"di X parah bett anjir komenannya, merinding la...",1764922173
...,...,...
4173,dy emg bnr2 idola emak2 indonesia (bahagia)\r\...,1764986048
4175,sip saya catat,1764985783
4176,"serba salah banget, keliatan dikata pencitraan...",1764985734
4177,"mas wapres panen pahala dari para hatter,\r\ns...",1764985669


In [2]:
labeled_df = pd.read_csv("data/pre-trained-data.csv")
labeled_df = labeled_df.dropna(subset=['text', 'sentiment'])
df = df.dropna(subset=['text'])

labeled_data = [{"content": str(text), "label": int(label)} for text, label in zip(labeled_df["text"], labeled_df["sentiment"])]
unlabeled_data = [{"content": str(text)} for text in df["text"]]

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# 1. Model pretrained IndoBERT
MODEL_NAME = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

# 2. Convert ke Dataset HuggingFace
labeled_ds = Dataset.from_list(labeled_data)
unlabeled_ds = Dataset.from_list(unlabeled_data)

# 3. Tokenization
def tokenize(batch):
    return tokenizer(batch["content"], truncation=True, padding="max_length", max_length=128)

labeled_ds = labeled_ds.map(tokenize, batched=True)
unlabeled_ds = unlabeled_ds.map(tokenize, batched=True)

# 4. Fine-tuning awal
training_args = TrainingArguments(
    output_dir="./train-checkpoint",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=labeled_ds,
    tokenizer=tokenizer,
)

trainer.train(resume_from_checkpoint="./train-checkpoint/checkpoint-189")
trainer.save_model("./indobert-finetuned")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 993/993 [00:00<00:00, 1679.93 examples/s]
Map: 100%|██████████| 4145/4145 [00:01<00:00, 3176.90 examples/s]
  trainer = Trainer(


Step,Training Loss


In [4]:
preds = trainer.predict(unlabeled_ds)



In [5]:
import torch
import torch.nn.functional as F

probs = F.softmax(torch.tensor(preds.predictions), dim=1) 
weights = torch.tensor([-1.0, 0.0, 1.0]) 
sentiment_scores = torch.matmul(probs, weights)

# labeling
pred_indices = torch.argmax(probs, dim=1)
label_map = {0: "negative", 1: "neutral", 2: "positive"}

df["sentiment_score"] = sentiment_scores.numpy()
df["sentiment_label"] = [label_map[i.item()] for i in pred_indices]
df.to_csv("data/tiktok_comments_with_sentiment.csv", index=False)

In [6]:
df["sentiment_label"].value_counts()

sentiment_label
positive    2258
neutral     1304
negative     583
Name: count, dtype: int64