In [None]:
import pandas as pd

df = pd.read_csv("data/tiktok_comments_final.csv")
df

In [None]:
labeled_df = pd.read_csv("data/pre-trained-data.csv")
labeled_df = labeled_df.dropna(subset=['text', 'sentiment'])
df = df.dropna(subset=['text'])

labeled_data = [{"content": str(text), "label": int(label)} for text, label in zip(labeled_df["text"], labeled_df["sentiment"])]
unlabeled_data = [{"content": str(text)} for text in df["text"]]

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# 1. Model pretrained IndoBERT
MODEL_NAME = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

# 2. Convert ke Dataset HuggingFace
labeled_ds = Dataset.from_list(labeled_data)
unlabeled_ds = Dataset.from_list(unlabeled_data)

# 3. Tokenization
def tokenize(batch):
    return tokenizer(batch["content"], truncation=True, padding="max_length", max_length=128)

labeled_ds = labeled_ds.map(tokenize, batched=True)
unlabeled_ds = unlabeled_ds.map(tokenize, batched=True)

# 4. Fine-tuning awal
training_args = TrainingArguments(
    output_dir="./train-checkpoint",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=labeled_ds,
    tokenizer=tokenizer,
)

trainer.train(resume_from_checkpoint="./train-checkpoint/checkpoint-189")
trainer.save_model("./indobert-finetuned")

In [9]:
preds = trainer.predict(unlabeled_ds)

In [12]:
import torch
import torch.nn.functional as F

probs = F.softmax(torch.tensor(preds.predictions), dim=1) 
weights = torch.tensor([-1.0, 0.0, 1.0]) 
sentiment_scores = torch.matmul(probs, weights)

# labeling
pred_indices = torch.argmax(probs, dim=1)
label_map = {0: "negative", 1: "neutral", 2: "positive"}

df["sentiment_score"] = sentiment_scores.numpy()
df["sentiment_label"] = [label_map[i.item()] for i in pred_indices]
df.to_csv("data/tiktok_comments_with_sentiment.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sentiment_score"] = sentiment_scores.numpy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sentiment_label"] = [label_map[i.item()] for i in pred_indices]


In [13]:
df["sentiment_label"].value_counts()

sentiment_label
positive    2380
neutral     1331
negative     658
Name: count, dtype: int64