In [None]:
import ast

import pandas as pd
import torch
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
from transformers import get_linear_schedule_with_warmup

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with a one-hot label.

    Args:
        texts: Iterable of raw texts (a pandas Series or a plain list). Items
            are addressed by position, so the index of a filtered or shuffled
            Series does not matter.
        labels: Iterable of class labels, same length as ``texts``.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returning a mapping with
            ``input_ids`` and ``attention_mask`` (batch dimension of 1).
        max_length: Length every sequence is padded/truncated to.

    Attributes:
        labels: ``(n_samples, n_classes)`` float array of one-hot rows.
        classes_: Class values in column order of ``labels`` (sorted).

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length or a label is
            missing.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # A plain list makes __getitem__ positional; indexing a Series with
        # `series[i]` is label-based and breaks on a non-default index.
        self.texts = list(texts)
        self.tokenizer = tokenizer
        self.max_length = max_length

        label_series = pd.Series(list(labels))
        if len(label_series) != len(self.texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"({len(self.texts)} != {len(label_series)})"
            )
        if label_series.isna().any():
            raise ValueError("labels contain missing values")

        # One-hot encode with pandas: columns come out in sorted class order
        # and the call does not depend on scikit-learn's `sparse` /
        # `sparse_output` keyword, which differs between releases.
        one_hot = pd.get_dummies(label_series)
        self.classes_ = list(one_hot.columns)
        self.labels = one_hot.to_numpy(dtype="float64")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and the one-hot ``labels`` for one sample."""
        encoding = self.tokenizer(
            str(self.texts[index]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Drop only the leading batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[index], dtype=torch.float32),
        }


In [None]:
# Read the labelled tweet export from the mounted Google Drive folder.
csv_path = "/content/drive/MyDrive/data_polarity (1).csv"
df = pd.read_csv(csv_path)

# Bare trailing expression so the notebook renders a preview of the frame.
df

Unnamed: 0,created_at,id,reply_count,retweet_count,favorite_count,username,clean_text,preprocessed_text,polarity_score,polarity
0,Mon May 15 11:22:33 +0000 2023,1658070357826490112,0,0,1,jalan_yuuk,saat kota yang lain sibuk berlomba agar wisata...,"['kota', 'sibuk', 'berlomba', 'wisatawan', 'ko...",-1,negative
1,Mon May 15 11:43:48 +0000 2023,1658075703882170112,0,0,0,zeronol0,tankid laksono lol tolol lu pikir keren gitu ...,"['tankid', 'laksono', 'lol', 'tolol', 'lu', 'p...",-26,urgent
2,Mon May 15 11:07:11 +0000 2023,1658066490258769920,0,0,0,PA_5060_AS,di merauke minyak tanah susah selain jalan jug...,"['merauke', 'minyak', 'tanah', 'susah', 'jalan...",-14,urgent
3,Mon May 15 11:00:55 +0000 2023,1658064915532480000,0,0,1,bagrenresppu,polres ppu polda kaltim wujud kepedulian pols...,"['polres', 'ppu', 'polda', 'kaltim', 'wujud', ...",4,positive
4,Mon May 15 11:15:36 +0000 2023,1658068608453410048,0,0,0,raksulonline,traffic light rusak jalan borong rayabatua ray...,"['traffic', 'light', 'rusak', 'jalan', 'borong...",-5,negative
...,...,...,...,...,...,...,...,...,...,...
3617,Sun Nov 27 23:37:59 +0000 2022,1597011888050959872,0,0,1,AgoiTaufik,hahahahahebat itulah kalau cebong punya hajata...,"['hahahahahebat', 'cebong', 'hajatan', 'ngepra...",-6,negative
3618,Mon Nov 28 00:59:41 +0000 2022,1597032446046109952,0,0,0,RagaSukma_1,yah tweet polri tni cari pesertanya itu d...,"['yah', 'tweet', 'polri', 'tni', 'cari', 'pese...",-21,urgent
3619,Sun Nov 27 06:16:00 +0000 2022,1596749661821280000,131,255,1012,txtdrjkt,yaelah wargaðÿ˜¢ðÿ˜¢,"['yaelah', 'wargaðÿ˜¢ðÿ˜¢']",0,neutral
3620,Sun Nov 27 10:52:33 +0000 2022,1596819259673299968,1,0,2,SupratmanAndang,ternyata cuma menyisakan sampah dimanamana tuk...,"['menyisakan', 'sampah', 'dimanamana', 'tuk', ...",2,positive


In [None]:
def _tokens_to_text(raw):
    """Turn a stringified token list such as "['a', 'b']" into "a b".

    Values that cannot be parsed as a list/tuple literal are returned as
    ``str(raw)`` unchanged, so malformed rows do not abort the run.
    """
    try:
        tokens = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return str(raw)
    if isinstance(tokens, (list, tuple)):
        return " ".join(str(token) for token in tokens)
    return str(raw)


# `preprocessed_text` is stored in the CSV as the repr of a Python list; feeding
# that string to the tokenizer would add brackets, quotes and commas as tokens.
train_text = df['preprocessed_text'].map(_tokens_to_text)
train_labels = df['polarity']

In [None]:
# Tokenizer matching the IndoBERT checkpoint that is fine-tuned further down.
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

# Every sample is padded/truncated to this many tokens.
# NOTE(review): 512 looks generous for tweet-length inputs — confirm before
# long training runs, since padding length drives memory and step time.
max_length = 512

train_dataset = SentimentDataset(
    train_text,
    train_labels,
    tokenizer,
    max_length,
)

# Shuffled mini-batches of 16 samples for training.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
)





In [None]:


# Fine-tune IndoBERT as a single-label classifier on `train_dataset`
# (built in the earlier cell, which also created `tokenizer`).

# Set the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SentimentDataset stores one one-hot row per sample, so the row width is the
# number of classes. A hard-coded head size that disagrees with it makes the
# loss fail on the first batch.
num_labels = train_dataset.labels.shape[1]

# Load the model with a classification head sized to the data
model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1", num_labels=num_labels
)
model.to(device)
# Explicitly (re-)initialise the freshly created classification head.
model.classifier.weight.data.normal_(mean=0.0, std=0.02)
model.classifier.bias.data.zero_()

# Shuffled mini-batches over the dataset defined earlier
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Set the number of training epochs and learning rate
num_epochs = 5
learning_rate = 2e-5

# Define the optimizer and scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set to the values the transformers class used by default.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

# Training loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # The dataset yields one-hot float rows; the model needs integer class
        # indices to apply cross-entropy (float targets select the
        # multi-label BCE path instead).
        labels = batch["labels"].argmax(dim=1).to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
