In [2]:
import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Show up to 155 characters of each cell when a DataFrame is displayed,
# so long tweets are not cut off too early.
pd.options.display.max_colwidth = 155


In [35]:
# Load the pretrained IndoBERT sequence-classification checkpoint (and its
# matching tokenizer) that is used to classify the tweets dataset.
checkpoint = "afbudiman/indobert-classification"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)


Downloading (…)okenizer_config.json: 100%|██████████| 522/522 [00:00<?, ?B/s] 
Downloading (…)solve/main/vocab.txt: 100%|██████████| 229k/229k [00:00<00:00, 395kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 709k/709k [00:00<00:00, 1.78MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 113kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.09k/1.09k [00:00<?, ?B/s]
Downloading pytorch_model.bin: 100%|██████████| 498M/498M [04:43<00:00, 1.76MB/s]


In [25]:
# Load the raw tweets (tab-separated file). Per the author's note, the data was
# crawled with the snscrape library and covers 01-04-2020 to 01-04-2022.
df = pd.read_csv("../dataset/twitter_ppkm_dataset.csv", delimiter="\t")


In [26]:
# Add a placeholder "Sentiment" column; every row starts as a single-space string.
df["Sentiment"] = " "


### Labeling the Tweets Dataset (Model: IndoBERT)


In [37]:
# Label every tweet with the IndoBERT classifier, one tweet per forward pass.
#
# Device selection and moving the model are loop-invariant, so they are done
# once here instead of once per row.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()  # inference only: make sure dropout layers are disabled

predicted_labels = []
for tweet in df["Tweet"]:
    # Pad/truncate each tweet to exactly 128 tokens and return PyTorch tensors.
    encoded_input = tokenizer(
        tweet,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # No gradients are needed for prediction.
    with torch.no_grad():
        input_ids = encoded_input["input_ids"].to(device)
        attention_mask = encoded_input["attention_mask"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # The index of the largest logit is the predicted class id.
    predicted_labels.append(torch.argmax(logits, dim=1).item())

# Assign all labels at once: this yields an integer column (no float/NaN
# intermediate) and avoids growing the frame cell by cell.
# "Sentiment" (capital S) is the placeholder column created for the labels;
# previously the labels only went into a lowercase "sentiment" column, which
# left "Sentiment" blank in the saved file.
df["Sentiment"] = predicted_labels
# Lowercase alias kept because later cells still reference df["sentiment"].
df["sentiment"] = df["Sentiment"]


In [38]:
# Save the labeled tweets as a tab-separated UTF-8 file, with a header row and
# without the index column.
df.to_csv(
    "../dataset/tweets_labeled_new.csv",
    index=False,
    sep="\t",
    encoding="utf-8",
    # Must be a real boolean: a string such as "true" (or even "false") is only
    # treated as truthy, which hides the intent.
    header=True,
)


In [47]:
# Cast the predicted labels in the lowercase "sentiment" column to integers.
df["sentiment"] = df["sentiment"].astype(int)


In [48]:
# Number of non-null values in every other column, per predicted label.
df.groupby("sentiment").count()


Unnamed: 0_level_0,Date,User,Tweet,Sentiment
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1860,1860,1860,1860
1,15309,15309,15309,15309
2,3874,3874,3874,3874


In [3]:
# Copy the labels from the lowercase "sentiment" column into "Sentiment",
# cast to integers (this overwrites whatever "Sentiment" held before).
df["Sentiment"] = df["sentiment"].astype(int)


### **_Prune Dataset To 10.000_**

> _Reduce the dataset from roughly 20 thousand tweets to 10 thousand._


_<p>code di bawah berikut berfungsi untuk merubah label pada dataset yang telah dilabeli sebelumnya dan mengatur ukuran dataset sebelumnya ke jumlah baru yang diinginkan pada dataset baru di bawah ini</p>_

**_<p>label : 1 (positive) & 0 (negative)</p>_**


In [5]:
# Load the labeled tweets from the "tweets_labeled" sheet of the V1 workbook
# and preview the first five rows.
df = pd.read_excel(
    "../dataset/INA_TweetsPPKM_Labeled_V1.xlsx",
    sheet_name="tweets_labeled",
)
df.head(n=5)


Unnamed: 0,Date,User,Tweet,Sentiment
0,2022-03-26 12:51:44,CendekiaDelapan,"Karena stimulasi untuk #anakberkebutuhankhusus tidak optimal jika dilakukan secara online.\n. \nUntuk memenuhi kebutuhan tersebut, #RumahCendekia8 teta...",2
1,2022-03-23 01:10:09,ViantAntony,Ye ampun mbh anda seorang Wapres loh.&amp;Gelar anda seorang kiyai @Kiyai_MarufAmin Knp anda terus membebani rakyatnya dengan cara2 licik sperti ini.N ...,2
2,2022-03-22 21:10:19,mtaufikmJKT48,Mulaik kumat lagi. 😀 #ppkm https://t.co/MNDSPHJU69,2
3,2022-03-22 15:43:24,PunditPintar,"Welcome PPKM bau bau naik level nih, gimana nih mang\n#ppkm",2
4,2022-03-22 15:30:34,PYI_Indonesia,"Sesungguhnya Allah tidak menciptakan kita, hambaNya ini, sebagai makhluk yang lemah lagi mudah berputus asa. Kita sebagai manusia diciptakan Allah seba...",2


In [6]:
# How many tweets carry each label before pruning.
label_counts = df["Sentiment"].value_counts()
label_counts


0    9482
1    7661
2    3900
Name: Sentiment, dtype: int64

In [7]:
# Drop every row labeled 1 and keep the rest.
# NOTE(review): label 1 is presumably the neutral class - confirm against the
# model's label mapping.
# The explicit .copy() makes the result an independent frame, so the in-place
# relabeling done afterwards via .loc is unambiguous (no chained-assignment /
# SettingWithCopyWarning concerns on a filtered slice).
df = df[df["Sentiment"] != 1].copy()
df["Sentiment"].value_counts()


0    9482
2    3900
Name: Sentiment, dtype: int64

In [8]:
# Relabel 0 -> 1. This must run before 2 is mapped to 0, otherwise the two
# classes would be merged.
# NOTE(review): presumably 0 was the positive class, matching the notebook's
# target scheme of 1 = positive - confirm.
is_label_zero = df["Sentiment"].eq(0)
df.loc[is_label_zero, "Sentiment"] = 1


In [9]:
# Relabel 2 -> 0.
# NOTE(review): presumably 2 was the negative class, matching the notebook's
# target scheme of 0 = negative - confirm.
is_label_two = df["Sentiment"].eq(2)
df.loc[is_label_two, "Sentiment"] = 0


In [10]:
# Randomly remove rows labeled 1 until only `target_label_1` of them remain.
# The number of rows to remove is derived from the data instead of a
# hard-coded class size, and is clamped at 0 so the cell is a no-op (rather
# than an error) when the class is already at or below the target.
target_label_1 = 6100
label_1_index = df.index[df["Sentiment"] == 1]
amount_value = max(len(label_1_index) - target_label_1, 0)

# Fixed seed so that the same rows are removed on every run.
rng = np.random.default_rng(42)
random_remove = rng.choice(label_1_index, size=amount_value, replace=False)
df = df.drop(random_remove)


In [11]:
# Sanity check after pruning: per-label counts, then the frame's (rows, columns).
print(df["Sentiment"].value_counts(), df.shape, sep="\n")


1    6100
0    3900
Name: Sentiment, dtype: int64
(10000, 4)


In [12]:
# Write the pruned, relabeled dataset as tab-separated UTF-8 text, with a
# header row and without the index column.
df.to_csv(
    "../dataset/INA_TweetsPPKM_Labeled_FIX.csv",
    sep="\t",
    header=True,
    index=False,
    encoding="utf-8",
)
