In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [8]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax
import torch

In [2]:
# Load the tweets CSV into a DataFrame.
tweets_csv = r"../data/tweets.csv"
df = pd.read_csv(tweets_csv)

In [3]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,date,content,username,tweet_url,reply_count,retweet_count,like_count,verified,followers,content_clean,content_translated
0,2023-04-08 15:01:24+00:00,Dr Piutang THR\nDr Piutang Gaji\n Cr Pen...,txtdrakuntansi,https://twitter.com/txtdrakuntansi/status/1644...,1,6,11,False,59697,dr piutang thr dr piutang gaji cr pendapatan j...,from the receivables of the THR receivables fr...
1,2023-04-08 14:52:40+00:00,@islabellecoco @gojekindonesia Sedikit curhat ...,gummypark61,https://twitter.com/gummypark61/status/1644714...,1,1,1,False,14,sedikit curhat bahkan pas korona aja pernah ma...,a little vent even when corona just entered fu...
2,2023-04-08 14:45:03+00:00,rep dibawah sini yg mau spay thr receh 1.000 u...,yufada_,https://twitter.com/yufada_/status/16447129669...,26,1,1,False,113,rep dibawah sini yg mau spay thr receh 1000 un...,Rep below are those who want to spay THR DREH ...
3,2023-04-08 13:39:44+00:00,Selamat kepada :\n@emirahay82\n@0M_YANT0\n@Mis...,mindaart,https://twitter.com/mindaart/status/1644696530...,12,1,5,False,1551,selamat kepada pemenang ga thr masing2 100 spa...,Congratulations to the winner of GA THR each 1...
4,2023-04-08 12:51:00+00:00,ak supres pake link thr sama daget yh yg ke 4,haelovelychan,https://twitter.com/haelovelychan/status/16446...,18,2,35,False,57379,ak supres pake link thr sama daget yh yg ke 4,A suppress packed link


In [9]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Tokenizer and model are loaded from the same checkpoint; naming it once
# keeps the two from drifting apart.
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment'

tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME)

# Put the model in evaluation mode for inference.
model.eval()

Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.31MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.26MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 150/150 [00:00<00:00, 36.0kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 747/747 [00:00<00:00, 206kB/s]
Downloading pytorch_model.bin: 100%|██████████| 499M/499M [02:10<00:00, 3.83MB/s] 


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [17]:
import torch

def predict_sentiment(model, tokenizer, text, max_length=512):
    """Classify a single text as "negative", "neutral" or "positive".

    Args:
        model: Sequence-classification model. It is called with the keyword
            arguments ``input_ids`` and ``attention_mask`` and must return an
            object exposing a ``logits`` tensor with one row.
        tokenizer: Tokenizer callable returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        text: Text to classify. ``None`` and float NaN are treated as an
            empty string; any other non-string value is converted with
            ``str()``.
        max_length: Maximum number of tokens (special tokens included) kept
            after truncation. The default of 512 assumes a RoBERTa-base style
            checkpoint -- adjust it if the model accepts a different length.

    Returns:
        str: "negative" when the highest-scoring class index is 0,
        "neutral" when it is 1, and "positive" for any other index.
    """
    # Missing values (None / float NaN) and other non-string inputs would make
    # the tokenizer raise, so normalise them before tokenizing.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    # Tokenize with the special start/end tokens and cut over-long inputs so
    # the sequence never exceeds what the model can process.
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
        )

    # Index of the highest logit along the class dimension (dim 1).
    predicted_class_idx = torch.argmax(outputs.logits, dim=1).item()

    labels = ("negative", "neutral")
    if predicted_class_idx < len(labels):
        return labels[predicted_class_idx]
    return "positive"


In [20]:
# Predict a sentiment label for every entry of the 'content_translated'
# column, keeping the results in row order.
sentiments = [
    predict_sentiment(model, tokenizer, tweet)
    for tweet in df['content_translated']
]

# Attach the predictions to the DataFrame as a new 'sentiment' column.
df['sentiment'] = sentiments


In [21]:
# Inspect the DataFrame's column labels.
df.columns

Index(['date', 'content', 'username', 'tweet_url', 'reply_count',
       'retweet_count', 'like_count', 'verified', 'followers', 'content_clean',
       'content_translated', 'sentiment'],
      dtype='object')

In [22]:
# Spot-check the predicted labels next to the translated text.
preview_columns = ['content_translated', 'sentiment']
df[preview_columns].head()

Unnamed: 0,content_translated,sentiment
0,from the receivables of the THR receivables fr...,neutral
1,a little vent even when corona just entered fu...,neutral
2,Rep below are those who want to spay THR DREH ...,negative
3,Congratulations to the winner of GA THR each 1...,positive
4,A suppress packed link,neutral


In [25]:
# Count how many rows received each sentiment label.
df['sentiment'].value_counts()

sentiment
neutral     126
negative     42
positive     32
Name: count, dtype: int64

In [26]:
# Show the column labels currently present in the DataFrame.
df.columns

Index(['date', 'content', 'username', 'tweet_url', 'reply_count',
       'retweet_count', 'like_count', 'verified', 'followers', 'content_clean',
       'content_translated', 'sentiment'],
      dtype='object')

In [27]:
# Export the labelled tweets, keeping only the listed columns.
output_path = Path(r"../data/processed/tweets_label.csv")

# to_csv does not create missing folders, so make sure the target directory
# exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)

df[['date',
'username',
'content_clean',
'content_translated',
'sentiment']].to_csv(output_path, index=False)

: 