In [184]:
import re
import string

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build a Sastrawi stemmer once; it is reused by the preprocessing cell to
# stem every word of the input sentence.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [185]:
import torch
from transformers import BertForSequenceClassification, AutoTokenizer

In [186]:
# Tokenizer of the IndoBERT base checkpoint.
# NOTE(review): presumably the classifier was fine-tuned from this same
# checkpoint, so the vocabularies match — confirm.
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

# torch.load unpickles the whole model object, which can execute arbitrary
# code: only load checkpoints that come from a trusted source.
# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on a
# machine without CUDA; the model is moved to the target device in a later cell.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects fully pickled models — pass weights_only=False there if required.
model = torch.load("depression_classifier.pt", map_location="cpu")

In [187]:
sentence = "Aku udah merasa cukup sama hidup ini. Lebih baik aku mati"

# Character ranges that are stripped as "emoji".
# NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also covers
# many non-emoji scripts (e.g. CJK). It is kept unchanged so that the cleaning
# applied here does not drift from what has been used so far.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+"
)


def preprocess_text(raw):
    """Clean and stem a sentence before it is handed to the tokenizer.

    Steps, in order: lowercase, drop digits, drop text inside square
    brackets, drop URLs, drop emoji, drop ASCII punctuation, then stem every
    word with the module-level Sastrawi ``stemmer``.

    Args:
        raw: The input sentence; anything that is not a string is converted
            with ``str()`` first.

    Returns:
        The cleaned sentence as one string, words separated by single spaces.
    """
    cleaned = str(raw).lower()
    cleaned = re.sub(r'\d+', '', cleaned)                    # numbers
    cleaned = re.sub(r'\[.*?\]', '', cleaned)                # text in square brackets
    cleaned = re.sub(r'https?://\S+|www\.\S+', '', cleaned)  # URLs
    cleaned = EMOJI_PATTERN.sub('', cleaned)                 # emoji
    cleaned = re.sub('[%s]' % re.escape(string.punctuation), '', cleaned)  # punctuation
    # split() without an argument drops the empty tokens that removed numbers
    # or URLs leave behind, so the joined result never has doubled spaces.
    return ' '.join(stemmer.stem(word) for word in cleaned.split())


text = preprocess_text(sentence)

In [188]:
# Show the preprocessed sentence.
text

'aku udah rasa cukup sama hidup ini lebih baik aku mati'

In [189]:
# Tokenize once and reuse the result for every line printed below.
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# NOTE: `text` is the preprocessed sentence, so "Original" here means the
# tokenizer's input, not the raw `sentence`.
print('Original: ', text)

# Print the sentence split into tokens.
print('Tokenized: ', tokens)

# Print the sentence mapped to token ids.
print('Token IDs: ', token_ids)

print('Length of the token:', len(tokens))

Original:  aku udah rasa cukup sama hidup ini lebih baik aku mati
Tokenized:  ['aku', 'udah', 'rasa', 'cukup', 'sama', 'hidup', 'ini', 'lebih', 'baik', 'aku', 'mati']
Token IDs:  [304, 2137, 1214, 724, 500, 669, 92, 216, 342, 304, 1861]
Length of the token: 11


In [190]:
# Fixed sequence length every input is padded or truncated to.
# NOTE(review): presumably this matches the length used at training time — confirm.
MAX_LENGTH = 33

# Calling the tokenizer directly is the current API for encoding text, and
# padding="max_length" replaces the deprecated pad_to_max_length=True flag.
encoded_dict = tokenizer(
    text,                          # Sentence to encode.
    add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
    max_length=MAX_LENGTH,         # Target length for padding and truncation.
    truncation=True,               # Cut inputs longer than max_length.
    padding="max_length",          # Pad shorter inputs up to max_length.
    return_attention_mask=True,    # Construct attention masks.
    return_tensors="pt",           # Return PyTorch tensors.
)

In [191]:
# Inspect the tokenizer output: input ids, token type ids and attention mask.
encoded_dict

{'input_ids': tensor([[   2,  304, 2137, 1214,  724,  500,  669,   92,  216,  342,  304, 1861,
            3,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [192]:
# Pull the two tensors the model consumes out of the tokenizer output.
input_ids, attention_masks = (
    encoded_dict[key] for key in ('input_ids', 'attention_mask')
)

In [193]:
# Token ids of the sentence, padded out to the requested max_length.
input_ids

tensor([[   2,  304, 2137, 1214,  724,  500,  669,   92,  216,  342,  304, 1861,
            3,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0]])

In [194]:
# Attention mask; in the output below the ones line up with the non-padding ids.
attention_masks

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [195]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the classifier onto the selected device.
model = model.to(device)

In [196]:
# Place both input tensors on the same device as the model.
b_input_ids, b_input_mask = (
    tensor.to(device) for tensor in (input_ids, attention_masks)
)

In [197]:
# Put the model in evaluation mode first: inference_mode() only turns off
# autograd tracking, it does not disable training-time layers such as dropout,
# so without eval() the logits could differ from run to run.
model.eval()

# Forward pass without gradient tracking.
with torch.inference_mode():
    output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

# Raw, unnormalized class scores for the input sentence.
logits = output.logits

In [198]:
# Inspect the raw class scores returned by the model.
logits

tensor([[-0.3448,  0.3836]], device='cuda:0')

In [199]:
# Score for class index 0 (the class reported as "Not depresi" further down).
logits[0][0]

tensor(-0.3448, device='cuda:0')

In [200]:
# Score for class index 1 (the class reported as "Depresi" further down).
logits[0][1]

tensor(0.3836, device='cuda:0')

In [201]:
# Report the class with the higher score; a tie falls through to "Not depresi".
depressed_score = logits[0][1]
other_score = logits[0][0]
print("Depresi" if depressed_score > other_score else "Not depresi")

Depresi
