# Initialization

In [200]:
!pip install PySastrawi
!pip install nltk
!pip install transformers
!pip install pytorch-lightning==2.0.2
import nltk
nltk.download('popular');

^C


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\Vanessa\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\Vanessa\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\Vanessa\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\Vanessa\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\Vanessa\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nlt

In [2]:
import random
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn import metrics

import torch
import torch.nn.functional as F
import torch.nn as nn
from torchsummary import summary
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler
#from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertModel, BertConfig
import pandas as pd

import pytorch_lightning as pl
from torchmetrics.functional import accuracy
from torchmetrics import F1Score, AUROC
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
seed_val = 1906350793
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(seed_val)

In [4]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('Tersedia sejumlah %d GPU(s).' % torch.cuda.device_count())
    print('GPU yang akan digunakan:', torch.cuda.get_device_name(0))
else:
    print('Tidak mendukung GPU; hanya CPU')
    device = torch.device("cpu")

Tidak mendukung GPU; hanya CPU


# Loading datasets

## Load train dataset

In [375]:
df = pd.read_csv("hate-speech-classification/train.csv")
df.head()

Unnamed: 0,No,Tweet,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,0,- disaat semua cowok berusaha melacak perhatia...,0,0,0,0,1,0,1,0,1,0
1,1,USER USER Kaum cebong kapir udah keliatan dong...,1,0,0,0,0,1,0,0,0,1
2,2,USER Ya bani taplak dkk \xf0\x9f\x98\x84\xf0\x...,0,0,0,0,1,1,0,0,0,1
3,3,"Setidaknya gw punya jari tengah buat lu, sebel...",0,0,0,0,1,0,1,0,1,0
4,4,USER USER USER USER BANCI KALENG MALU GA BISA ...,0,0,0,1,0,0,1,0,1,0


In [376]:
LABEL_COLUMNS = df.columns.tolist()[2:12]
df[LABEL_COLUMNS]

Unnamed: 0,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,0,0,0,0,1,0,1,0,1,0
1,1,0,0,0,0,1,0,0,0,1
2,0,0,0,0,1,1,0,0,0,1
3,0,0,0,0,1,0,1,0,1,0
4,0,0,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
5556,0,0,0,0,1,0,1,0,1,0
5557,0,0,0,0,1,0,1,0,1,0
5558,0,0,0,0,1,1,0,0,0,1
5559,0,0,1,0,0,0,1,0,1,0


## Load test dataset

In [377]:
test_df = pd.read_csv("hate-speech-classification/test.csv")
test_df.head()

Unnamed: 0,No,Tweet
0,Test-1,"pemerintah sekarang pro asing, sudah tidak bis..."
1,Test-2,@EAndesla @SuaraAnies @aniesbaswedan @datuakma...
2,Test-3,namanya jg simpang susun bukan bundaran semang...
3,Test-4,@aheryawan Yg gak pake jilbab komunis? Megawat...
4,Test-5,"Ramos yang aku pandang idola dahulu, sekarang ..."


## Load other essentials

### Abusive terms

In [8]:
abusive_df = pd.read_csv("hate-speech-classification/abusive.csv")
abusive_list = abusive_df["ABUSIVE"].tolist()

In [9]:
print(abusive_list)

['alay', 'ampas', 'buta', 'keparat', 'anjing', 'anjir', 'babi', 'bacot', 'bajingan', 'banci', 'bandot', 'buaya', 'bangkai', 'bangsat', 'bego', 'bejat', 'bencong', 'berak', 'bisu', 'celeng', 'jancuk', 'bodoh', 'berengsek', 'budek', 'burik', 'jamban', 'cocot', 'congor', 'culun', 'cupu', 'dongok', 'dungu', 'edan', 'tai', 'ngewe', 'geblek', 'gembel', 'gila', 'goblok', 'iblis', 'idiot', 'jablay', 'jembud', 'jembut', 'jijik', 'kacrut', 'kafir', 'modar', 'kampang', 'kampret', 'kampungan', 'kimak', 'kontol', 'kunti', 'tuyul', 'kunyuk', 'mampus', 'memek', 'monyet', 'najis', 'nete', 'ngentot', 'noob', 'pecun', 'perek', 'sampah', 'sarap', 'setan', 'silit', 'bokong', 'sinting', 'sompret', 'sontoloyo', 'terkutuk', 'titit', 'pantat', 'tolol', 'udik', 'antek', 'asing', 'ateis', 'sitip', 'autis', 'picek', 'ayam kampus', 'bani kotak', 'bispak', 'bisyar', 'bokep', 'bong', 'cacat', 'cct', 'cebong', 'taplak', 'cungkring', 'gay', 'gembrot', 'gendut', 'hina', 'homo', 'komunis', 'koreng', 'krempeng', 'lengse

### Kamus alay terms

In [10]:
kamusalay_df = pd.read_csv("hate-speech-classification/new_kamusalay.csv", 
                           encoding = "ISO-8859-1",
                           names=["Alay", "Meaning"])
kamusalay_df.head()

Unnamed: 0,Alay,Meaning
0,anakjakartaasikasik,anak jakarta asyik asyik
1,pakcikdahtua,pak cik sudah tua
2,pakcikmudalagi,pak cik muda lagi
3,t3tapjokowi,tetap jokowi
4,3x,tiga kali


In [11]:
kamusalay_dict = dict(zip(kamusalay_df.Alay, kamusalay_df.Meaning))

In [12]:
print(kamusalay_dict)

{'anakjakartaasikasik': 'anak jakarta asyik asyik', 'pakcikdahtua': 'pak cik sudah tua', 'pakcikmudalagi': 'pak cik muda lagi', 't3tapjokowi': 'tetap jokowi', '3x': 'tiga kali', 'aamiin': 'amin', 'aamiinn': 'amin', 'aamin': 'amin', 'aammiin': 'amin', 'abis': 'habis', 'abisin': 'habiskan', 'acau': 'kacau', 'achok': 'ahok', 'ad': 'ada', 'adek': 'adik', 'adl': 'adalah', 'adlah': 'adalah', 'adlh': 'adalah', 'ado': 'ada', 'aduhh': 'aduh', 'aer': 'air', 'afdol': 'afdal', 'agamaataualqur': 'agama alquran', 'agm': 'agama', 'agma': 'agama', 'ahaha': 'haha', 'ahahaha': 'haha', 'ahehehehe': 'hehe', 'ahir': 'akhir', 'ahirnya': 'akhirnya', 'ahk': 'ahok', 'ahlamdulillah': 'alhamdulillah', 'ahli2': 'para ahli', 'ahlusunnah': 'ahlus sunah', 'ahmaddani': 'ahmad dhani', 'aho': 'ahok', 'ahoax': 'ahok', 'ahoaxx': 'ahok', 'ahog': 'ahok', 'ahokataudjarot': 'ahok djarot', 'ahokbebanijokowi': 'ahok beban jokowi', 'ahokbtp': 'ahok basuki tjahaja purnama', 'ahokditolakwarga': 'ahok ditolak warga', 'ahokdjarot':

# Preprocessing

### Preprocessing methods -> normalize, normalize_alay

In [381]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stop_factory = StopWordRemoverFactory()
stopword = stop_factory.create_stop_word_remover()

stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()

from nltk import stem

def preprocess(dataframe):

    #Lower case, extra spaces, special characters
    def normalize(text):
        try:
            # consider order, don't edit sequence
            res = text.lower().strip() # Mengubah uppercase menjadi lowercase dan melakukan trimming pada teks
            res = re.sub("\B\@\w+", "pengguna", res)
            res = re.sub("[^\w\s\-]+", " ", res) # Mengubah tanda baca kecuali strip menjadi spasi
            res = re.sub("_", " ", res) # Mengubah garis bawah menjadi spasi
            res = re.sub("[\bxf\b]+", " ", res)
            res = re.sub("\s+", " ", res) # Menghilangkan spasi berlebih
            res = re.sub("\d", "", res)
            return res
        except:
            print("NoneType found")

    #Use alay dict to convert terms
    def normalize_alay(text):
        try:
            #split text
            temp = text.split()
            res = []
            for word in temp:
                res.append(kamusalay_dict.get(word,word))
            res = ' '.join(res)
            return res
        except:
            print("NoneType found")


    #For stop word removal and stemming
    def stop_and_stem(text):
        try:
            #res = stopword.remove(text)
            res = stemmer.stem(text)
            return res
        except:
            print("NoneType found")
    
    dataframe['Tweet_normalized'] = dataframe['Tweet'].map(lambda text: normalize(text))
    dataframe['Tweet_normalized'] = dataframe['Tweet_normalized'].map(lambda text: normalize_alay(text))
    #dataframe['Tweet_normalized'] = dataframe['Tweet_normalized'].map(lambda text: stop_and_stem(text))

In [382]:
preprocess(df)
df

Unnamed: 0,No,Tweet,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong,Tweet_normalized
0,0,- disaat semua cowok berusaha melacak perhatia...,0,0,0,0,1,0,1,0,1,0,- di saat semua cowok berusaha melacak perhati...
1,1,USER USER Kaum cebong kapir udah keliatan dong...,1,0,0,0,0,1,0,0,0,1,pengguna pengguna kaum cebong kafir sudah keli...
2,2,USER Ya bani taplak dkk \xf0\x9f\x98\x84\xf0\x...,0,0,0,0,1,1,0,0,0,1,pengguna ya bani taplak dan kawan kawan
3,3,"Setidaknya gw punya jari tengah buat lu, sebel...",0,0,0,0,1,0,1,0,1,0,setidaknya gue punya jari tengah buat kamu seb...
4,4,USER USER USER USER BANCI KALENG MALU GA BISA ...,0,0,0,1,0,0,1,0,1,0,pengguna pengguna pengguna pengguna banci kale...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5556,5556,"USER Pak Recep..............anda salah, itu gu...",0,0,0,0,1,0,1,0,1,0,pengguna pak resep anda salah itu gubernur pak...
5557,5557,brengsek itu orang terbuat dr apa bikin gue be...,0,0,0,0,1,0,1,0,1,0,berengsek itu orang terbuat dari apa bikin gue...
5558,5558,Kapolda Babi! Biadap dan Bodoh! Gak punya otak...,0,0,0,0,1,1,0,0,0,1,kepala kepolisian daerah babi biadab dan bodoh...
5559,5559,USER jangan asal ngomong ndasmu. congor lu yg ...,0,0,1,0,0,0,1,0,1,0,pengguna jangan asal berbicara ndasmu congor k...


## List abusive words

In [383]:
df["Abusive_terms"] = df['Tweet_normalized'].str.split().apply(set(abusive_list).intersection)
df["Abusive_terms"]

0                       {bego}
1       {dungu, cebong, kafir}
2               {bani, taplak}
3                      {bacot}
4                      {banci}
                 ...          
5556                        {}
5557               {berengsek}
5558             {babi, bodoh}
5559          {congor, anjing}
5560                    {onta}
Name: Abusive_terms, Length: 5561, dtype: object

In [384]:
df

Unnamed: 0,No,Tweet,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong,Tweet_normalized,Abusive_terms
0,0,- disaat semua cowok berusaha melacak perhatia...,0,0,0,0,1,0,1,0,1,0,- di saat semua cowok berusaha melacak perhati...,{bego}
1,1,USER USER Kaum cebong kapir udah keliatan dong...,1,0,0,0,0,1,0,0,0,1,pengguna pengguna kaum cebong kafir sudah keli...,"{dungu, cebong, kafir}"
2,2,USER Ya bani taplak dkk \xf0\x9f\x98\x84\xf0\x...,0,0,0,0,1,1,0,0,0,1,pengguna ya bani taplak dan kawan kawan,"{bani, taplak}"
3,3,"Setidaknya gw punya jari tengah buat lu, sebel...",0,0,0,0,1,0,1,0,1,0,setidaknya gue punya jari tengah buat kamu seb...,{bacot}
4,4,USER USER USER USER BANCI KALENG MALU GA BISA ...,0,0,0,1,0,0,1,0,1,0,pengguna pengguna pengguna pengguna banci kale...,{banci}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5556,5556,"USER Pak Recep..............anda salah, itu gu...",0,0,0,0,1,0,1,0,1,0,pengguna pak resep anda salah itu gubernur pak...,{}
5557,5557,brengsek itu orang terbuat dr apa bikin gue be...,0,0,0,0,1,0,1,0,1,0,berengsek itu orang terbuat dari apa bikin gue...,{berengsek}
5558,5558,Kapolda Babi! Biadap dan Bodoh! Gak punya otak...,0,0,0,0,1,1,0,0,0,1,kepala kepolisian daerah babi biadab dan bodoh...,"{babi, bodoh}"
5559,5559,USER jangan asal ngomong ndasmu. congor lu yg ...,0,0,1,0,0,0,1,0,1,0,pengguna jangan asal berbicara ndasmu congor k...,"{congor, anjing}"


# Modelling w/ pretrained BERT

ref: <br>
https://kyawkhaung.medium.com/multi-label-text-classification-with-bert-using-pytorch-47011a7313b9 <br>
https://jovian.ml/kyawkhaung/1-titles-only-for-medium/v/1&cellId=10

## Variables

In [385]:
MAX_LEN = 16
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 4
LEARNING_RATE = 3e-05
tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased')

## Training the model

### Convert all labels to 1 list

In [145]:
df["HS_Target"] = np.where(df["HS_Individual"] == 1, 1, 2)
df["HS_Level"] = np.where(df["HS_Weak"] == 1, 1, np.where(df["HS_Moderate"] == 1, 2, 3))

In [146]:
df['labels'] = df[['HS_Religion', 'HS_Race', 'HS_Physical', 'HS_Gender', 'HS_Other', 'HS_Individual', 'HS_Group', 'HS_Weak', 'HS_Moderate', 'HS_Strong']].values.tolist()
#df['labels'] = df[['HS_Religion', 'HS_Race', 'HS_Physical', 'HS_Gender', 'HS_Other', 'HS_Level', 'HS_Target']].values.tolist()
df

Unnamed: 0,No,Tweet,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong,Tweet_normalized,HS_Target,HS_Level,labels,Abusive_terms
0,0,- disaat semua cowok berusaha melacak perhatia...,0,0,0,0,1,0,1,0,1,0,- di saat semua cowok berusaha melacak perhati...,2,2,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]",{bego}
1,1,USER USER Kaum cebong kapir udah keliatan dong...,1,0,0,0,0,1,0,0,0,1,pengguna pengguna kaum cebong kafir sudah keli...,1,3,"[1, 0, 0, 0, 0, 1, 0, 0, 0, 1]","{dungu, cebong, kafir}"
2,2,USER Ya bani taplak dkk \xf0\x9f\x98\x84\xf0\x...,0,0,0,0,1,1,0,0,0,1,pengguna ya bani taplak dan kawan kawan,1,3,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 1]","{bani, taplak}"
3,3,"Setidaknya gw punya jari tengah buat lu, sebel...",0,0,0,0,1,0,1,0,1,0,setidaknya gue punya jari tengah buat kamu seb...,2,2,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]",{bacot}
4,4,USER USER USER USER BANCI KALENG MALU GA BISA ...,0,0,0,1,0,0,1,0,1,0,pengguna pengguna pengguna pengguna banci kale...,2,2,"[0, 0, 0, 1, 0, 0, 1, 0, 1, 0]",{banci}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5556,5556,"USER Pak Recep..............anda salah, itu gu...",0,0,0,0,1,0,1,0,1,0,pengguna pak resep anda salah itu gubernur pak...,2,2,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]",{}
5557,5557,brengsek itu orang terbuat dr apa bikin gue be...,0,0,0,0,1,0,1,0,1,0,berengsek itu orang terbuat dari apa bikin gue...,2,2,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]",{berengsek}
5558,5558,Kapolda Babi! Biadap dan Bodoh! Gak punya otak...,0,0,0,0,1,1,0,0,0,1,kepala kepolisian daerah babi biadab dan bodoh...,1,3,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 1]","{babi, bodoh}"
5559,5559,USER jangan asal ngomong ndasmu. congor lu yg ...,0,0,1,0,0,0,1,0,1,0,pengguna jangan asal berbicara ndasmu congor k...,2,2,"[0, 0, 1, 0, 0, 0, 1, 0, 1, 0]","{congor, anjing}"


In [147]:
df2 = df[['Tweet_normalized', 'labels']].copy()
df2

Unnamed: 0,Tweet_normalized,labels
0,- di saat semua cowok berusaha melacak perhati...,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]"
1,pengguna pengguna kaum cebong kafir sudah keli...,"[1, 0, 0, 0, 0, 1, 0, 0, 0, 1]"
2,pengguna ya bani taplak dan kawan kawan,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 1]"
3,setidaknya gue punya jari tengah buat kamu seb...,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]"
4,pengguna pengguna pengguna pengguna banci kale...,"[0, 0, 0, 1, 0, 0, 1, 0, 1, 0]"
...,...,...
5556,pengguna pak resep anda salah itu gubernur pak...,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]"
5557,berengsek itu orang terbuat dari apa bikin gue...,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]"
5558,kepala kepolisian daerah babi biadab dan bodoh...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 1]"
5559,pengguna jangan asal berbicara ndasmu congor k...,"[0, 0, 1, 0, 0, 0, 1, 0, 1, 0]"


### Training and Validation dataset

In [346]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.title = dataframe['Tweet_normalized']
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [349]:
#Train:Validation = 8:2
train_size = 0.8
train_dataset = df2.sample(frac=train_size,random_state=200)
valid_dataset = df2.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df2.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(valid_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
validation_set = CustomDataset(valid_dataset, tokenizer, MAX_LEN)

FULL Dataset: (5561, 2)
TRAIN Dataset: (4449, 2)
TEST Dataset: (1112, 2)


In [350]:
train_dataset

Unnamed: 0,Tweet_normalized,labels
0,pengguna tidak aneh kata kita adminnya sih ade...,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]"
1,pengguna apa sih kamu tertawa mulu bangsat,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]"
2,rt pengguna ayo bangsa indonesia bersihkan etn...,"[0, 1, 0, 0, 0, 1, 0, 1, 0, 0]"
3,pengguna alah sombong benar punuk onta,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]"
4,mereka ini lah yang menggiring opini presiden ...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 1]"
...,...,...
4444,pengguna sama halnya dengan cina di indonesia ...,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 1]"
4445,demo tentang rohingya hampir diseluruh negara ...,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]"
4446,pengguna tetap saja salah dimata bani kampret ...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 1]"
4447,rt pengguna pengguna pemimpin rakyat bukan yan...,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]"


In [152]:
valid_dataset

Unnamed: 0,Tweet_normalized,labels
0,pengguna ya bani taplak dan kawan kawan,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 1]"
1,pengguna pengguna ahok the best abaikan kaum k...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 1]"
2,pengguna pengguna sih otak tempurung adalah ti...,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]"
3,pengguna pengguna pengguna pengguna gue menyar...,"[1, 0, 0, 0, 0, 0, 1, 0, 1, 0]"
4,ganti presiden secara konstitusional hak setia...,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]"
...,...,...
1107,mengomongnya begini nyatanya meminta periode t...,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]"
1108,yang begini adalah lembaga swadaya masyarakat ...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 1]"
1109,alquran kitab sampah dari zakar naik silakan d...,"[1, 0, 0, 0, 0, 1, 0, 0, 0, 1]"
1110,pengguna goblok bayangkan saja kalau janin itu...,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]"


### Training and Validation dataloader

In [351]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **test_params)

In [352]:
len(training_loader)

140

In [353]:
len(validation_loader)

35

### Making the model

In [325]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('indolem/indobert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(0.5)
        self.l3 = torch.nn.Linear(768, 10)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

### Loss function & Optimizer

In [354]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

### Essentials for loading model (checkpoint and the best one)

In [64]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss 
    return model, optimizer, checkpoint['epoch'], valid_loss_min

In [51]:
import shutil, sys   
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

### Training

In [355]:
#to use as global variables
val_targets=[]
val_outputs=[] 

def train_model(start_epochs,  n_epochs, valid_loss_min_input, 
                training_loader, validation_loader, model, 
                optimizer, checkpoint_path, best_model_path):
  
  
   
  # initialize tracker for minimum validation loss
  valid_loss_min = valid_loss_min_input 
   
 
  for epoch in range(start_epochs, n_epochs+1):
    train_loss = 0
    valid_loss = 0

    model.train()
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for batch_idx, data in enumerate(training_loader):
        print('yyy epoch', batch_idx)
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if batch_idx%5000==0:
            print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print('before loss data in training', loss.item(), train_loss)
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        print('after loss data in training', loss.item(), train_loss)
    
    print('############# Epoch {}: Training End     #############'.format(epoch))
    
    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    ######################    
    # validate the model #
    ######################
 
    model.eval()
   
    with torch.no_grad():
      for batch_idx, data in enumerate(validation_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

      print('############# Epoch {}: Validation End     #############'.format(epoch))
      # calculate average losses
      print('before cal avg train loss', train_loss)
      train_loss = train_loss/len(training_loader)
      valid_loss = valid_loss/len(validation_loader)
      # print training/validation statistics 
      print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch, 
            train_loss,
            valid_loss
            ))
      
      # create checkpoint variable and add important data
      checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
      }
        
        # save checkpoint
      save_ckp(checkpoint, False, checkpoint_path, best_model_path)
        
      ## TODO: save the model if validation loss has decreased
      if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
        # save checkpoint as best model
        save_ckp(checkpoint, True, checkpoint_path, best_model_path)
        valid_loss_min = valid_loss

    print('############# Epoch {}  Done   #############\n'.format(epoch))


  return model

In [356]:
checkpoint_path = 'current_checkpoint_bert2.pt'
best_model = 'best_model_bert2.pt'

#Load the model
#model, optimizer, start_epoch, valid_loss_min = load_ckp(checkpoint_path, model, optimizer)

trained_model = train_model(1, 4, 4, training_loader, validation_loader, model, 
                      optimizer,checkpoint_path,best_model)

############# Epoch 1: Training Start   #############
yyy epoch 0
Epoch: 1, Training Loss:  0.1977432817220688
before loss data in training 0.1977432817220688 0
after loss data in training 0.1977432817220688 0.1977432817220688
yyy epoch 1
before loss data in training 0.12761002779006958 0.1977432817220688
after loss data in training 0.12761002779006958 0.16267665475606918
yyy epoch 2
before loss data in training 0.13862916827201843 0.16267665475606918
after loss data in training 0.13862916827201843 0.15466082592805228
yyy epoch 3
before loss data in training 0.2564800977706909 0.15466082592805228
after loss data in training 0.2564800977706909 0.18011564388871193
yyy epoch 4
before loss data in training 0.24129369854927063 0.18011564388871193
after loss data in training 0.24129369854927063 0.19235125482082366
yyy epoch 5
before loss data in training 0.13116015493869781 0.19235125482082366
after loss data in training 0.13116015493869781 0.18215273817380268
yyy epoch 6
before loss data in

### Validation prediction

In [357]:
val_preds = (np.array(val_outputs) > 0.5).astype(int)
val_preds

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [1, 0, 0, ..., 0, 1, 0],
       [0, 0, 1, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [358]:
roc_auc = metrics.roc_auc_score(val_targets, val_preds)
f1_score_micro = metrics.f1_score(val_targets, val_preds, average='micro')
f1_score_macro = metrics.f1_score(val_targets, val_preds, average='macro')
print(f"ROC AUC Score = {roc_auc}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

ROC AUC Score = 0.7664050107030385
F1 Score (Micro) = 0.7931461137565147
F1 Score (Macro) = 0.6773379150131306


## Testing with test dataset

### Preprocessing

In [386]:
preprocess(test_df)
test_df["Abusive_terms"] = test_df['Tweet_normalized'].str.split().apply(set(abusive_list).intersection)
test_df

Unnamed: 0,No,Tweet,Tweet_normalized,Abusive_terms
0,Test-1,"pemerintah sekarang pro asing, sudah tidak bis...",pemerintah sekarang pro asing sudah tidak bisa...,{asing}
1,Test-2,@EAndesla @SuaraAnies @aniesbaswedan @datuakma...,pengguna pengguna pengguna pengguna pengguna p...,"{dungu, picek, cebong, gila}"
2,Test-3,namanya jg simpang susun bukan bundaran semang...,namanya juga simpang susun bukan bundaran sema...,{kampungan}
3,Test-4,@aheryawan Yg gak pake jilbab komunis? Megawat...,pengguna yang tidak pakai jilbab komunis megaw...,{komunis}
4,Test-5,"Ramos yang aku pandang idola dahulu, sekarang ...",ramos yang aku pandang idola dahulu sekarang s...,"{babi, jijik}"
...,...,...,...,...
145,Test-146,banyak kader-kader PDIP dan NASDEM anti Islam ...,banyak kader-kader partai demokrasi indonesia ...,{}
146,Test-147,RT @liputan6dotcom Ternyata Budha pun teroris,rt pengguna ternyata buddha pun teroris,{}
147,Test-148,@jonruginting @SurYosodipuro biasa mereka itul...,pengguna pengguna biasa mereka itulah para ant...,"{antek, komunis}"
148,Test-149,@kimjunh91 @ohsehunnr ASTAGA BOLOT BANGET DIA,pengguna pengguna astaga bolot banget dia,{}


### Testing dataset

In [361]:
test_dataset = test_df.drop(['No','Tweet','Abusive_terms'], axis=1)
test_dataset

Unnamed: 0,Tweet_normalized
0,pemerintah sekarang pro asing sudah tidak bisa...
1,pengguna pengguna pengguna pengguna pengguna p...
2,namanya juga simpang susun bukan bundaran sema...
3,pengguna yang tidak pakai jilbab komunis megaw...
4,ramos yang aku pandang idola dahulu sekarang s...
...,...
145,banyak kader-kader partai demokrasi indonesia ...
146,rt pengguna ternyata buddha pun teroris
147,pengguna pengguna biasa mereka itulah para ant...
148,pengguna pengguna astaga bolot banget dia


In [362]:
class TestDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.title = dataframe['Tweet_normalized']
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
        }

In [363]:
testing_set = TestDataset(test_dataset, tokenizer, MAX_LEN)

In [364]:
testing_set[0]

{'ids': tensor([    3,  1990,  2338,  3572,  3779,  1798,  1580,  1777, 18162,     4,
             0,     0,     0,     0,     0,     0]),
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

### Testing dataloader

In [365]:
testing_loader = DataLoader(testing_set, **test_params)

In [366]:
len(testing_loader)

5

### Testing

In [367]:
def testing(loader):
    model.eval()
   
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(loader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs

In [368]:
test_outputs = testing(testing_loader)
test_outputs = (np.array(test_outputs) >= 0.5).astype(int)

In [369]:
test_outputs

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [1, 1, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 1, 0],
       [0, 1, 0, ..., 0, 0, 1]])

In [370]:
preds_df = pd.DataFrame(test_outputs, columns =['HS_Religion', 'HS_Race', 'HS_Physical', 'HS_Gender', 'HS_Other', 'HS_Individual', 'HS_Group', 'HS_Weak', 'HS_Moderate', 'HS_Strong']) 
preds_df.head()

Unnamed: 0,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,0,0,0,0,1,1,0,0,0,1
1,0,0,0,0,1,0,1,0,1,0
2,0,0,0,0,1,0,1,0,1,0
3,0,1,0,0,0,0,1,0,1,0
4,0,0,0,0,1,0,1,0,1,0


### Save testing predictions

In [371]:
df_output = pd.concat([test_df, preds_df], axis=1).drop(['Tweet', 'Tweet_normalized', 'Abusive_terms'], axis=1)
df_output

Unnamed: 0,No,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,Test-1,0,0,0,0,1,1,0,0,0,1
1,Test-2,0,0,0,0,1,0,1,0,1,0
2,Test-3,0,0,0,0,1,0,1,0,1,0
3,Test-4,0,1,0,0,0,0,1,0,1,0
4,Test-5,0,0,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
145,Test-146,1,0,0,0,0,1,0,0,0,1
146,Test-147,1,0,0,0,0,1,0,0,0,1
147,Test-148,1,1,0,0,0,1,0,0,0,1
148,Test-149,0,0,1,1,0,0,1,0,1,0


In [372]:
df_output.to_csv('hate-speech-classification/tested_lr2.csv', index=False)