# VIETNAMESE HATE AND OFFENSIVE SPEECH DETECTOR
## (Data Preparation)

#### Download VnCoreNLP

In [None]:
# Create the directory layout that will hold the VnCoreNLP jar and its word-segmenter files.
!mkdir -p models/vncorenlp/models/wordsegmenter
# Download the VnCoreNLP 1.1.1 jar and the two word-segmenter resources
# (vi-vocab, wordsegmenter.rdr) from the VnCoreNLP GitHub repository into the working directory.
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
# Move the downloads into place: the jar under models/vncorenlp/, the segmenter
# resources under models/vncorenlp/models/wordsegmenter/.
!mv VnCoreNLP-1.1.1.jar models/vncorenlp/
!mv vi-vocab models/vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr models/vncorenlp/models/wordsegmenter/

#### Check JAVA runtime
**VnCoreNLP requires Java** to run, so make sure a Java runtime is already installed in the working environment.

In [1]:
import os

# Print the Java-related environment variables as seen by the kernel.
for env_name in ("JAVA_HOME", "PATH"):
    print(f"{env_name}:", os.environ.get(env_name))

JAVA_HOME: /usr/local/opt/openjdk
PATH: /usr/local/opt/openjdk/bin:/Users/trinb/opt/miniconda3/envs/base/bin:/usr/local/opt/openjdk/bin:/usr/local/opt/openjdk/bin:/Users/trinb/opt/miniconda3/bin:/Users/trinb/opt/miniconda3/condabin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin


### Import Libraries

In [43]:
# data manipulation, visualization
import pandas as pd
import numpy as np
import importlib

# custom modules
import utils.preprocess as preprocessor

In [2]:
# Re-import utils.preprocess so that edits made to the module file take effect
# without restarting the kernel.
importlib.reload(preprocessor)

<module 'utils.preprocess' from '/Users/trinb/works/vietnamese-hate-speech-detection/utils/preprocess.py'>

## I. Data Overview

### VIHSD Dataset

In [3]:
# Load the three VIHSD splits. index_col=False tells pandas not to use the
# first CSV column as the index.
train = pd.read_csv("data/vihsd/train.csv", index_col=False)
dev = pd.read_csv("data/vihsd/dev.csv", index_col=False)
test = pd.read_csv("data/vihsd/test.csv", index_col=False)

# reset_index returns a new frame rather than modifying in place, so the
# result has to be assigned back for the call to have any effect.
train = train.reset_index(drop=True)

# Number of rows in each split.
print(len(train))
print(len(dev))
print(len(test))
     

24048
2672
6680


In [5]:
# Preview the first five rows of the VIHSD training split.
train.head(5)

Unnamed: 0,free_text,label_id
0,Em được làm fan cứng luôn rồi nè ❤️ reaction q...,0
1,Đúng là bọn mắt híp lò xo thụt :))) bên việt n...,2
2,Đậu Văn Cường giờ giống thằng sida hơn à,0
3,CÔN ĐỒ CỤC SÚC VÔ NHÂN TÍNH ĐỀ NGHI VN. NHÀ NƯ...,2
4,Từ lý thuyết đến thực hành là cả 1 câu chuyện ...,0


In [6]:
# Class distribution (label_id) of the full VIHSD training split.
train['label_id'].value_counts()

label_id
0    19886
2     2556
1     1606
Name: count, dtype: int64

In [7]:
# Class distribution (label_id) of the full VIHSD dev split.
dev['label_id'].value_counts()

label_id
0    2190
2     270
1     212
Name: count, dtype: int64

In [8]:
# Class distribution (label_id) of the full VIHSD test split.
test['label_id'].value_counts()

label_id
0    5548
2     688
1     444
Name: count, dtype: int64

The datasets are heavily **imbalanced**: the majority of the entries are `CLEAN` (label `0`), which heavily affects the performance of the models.

=> *rebalance the datasets by keeping fewer `CLEAN` entries*. This may cost the models some knowledge of clean text, but it should improve their ability to tell the classes apart.

In [9]:
# Keep an untouched copy of the full VIHSD training split before it is
# down-sampled; it is used later to build the binary-label training set.
train_raw = train.copy()

In [10]:
# Cap class 0 of `label_id` in every VIHSD split
# (train -> 3000 rows, dev -> 350 rows, test -> 700 rows).
# NOTE(review): not idempotent across kernel state - it overwrites train/dev/test in place.
train, dev, test = (
    preprocessor.reduce_class_size(split, 'label_id', 0, cap)
    for split, cap in ((train, 3000), (dev, 350), (test, 700))
)

In [11]:
# Class distribution of the VIHSD training split after capping class 0.
train['label_id'].value_counts()

label_id
0    3000
2    2556
1    1606
Name: count, dtype: int64

In [12]:
# Class distribution of the VIHSD test split after capping class 0.
test['label_id'].value_counts()

label_id
0    700
2    688
1    444
Name: count, dtype: int64

### VITHSD Dataset

In [13]:
# Load the three VITHSD splits. index_col=False tells pandas not to use the
# first CSV column as the index.
train1 = pd.read_csv("data/vithsd/train.csv", index_col=False)
dev1 = pd.read_csv("data/vithsd/dev.csv", index_col=False)
test1 = pd.read_csv("data/vithsd/test.csv", index_col=False)

# Reset the index of the frame loaded in this cell (train1, not the VIHSD
# `train`). reset_index returns a new frame, so the result is assigned back.
train1 = train1.reset_index(drop=True)

# Number of rows in each split.
print(len(train1))
print(len(dev1))
print(len(test1))
     

7000
1201
1800


In [14]:
# Drop the leftover 'Unnamed: 0' column from every VITHSD split (presumably a
# row index that was written into the CSV when it was saved).
# errors='ignore' keeps the cell re-runnable: once the column is gone a second
# run is a no-op instead of raising KeyError.
train1, dev1, test1 = (
    split.drop(columns=['Unnamed: 0'], errors='ignore')
    for split in (train1, dev1, test1)
)

In [15]:
# Run the project helper annotate_overall_hate over each VITHSD split.
# NOTE(review): presumably this adds the overall `label_id` column that the
# following cells read - confirm in utils/preprocess.py.
train1, dev1, test1 = (
    preprocessor.annotate_overall_hate(split)
    for split in (train1, dev1, test1)
)

In [16]:
# Align the VITHSD text column name with VIHSD: 'content' -> 'free_text'.
text_column_map = {'content': 'free_text'}
train1 = train1.rename(columns=text_column_map)
dev1 = dev1.rename(columns=text_column_map)
test1 = test1.rename(columns=text_column_map)

In [17]:
# Class distribution (label_id) of the VITHSD test split.
test1['label_id'].value_counts()

label_id
0    866
2    576
1    358
Name: count, dtype: int64

In [18]:
# Class distribution (label_id) of the VITHSD dev split.
dev1['label_id'].value_counts()

label_id
0    621
2    373
1    207
Name: count, dtype: int64

In [19]:
# Preview the first rows of the VITHSD training split after annotation and renaming.
train1.head()

Unnamed: 0,free_text,label_id
0,Chào nhà báo vũ hoàng Lân. Chào danh hài vũ tr...,1
1,Anh Thi nào đi đăng ký rủ anh nhé.,0
2,ở mỹ ns cái lồn gì k đc,0
3,"Thôn làm vậy là do thôn lầm, thôn lầm nên dân ...",0
4,Phạm Văn Lộc fuho trời nắng chóng mặt hả idol,0


### Combine datasets for a richer training set

In [32]:
# Merge the down-sampled VIHSD training split with all three VITHSD splits.
# VITHSD dev/test are folded into the training data; the evaluation sets used
# later (`dev`, `test`) come from VIHSD only.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each input
# keeps its own row labels, which can repeat across frames and make label-based
# lookups such as train['free_text'][8] ambiguous.
# NOTE(review): re-running this cell appends the VITHSD rows to `train` again.
train = pd.concat([train, train1, test1, dev1], ignore_index=True)

In [33]:
# Class distribution of the combined training set.
train['label_id'].value_counts()

label_id
0    7899
2    5799
1    3465
Name: count, dtype: int64

In [34]:
# Cap class 0 of the combined training set at 6000 rows so it is closer in size to class 2.
train = preprocessor.reduce_class_size(train, 'label_id', 0, 6000)

In [35]:
# Class distribution of the combined training set after capping class 0.
train['label_id'].value_counts()

label_id
0    6000
2    5799
1    3465
Name: count, dtype: int64

### Binary Label dataset
Used to distinguish only between clean (`0`) and offensive (`1`) text: label `2` is merged into label `1`.

In [20]:
# Build the binary training pool from the full (not down-sampled) VIHSD training
# split plus all three VITHSD splits.
# ignore_index=True gives the result a fresh 0..n-1 index instead of carrying
# over each input's row labels, which can repeat across the concatenated frames.
train_binary = pd.concat([train_raw, train1, test1, dev1], ignore_index=True)

In [21]:
# Class distribution of the binary training pool before labels are merged.
train_binary['label_id'].value_counts()

label_id
0    24785
2     5799
1     3465
Name: count, dtype: int64

In [22]:
# Independent copies of the VIHSD test/dev frames for binary relabelling, so the
# in-place label edits below do not touch `test` / `dev`.
test_binary, dev_binary = (split.copy() for split in (test, dev))

In [23]:
# Collapse the three-class scheme into two classes by rewriting label 2 as
# label 1, in place, on each binary frame.
for binary_frame in (test_binary, dev_binary, train_binary):
    binary_frame.loc[binary_frame['label_id'] == 2, 'label_id'] = 1

In [24]:
# Class distribution of the binary training pool after merging label 2 into label 1.
train_binary['label_id'].value_counts()

label_id
0    24785
1     9264
Name: count, dtype: int64

In [25]:
# Cap class 0 of the binary training set at 11000 rows to reduce the imbalance against class 1.
train_binary = preprocessor.reduce_class_size(train_binary, 'label_id', 0, 11000)

In [26]:
# Class distribution of the binary training set after capping class 0.
train_binary['label_id'].value_counts()

label_id
0    11000
1     9264
Name: count, dtype: int64

# II. Data Preprocess

### Examine tokenizer

#### vncorenlp

In [22]:
# Tokenize one sample comment with the tokenizer's default option (the vncorenlp
# path, per this section's heading). The result is a list of token lists, with
# multi-syllable words joined by '_'.
# NOTE(review): train['free_text'][8] selects by index label 8, not by position -
# confirm the index is unique at this point.
preprocessor.tokenizer(train['free_text'][8])

[['Chuyện',
  'đéo',
  'đâu',
  'xa',
  'về',
  'đạo_đức',
  'mấy',
  'thằng',
  'chó',
  'công_an',
  'giao_thông',
  ':',
  'sáng',
  'nãy',
  'e',
  'đi',
  'tiễn',
  'mấy',
  'thằng',
  'bạn',
  'đi',
  'lính',
  '..',
  'bọn',
  'giao_thông',
  'đi',
  'dẹp',
  'đường',
  'nó',
  'cầm',
  'gậy',
  'sắt',
  'vừa',
  'vụt',
  'vừa',
  'chửi',
  'dân',
  'như',
  'con',
  '"',
  'bố',
  'mày',
  'đập',
  'chết',
  'cụ',
  'mày',
  'giờ',
  ',',
  'địt',
  'cụ',
  'mày',
  '"',
  'mình',
  'đéo',
  'ngờ',
  'luôn',
  'đấy']]

#### ViTokenizer

In [12]:
# Same sample through option=2 (the ViTokenizer path, per this section's heading).
# Here the result is a single space-separated string rather than a nested list.
preprocessor.tokenizer(train['free_text'][8], option=2)

'Chuyện đéo đâu xa về đạo_đức mấy thằng chó công_an giao_thông : sáng nãy e đi tiễn mấy thằng bạn đi lính . . bọn giao_thông đi dẹp đường nó cầm gậy sắt vừa vụt vừa chửi dân như con " bố mày đập chết cụ mày giờ , địt cụ mày " mình đéo ngờ luôn đấy'

### Examine filter stopwords

In [13]:
# Run the stop-word filter on one sample comment to inspect what text remains.
preprocessor.filter_stop_words(train['free_text'][4])

'Hải Yến nhìn, đéo bắt ^^'

### Examine deEmojify

In [14]:
# Run deEmojify on one sample comment to inspect the cleaned text.
preprocessor.deEmojify(train['free_text'][1])

'cua hoàng đế cơ mà'

In [21]:
# Check deEmojify on a text emoticon: '=]]]]' is removed while the surrounding
# text and both neighbouring spaces are kept.
preprocessor.deEmojify('aaa =]]]] aa')

'aaa  aa'

### Prepare data

In [36]:
# Multi-class inputs: the raw text column and the label array for each split.
X_train, y_train = train['free_text'], train['label_id'].values
X_dev, y_dev = dev['free_text'], dev['label_id'].values
X_test, y_test = test['free_text'], test['label_id'].values

In [37]:
# Preprocess the multi-class training text with tokenization on and lowercasing off.
# NOTE(review): train_X / train_y (and val_X, test_X) are reassigned in the
# "Binary DF" section below, and only binary_*.csv files are exported, so the
# multi-class results are not persisted by this notebook as written.
train_X, train_y = preprocessor.pre_process_features(X_train, y_train, tokenized=True, lowercased = False)

In [38]:
# Preprocess the multi-class dev split with the same options as the training split.
val_X, val_y = preprocessor.pre_process_features(X_dev, y_dev, tokenized=True, lowercased = False)

In [39]:
# Preprocess the multi-class test split with the same options as the training split.
test_X, test_y = preprocessor.pre_process_features(X_test, y_test, tokenized=True, lowercased= False)

### Binary DF

In [28]:
# Binary-label inputs. These reuse (and therefore overwrite) the X_*/y_* names
# defined for the multi-class data in the "Prepare data" section.
X_train, y_train = train_binary['free_text'], train_binary['label_id'].values
X_dev, y_dev = dev_binary['free_text'], dev_binary['label_id'].values
X_test, y_test = test_binary['free_text'], test_binary['label_id'].values

# Preprocess the dev and test splits here; the training split is processed in
# the next cell.
shared_opts = dict(tokenized=True, lowercased=False)
val_X, val_y = preprocessor.pre_process_features(X_dev, y_dev, **shared_opts)
test_X, test_y = preprocessor.pre_process_features(X_test, y_test, **shared_opts)

In [33]:
# Process separately as the request to vncorenlp is limited:
# the first `split_at` rows go through in one call, the remainder in a second call.
split_at = 10000
train_X1, train_y1 = preprocessor.pre_process_features(
    X_train[:split_at], y_train[:split_at], tokenized=True, lowercased=False
)
train_X2, train_y2 = preprocessor.pre_process_features(
    X_train[split_at:], y_train[split_at:], tokenized=True, lowercased=False
)

# Stitch the two halves back together in their original order.
train_X = train_X1 + train_X2
train_y = np.concatenate([train_y1, train_y2])

# III. Export csv

In [50]:
# Column layout for the exported training CSV: preprocessed text plus its label.
data = { 'text': train_X, 'labels': train_y }

In [51]:
# Build the training DataFrame with the 'text' and 'labels' columns.
train_df = pd.DataFrame(data)

In [52]:
# Display the frame as a sanity check before exporting it.
train_df

Unnamed: 0,text,labels
0,From cà_mau ( Phú_Hưng ),0
1,Anh kêu lương 7-8 triệu cua Hoàng_Đế,0
2,Mia_Mêlanô đù đóng tập vs nè,0
3,"Có kẻ vĩ cuồng , dọn cỗ quan_tài đẹp nhất , lă...",1
4,Huy_Le bún riêu haha,0
...,...,...
20259,"Cái lỗ mũi lỗ trâu ngột_ngạt , hít khí trời ng...",1
20260,Ủa tiền dịch,1
20261,Má đẹp trời,0
20262,@ Phan ngô mày clip minh hiếu ko đụng linda co...,0


In [54]:
# Write the binary training set to disk. With to_csv's default index=True the
# row index is saved as an extra unnamed first column.
# NOTE(review): pass index=False here if downstream readers do not need that column.
train_df.to_csv('data/binary_train.csv')

#### Val dataset

In [56]:
# Column layout for the exported validation CSV: preprocessed text plus its label.
val_data = { 'text': val_X, 'labels': val_y }

In [57]:
# Build the validation DataFrame with the 'text' and 'labels' columns.
val_df = pd.DataFrame(val_data)

In [58]:
# Write the binary validation set to disk. With to_csv's default index=True the
# row index is saved as an extra unnamed first column.
val_df.to_csv('data/binary_val.csv')

#### Test dataset

In [59]:
# Column layout for the exported test CSV: preprocessed text plus its label.
test_data = { 'text': test_X, 'labels': test_y }

In [60]:
# Build the test DataFrame with the 'text' and 'labels' columns.
test_df = pd.DataFrame(test_data)

In [61]:
# Write the binary test set to disk. With to_csv's default index=True the
# row index is saved as an extra unnamed first column.
test_df.to_csv('data/binary_test.csv')