# VIETNAMESE HATE AND OFFENSIVE SPEECH DETECTOR
## (Data Preparation)

### Download VnCoreNLP

In [None]:
# Create the directory layout for VnCoreNLP: the jar goes in models/vncorenlp/ and the
# word-segmenter files in models/vncorenlp/models/wordsegmenter/.
!mkdir -p models/vncorenlp/models/wordsegmenter
# Download the jar and the two word-segmenter files from the VnCoreNLP GitHub repository
# into the current working directory.
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
# Move the downloaded files from the working directory into the layout created above.
!mv VnCoreNLP-1.1.1.jar models/vncorenlp/
!mv vi-vocab models/vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr models/vncorenlp/models/wordsegmenter/

#### Check JAVA runtime
**VnCoreNLP requires Java** to run, so make sure a Java runtime is already installed in the working environment.

In [1]:
import os

# Look up the two environment variables that determine which Java runtime is visible
# to this kernel, then print them; a missing variable is shown as None.
java_home = os.environ.get("JAVA_HOME")
search_path = os.environ.get("PATH")

print("JAVA_HOME:", java_home)
print("PATH:", search_path)

JAVA_HOME: /usr/local/opt/openjdk
PATH: /usr/local/opt/openjdk/bin:/Users/trinb/opt/miniconda3/envs/base/bin:/usr/local/opt/openjdk/bin:/usr/local/opt/openjdk/bin:/Users/trinb/opt/miniconda3/bin:/Users/trinb/opt/miniconda3/condabin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin


### Import Libraries

In [2]:
# data manipulation, visualization
import pandas as pd
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import importlib

# custom modules
import utils.preprocess as preprocessor

In [3]:
# Re-execute utils.preprocess so edits made to the module on disk are picked up
# without restarting the kernel.
importlib.reload(preprocessor)

<module 'utils.preprocess' from '/Users/trinb/works/vietnamese-hate-speech-detection/utils/preprocess.py'>

## I. Data Overview

In [4]:
# Load the three ViHSD splits from CSV. index_col=False tells pandas not to use
# the first CSV column as the DataFrame index.
train = pd.read_csv("data/vihsd/train.csv", index_col=False)
dev = pd.read_csv("data/vihsd/dev.csv", index_col=False)
test = pd.read_csv("data/vihsd/test.csv", index_col=False)

# reset_index returns a new frame rather than modifying in place; the original
# call discarded the result, so assign it back to make the statement effective.
train = train.reset_index(drop=True)

# Number of rows in each split.
print(len(train))
print(len(dev))
print(len(test))
     

24048
2672
6680


In [5]:
# Preview the first rows of the raw training split.
train.head(15)

Unnamed: 0,free_text,label_id
0,Em ƒë∆∞·ª£c l√†m fan c·ª©ng lu√¥n r·ªìi n√® ‚ù§Ô∏è reaction q...,0
1,ƒê√∫ng l√† b·ªçn m·∫Øt h√≠p l√≤ xo th·ª•t :))) b√™n vi·ªát n...,2
2,ƒê·∫≠u VƒÉn C∆∞·ªùng gi·ªù gi·ªëng th·∫±ng sida h∆°n √†,0
3,C√îN ƒê·ªí C·ª§C S√öC V√î NH√ÇN T√çNH ƒê·ªÄ NGHI VN. NH√Ä N∆Ø...,2
4,T·ª´ l√Ω thuy·∫øt ƒë·∫øn th·ª±c h√†nh l√† c·∫£ 1 c√¢u chuy·ªán ...,0
5,Coronavirus is manmade,0
6,ƒê·ªë ch√∫ng m nh·∫≠n ra ai,0
7,L√∫p l√∫p nh∆∞ ch√≥ .,1
8,ü•∞ü•∞ü•∞,0
9,"Th·∫ø m√† m√¨nh n√≥i m·∫•y th·∫±ng b·∫Øc k√¨, b·ªçn ƒë√≥ l·∫°i b...",2


In [6]:
# Label distribution of the training split as loaded from CSV.
train['label_id'].value_counts()

label_id
0    19886
2     2556
1     1606
Name: count, dtype: int64

In [7]:
# Label distribution of the dev split as loaded from CSV.
dev['label_id'].value_counts()

label_id
0    2190
2     270
1     212
Name: count, dtype: int64

In [8]:
# Label distribution of the test split as loaded from CSV.
test['label_id'].value_counts()

label_id
0    5548
2     688
1     444
Name: count, dtype: int64

The datasets are heavily **imbalanced**: the majority of the entries are `CLEAN`, which heavily affects the performance of the models.

=> *Rebalance the datasets by keeping fewer `CLEAN` entries*. This may reduce the amount of knowledge the models can learn from the data, but it improves their ability to classify the classes more precisely.

In [9]:
# Downsample the majority class (label_id == 0) in every split. The last argument
# is the per-split cap passed to reduce_class_size (3000 / 350 / 700 rows).
# NOTE: each line overwrites its input, so re-running this cell alone is not
# idempotent; re-run the data-loading cell first.
train = preprocessor.reduce_class_size(train, 'label_id', 0, 3000)
dev = preprocessor.reduce_class_size(dev, 'label_id', 0, 350)
# Fixed: downsample the test split itself. The original passed `train` here, which
# replaced the held-out test set with (already downsampled) training rows.
test = preprocessor.reduce_class_size(test, 'label_id', 0, 700)

In [10]:
# Label distribution of the training split after downsampling label 0.
train['label_id'].value_counts()

label_id
0    3000
2    2556
1    1606
Name: count, dtype: int64

## II. Data Preprocessing

### Examine tokenizer

#### vncorenlp

In [11]:
# Run the tokenizer with its default option on the training text at index label 8
# (the heading above labels this path as vncorenlp).
preprocessor.tokenizer(train['free_text'][8])

[['Chuy·ªán',
  'ƒë√©o',
  'ƒë√¢u',
  'xa',
  'v·ªÅ',
  'ƒë·∫°o_ƒë·ª©c',
  'm·∫•y',
  'th·∫±ng',
  'ch√≥',
  'c√¥ng_an',
  'giao_th√¥ng',
  ':',
  's√°ng',
  'n√£y',
  'e',
  'ƒëi',
  'ti·ªÖn',
  'm·∫•y',
  'th·∫±ng',
  'b·∫°n',
  'ƒëi',
  'l√≠nh',
  '..',
  'b·ªçn',
  'giao_th√¥ng',
  'ƒëi',
  'd·∫πp',
  'ƒë∆∞·ªùng',
  'n√≥',
  'c·∫ßm',
  'g·∫≠y',
  's·∫Øt',
  'v·ª´a',
  'v·ª•t',
  'v·ª´a',
  'ch·ª≠i',
  'd√¢n',
  'nh∆∞',
  'con',
  '"',
  'b·ªë',
  'm√†y',
  'ƒë·∫≠p',
  'ch·∫øt',
  'c·ª•',
  'm√†y',
  'gi·ªù',
  ',',
  'ƒë·ªãt',
  'c·ª•',
  'm√†y',
  '"',
  'm√¨nh',
  'ƒë√©o',
  'ng·ªù',
  'lu√¥n',
  'ƒë·∫•y']]

#### ViTokenizer

In [12]:
# Same text as the previous tokenizer example, but with option=2
# (the heading above labels this path as ViTokenizer).
preprocessor.tokenizer(train['free_text'][8], option=2)

'Chuy·ªán ƒë√©o ƒë√¢u xa v·ªÅ ƒë·∫°o_ƒë·ª©c m·∫•y th·∫±ng ch√≥ c√¥ng_an giao_th√¥ng : s√°ng n√£y e ƒëi ti·ªÖn m·∫•y th·∫±ng b·∫°n ƒëi l√≠nh . . b·ªçn giao_th√¥ng ƒëi d·∫πp ƒë∆∞·ªùng n√≥ c·∫ßm g·∫≠y s·∫Øt v·ª´a v·ª•t v·ª´a ch·ª≠i d√¢n nh∆∞ con " b·ªë m√†y ƒë·∫≠p ch·∫øt c·ª• m√†y gi·ªù , ƒë·ªãt c·ª• m√†y " m√¨nh ƒë√©o ng·ªù lu√¥n ƒë·∫•y'

### Examine filter stopwords

In [13]:
# Apply the stop-word filter to the training text at index label 4.
preprocessor.filter_stop_words(train['free_text'][4])

'H·∫£i Y·∫øn nh√¨n, ƒë√©o b·∫Øt ^^'

### Examine deEmojify

In [14]:
# Apply deEmojify to the training text at index label 1.
preprocessor.deEmojify(train['free_text'][1])

'cua ho√†ng ƒë·∫ø c∆° m√†'

In [21]:
# Check deEmojify on a hand-written string containing the text emoticon '=]]]]'.
preprocessor.deEmojify('aaa =]]]] aa')

'aaa  aa'

### Prepare data

In [15]:
# Separate each split into its raw text column (kept as a pandas Series)
# and its label column (converted to an array via .values).
X_train, y_train = train['free_text'], train['label_id'].values
X_dev, y_dev = dev['free_text'], dev['label_id'].values
X_test, y_test = test['free_text'], test['label_id'].values

In [16]:
# Preprocess the training texts and labels with the project pipeline:
# tokenization on, lowercasing off.
train_X, train_y = preprocessor.pre_process_features(
    X_train,
    y_train,
    tokenized=True,
    lowercased=False,
)

In [17]:
# Preprocess the dev (validation) texts and labels with the same settings
# as the training split: tokenization on, lowercasing off.
val_X, val_y = preprocessor.pre_process_features(
    X_dev,
    y_dev,
    tokenized=True,
    lowercased=False,
)

In [18]:
# Preprocess the test texts and labels with the same settings as the
# training split: tokenization on, lowercasing off.
test_X, test_y = preprocessor.pre_process_features(
    X_test,
    y_test,
    tokenized=True,
    lowercased=False,
)

### Export csv

In [19]:
# Column mapping for the processed training frame: texts under 'text', labels under 'labels'.
data = dict(text=train_X, labels=train_y)

In [20]:
# Build the processed training frame from the column mapping.
train_df = pd.DataFrame(data=data)

In [21]:
# Display the processed training frame to inspect it before it is written to disk.
train_df

Unnamed: 0,text,labels
0,Cho_ph√©p ƒë√©o c√° s·ªëng t·∫ßng S√ÇU ch·∫øt .ƒêm,1
1,cua ho√†ng_ƒë·∫ø,0
2,G·∫ßn B√°c Ng·∫°n ∆∞i,0
3,Ch·ªãu ƒë·ªçc b√°o d√πm .Ti·ªÅn x√¢y t∆∞·ª£ng d√¢n ƒë·ªãa_ph∆∞∆°n...,2
4,"H·∫£i_Y·∫øn nh√¨n , ƒë√©o b·∫Øt ^ ^",2
...,...,...
7157,V·∫£i l√¥ng t·ª∑ n·ªï l·∫±n n·ªï l·ªën dell quan_t√¢m,2
7158,"qu·∫£n_l√≠ internet , nh·ªìi_s·ªç d√¢n ph√∫t_gi√¢y .D√¢n ...",2
7159,b·ªëc b·ªëc h·ª≠i h·ª≠i,1
7160,M√° m·∫∑c ƒëeo,1


In [22]:
# Write the processed training frame to CSV. index=True is the pandas default,
# spelled out here: the row index is written as the first, unnamed column.
train_df.to_csv('data/processed_train.csv', index=True)

#### Val dataset

In [23]:
# Column mapping for the processed validation frame: texts under 'text', labels under 'labels'.
val_data = dict(text=val_X, labels=val_y)

In [24]:
# Build the processed validation frame from the column mapping.
val_df = pd.DataFrame(data=val_data)

In [25]:
# Write the processed validation frame to CSV. index=True is the pandas default,
# spelled out here: the row index is written as the first, unnamed column.
val_df.to_csv('data/processed_val.csv', index=True)

#### Test dataset

In [27]:
# Column mapping for the processed test frame: texts under 'text', labels under 'labels'.
test_data = dict(text=test_X, labels=test_y)

In [28]:
# Build the processed test frame from the column mapping.
test_df = pd.DataFrame(data=test_data)

In [29]:
# Write the processed test frame to CSV. index=True is the pandas default,
# spelled out here: the row index is written as the first, unnamed column.
test_df.to_csv('data/processed_test.csv', index=True)