# Recurrent Neural Network based Hate Speech Language Model for Korean Hate Speech Detection

## 1. Data Collection 

### 1.1. Scraping Raw Hate Speech Text Data

In [None]:
from selenium import webdriver
import time
import pandas as pd

# Collect the URL of every post shown on the "best" board listing and save
# them to url_list_v2.csv (column 'url').
list_size = 10000
url = 'https://www.ilbe.com/list/ilbe?listSize={}&sub=best&listStyle=list'.format(list_size)
# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences.
driver = webdriver.Chrome(executable_path=r'D:\chromedriver_win32\chromedriver.exe')

url_list = []
try:
    driver.get(url)
    time.sleep(5)  # give the listing time to render before querying it

    # The generic find_elements(by, value) form is used because the
    # find_elements_by_xpath shortcut is not available in newer Selenium.
    post_list = driver.find_elements(
        'xpath',
        "//ul[contains(@class, 'board-body')]//li[not(@id) and not(@class)]"
        "//span[contains(@class, 'title')]//a[contains(@class, 'subject')]",
    )
    for post_num, post in enumerate(post_list):
        href = post.get_attribute('href')
        print(post_num, href)
        url_list.append(href)
finally:
    # Always release the browser, even when the page query fails.
    driver.quit()

# Written once after the loop instead of rewriting the file for every post.
pd.Series(url_list).to_frame(name='url').to_csv('url_list_v2.csv')

In [None]:
from selenium import webdriver
import time
import pandas as pd

# Visit every collected post URL, append its comments to the running list and
# checkpoint the de-duplicated list to hate_speech_raw_data.csv after each page.
hate_speech_list = pd.read_csv('hate_speech_raw_data.csv')['hate_speech'].tolist()
# NOTE(review): presumably the list must start empty (hate_speech_list = [])
# on a first run, when the checkpoint file does not exist yet -- confirm.
url_list = pd.read_csv('url_list_v2.csv')['url'].tolist()

# Index into url_list at which scraping (re)starts.
i = 1185
# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences.
driver = webdriver.Chrome(executable_path=r'D:\chromedriver_win32\chromedriver.exe')
try:
    for url_num, page_url in enumerate(url_list[i:], start=i):
        driver.get(page_url)
        # The generic find_elements(by, value) form is used because the
        # find_elements_by_xpath shortcut is not available in newer Selenium.
        comment_list = driver.find_elements('xpath', "//span[contains(@class, 'comment-box')]")
        for comment in comment_list:
            hate_speech_list.append(comment.text)
            print(url_num, comment.text)
        # Checkpoint after every page so an interrupted run loses at most one page.
        (pd.Series(hate_speech_list)
           .to_frame(name='hate_speech')
           .drop_duplicates()
           .reset_index(drop=True)
           .to_csv('hate_speech_raw_data.csv', index=False))
        time.sleep(5)  # pause between page requests
finally:
    # Always release the browser, even when a page load fails.
    driver.quit()

In [4]:
import pandas as pd 

# Load the scraped comments and show the last rows as a sanity check.
raw_csv_path = 'hate_speech_raw_data.csv'
data = pd.read_csv(raw_csv_path)
data.tail(5)

Unnamed: 0,hate_speech
129446,전라씨발 전라컹새끼들ㅋㅋㅋ
129447,ㅈㄹㄷ
129448,5월 영상을 지금 들먹이노 ㅋㅋㅋㅋ ㅇㅂ
129449,경찰대 나와서 엘리트출신도 아니도 그냥 경찰딱지달고 나부랭이들은 저런 결정권 주면 ...
129450,옜날꺼 ㅁㅈㅎ


## 2. Data Preprocessing

### 2.1. Text Preprocessing with KoNLPY

In [None]:
from konlpy.tag import Okt  
import pandas as pd 

# Reduce every scraped comment to its nouns (as tagged by Okt) and save the
# result to hate_speech_data.csv.
okt = Okt()
data = pd.read_csv('hate_speech_raw_data.csv')


def _nouns_as_sentence(raw_text):
    """Return the nouns of *raw_text* joined into one string.

    Every noun is prefixed with a single space, so a non-empty result starts
    with a space. Non-string cells (for example NaN from empty CSV fields)
    and texts the tagger fails on yield an empty string.
    """
    if not isinstance(raw_text, str):
        return ''
    try:
        nouns = okt.nouns(raw_text)
    except Exception as exc:
        # Best effort: one bad comment must not abort the whole run, but the
        # failure is reported instead of being silently swallowed.
        print('noun extraction failed:', repr(exc))
        return ''
    return ''.join(' ' + noun for noun in nouns)


sentences = []
for row_num, raw_text in enumerate(data['hate_speech']):
    sentence = _nouns_as_sentence(raw_text)
    sentences.append(sentence)
    print(row_num, sentence)

# Assign the whole column at once (no chained per-cell assignment) and write
# the file a single time instead of once per row.
data['hate_speech'] = sentences
data.to_csv('hate_speech_data.csv', index=False)

### 2.2. Topic Modeling

In [3]:
import pandas as pd

# Tokens stripped from the noun corpus before topic modeling. They are applied
# in this order as regular-expression patterns, so substring matches are
# removed as well.
stopwords = ['존나', '진짜', '사람', '나라', '생각', '이건', '씨발', '시발', '일베', '익명', '병신', '재앙', '문재인', '게이', '이기', '댓글', '정보', '새끼', '지랄', '개새끼', '그냥', '보고', '아주', '얼굴', '한국', '우리', '지금', '대통령', '홍어', '분탕']

# Missing cells become a single space so every row stays a string.
noun_corpus = pd.read_csv('hate_speech_data.csv').fillna(' ')
text = noun_corpus.replace(to_replace=stopwords, value="", regex=True)
text.tail()

Unnamed: 0,hate_speech
129446,전라 전라 컹
129447,
129448,영상
129449,경찰대 엘리트 출신 경찰 달 나부랭이 저런 결정 줫 뭔가 결정
129450,옜날꺼


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build the TF-IDF matrix of the noun corpus, keeping the top 10,000 words.
corpus = text['hate_speech']
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(corpus)
X.shape

(129451, 10000)

In [5]:
from sklearn.decomposition import NMF

# Factorise the TF-IDF matrix into 5 topics. fit_transform both fits the model
# and returns the per-document topic weights, so a separate fit() call (which
# made the model train twice) is not needed.
# NOTE(review): max_iter=1 stops the solver after a single iteration -- confirm
# this is intended and not a leftover from a quick test run.
nmf_model = NMF(n_components=5, random_state=777, max_iter=1)
nmf_top = nmf_model.fit_transform(X)

# Vocabulary terms in column order of X. Newer scikit-learn only provides
# get_feature_names_out; older releases only provide get_feature_names.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

In [18]:
def get_topics(components, feature_names, n=50):
    """Print the highest-weighted terms of every topic.

    Args:
        components: iterable of per-topic weight arrays (each must support
            ``argsort`` and element ``round``), one weight per feature.
        feature_names: sequence mapping a feature index to its term.
        n: number of top terms printed per topic.

    Prints one line per topic, numbered from 1, containing a list of
    ``(term, weight rounded to 2 decimals)`` pairs in descending weight order.
    """
    for topic_no, weights in enumerate(components, start=1):
        # argsort is ascending; reverse it and keep the first n indices.
        top_indices = weights.argsort()[::-1][:n]
        ranked_terms = []
        for feature_idx in top_indices:
            ranked_terms.append((feature_names[feature_idx], weights[feature_idx].round(2)))
        print("Topic %d:" % topic_no, ranked_terms)
# Show the default number of top terms for each fitted NMF topic.
get_topics(components=nmf_model.components_, feature_names=terms)

Topic 1: [('전라도', 4.96), ('출신', 0.09), ('경상도', 0.08), ('민주당', 0.07), ('고향', 0.07), ('때문', 0.07), ('대한민국', 0.06), ('서울', 0.05), ('지역', 0.05), ('광주', 0.05), ('세상', 0.04), ('지지율', 0.04), ('인간', 0.04), ('팩트', 0.04), ('특징', 0.04), ('사투리', 0.03), ('좌빨', 0.03), ('문제', 0.03), ('어디', 0.03), ('하나', 0.03), ('부모', 0.03), ('사회', 0.03), ('김대중', 0.03), ('부산', 0.03), ('혐오', 0.03), ('저런', 0.03), ('지지', 0.03), ('역시', 0.03), ('쓰레기', 0.03), ('북한', 0.03), ('독립', 0.03), ('사절', 0.03), ('정말', 0.03), ('이제', 0.03), ('동네', 0.02), ('통수', 0.02), ('거지', 0.02), ('버러지', 0.02), ('고향이', 0.02), ('카르텔', 0.02), ('폭동', 0.02), ('대구', 0.02), ('과학', 0.02), ('자기', 0.02), ('기업', 0.02), ('유전자', 0.02), ('자체', 0.02), ('조선족', 0.02), ('정도', 0.02), ('원래', 0.02)]
Topic 2: [('좌파', 3.04), ('우파', 0.99), ('소리', 0.97), ('국민', 0.83), ('미국', 0.79), ('개돼지', 0.77), ('북한', 0.77), ('정권', 0.7), ('박근혜', 0.68), ('하나', 0.64), ('수준', 0.6), ('좌빨', 0.57), ('보수', 0.57), ('선동', 0.56), ('정도', 0.56), ('때문', 0.55), ('문제', 0.55), ('대가리', 0.53), ('저런', 0.51),

In [3]:
import pandas as pd

# Cells that are exactly '0'..'3' are replaced by a Korean description.
# NOTE(review): the keys look like topic labels, yet the mapping is applied to
# the sentence column ('문장') rather than the label column -- confirm intent.
topic_descriptions = {
    '0': "특정 지역에 대한 차별적 발언",
    '1': "정치적 성향이 다른 사람들에 대한 혐오 및 왜곡",
    '2': "다른 나라에 대한 차별적 발언",
    '3': "여성 및 성소수자에 대한 혐오 및 왜곡",
}

text_df = pd.read_csv('hate_speech_topic_dataset.csv', index_col=0)
sentences_as_str = text_df['문장'].astype('str')
text_df['문장'] = sentences_as_str.replace(topic_descriptions)
text_df.tail()

Unnamed: 0,문장,혐오 여부
23589,신후게이야 ㅠㅠ,0
23590,최순실 300조 안민돌새끼는 진짜 사기죄로 처넣어야함\n\n일본의 무력에 굴복해서 ...,0
23591,경상도일 확률 1026%,3
23592,개쌍도가주도하는질서전라도가기생충처럼나라를살,1
23593,아무래도 우파로 간 김미균을 시기하는\n좌파들의 공작이 시작 된듯 하다,1


In [None]:
import pandas as pd
import numpy as np

# Label every raw comment with the index of its strongest NMF topic.
text = pd.read_csv('hate_speech_raw_data.csv', index_col=0)

# One vectorised argmax over the rows of nmf_top replaces the per-row chained
# assignment (text['topic'][i] = ...), which is slow and does not reliably
# write through to the frame.
topic_ids = np.argmax(nmf_top, axis=1)
if len(topic_ids) != len(text):
    raise ValueError(
        'nmf_top has {} rows but the raw data has {} rows'.format(len(topic_ids), len(text))
    )
# Stored as float so the column has the same dtype as before, when it was
# initialised with NaN.
text['topic'] = topic_ids.astype(float)

In [26]:
# Keep only rows whose topic is not 4.0 and save them; reset_index turns the
# current index back into a regular column.
keep_mask = text['topic'] != 4.0
cleaned = text[keep_mask].reset_index()
cleaned.to_csv('hate_speech_data_cleaned.csv')

## 3. Language Modeling

### 3.1. Training Data Preparation

In [27]:
import pandas as pd

# Count how many cleaned comments fall into each topic.
hate_text = pd.read_csv('hate_speech_data_cleaned.csv')
topic_counts = hate_text['topic'].value_counts()
topic_counts

1.0    96009
0.0    19805
2.0     5800
3.0     5795
Name: topic, dtype: int64

In [2]:
import pandas as pd

# Load the tab-separated ratings file (quoting=3 disables quote handling) and
# keep only the rows whose label is 1.
ratings = pd.read_csv('ratings.txt', sep='\t', quoting=3)
is_label_one = ratings['label'] == 1
random_text = ratings[is_label_one].reset_index(drop=True)
random_text.tail()

Unnamed: 0,id,document,label
99995,3793074,귀신보다 사람이 얼마나 무서운가를 보여주는.. 메시지까지 담고있는 드라마~최고!,1
99996,3025658,이라크 및 아랍과의 전쟁을 그린 모든 영화 중에서 가장 최고!!,1
99997,7698359,값으로 환산할 수 없을 만큼 귀엽고 황홀한 캐릭터 ㅠㅠ,1
99998,7068653,짱,1
99999,3206900,파괴지왕에서 장학우 콘서트 티켓을 얻기위한... ㅋ,1


In [None]:
import pandas as pd

# Build the binary dataset: every cleaned hate-speech comment gets label 1 and
# an equally long prefix of the random documents gets label 0.
hate_text = pd.read_csv('hate_speech_data_cleaned.csv')
random_text = random_text[random_text['label'] == 1].reset_index(drop=True)

dataset_dir = 'hate_speech_binary_dataset.csv'

hate_rows = pd.DataFrame({'문장': hate_text['hate_speech'], '혐오 여부': 1})
normal_rows = pd.DataFrame({
    '문장': random_text['document'][:len(hate_text['hate_speech'])],
    '혐오 여부': 0,
})

# The frame is assembled with a single concat: DataFrame.append is not
# available in recent pandas, and appending row by row while rewriting the
# CSV on every row made the cell quadratic in the number of rows.
dataframe = pd.concat([hate_rows, normal_rows], ignore_index=True)

# Only the saved copy is de-duplicated and stripped of missing rows; the
# in-memory frame keeps every row.
dataframe.drop_duplicates().dropna().reset_index(drop=True).to_csv(dataset_dir)

count = len(dataframe)
print(count)


In [25]:
import pandas as pd 
from sklearn.utils import shuffle

# Shuffle the topic dataset in place on disk and report the label balance.
dataset_dir = 'hate_speech_topic_dataset.csv'
topic_frame = pd.read_csv(dataset_dir, index_col=0)
df = shuffle(topic_frame).reset_index(drop=True)
df.to_csv(dataset_dir)
df['혐오 여부'].value_counts()

1    6000
0    5999
2    5800
3    5795
Name: 혐오 여부, dtype: int64

In [11]:
import pandas as pd

# Keep the first 90,000 cleaned comments.
cleaned_comments = pd.read_csv('hate_speech_data_cleaned.csv', index_col=0)
hate_text = cleaned_comments.iloc[:90000]
hate_text.tail()

Unnamed: 0,hate_speech,topic
89995,그러면 강아지랑 그것도 하셨네,1.0
89996,목줄안하고있는 개새끼들 죽여도 무죄아님?,1.0
89997,목줄 안해요 내가 안해요 내가 안하겠다는건데 누가 시킨다는거요,1.0
89998,왼쪽이 닥터드레냐?,1.0
89999,@앙마의속삭임 Dr.Dog,0.0


In [13]:
# Frame of the hate-speech comments, all labelled 0.
# NOTE(review): the earlier dataset-building cell labels hate speech 1 and
# random text 0; here the labels are the other way round -- confirm which
# convention is intended.
hate_sentences = hate_text['hate_speech'].tolist()
data = {
    '문장': hate_sentences,
    '혐오 여부': [0 for _ in hate_sentences],
}
df_0 = pd.DataFrame(data)
df_0.tail()

Unnamed: 0,문장,혐오 여부
89995,그러면 강아지랑 그것도 하셨네,0
89996,목줄안하고있는 개새끼들 죽여도 무죄아님?,0
89997,목줄 안해요 내가 안해요 내가 안하겠다는건데 누가 시킨다는거요,0
89998,왼쪽이 닥터드레냐?,0
89999,@앙마의속삭임 Dr.Dog,0


In [14]:
import pandas as pd

# Reload the ratings file and keep the rows labelled 1.
random_text = (
    pd.read_csv('ratings.txt', sep='\t', quoting=3)
      .loc[lambda frame: frame['label'] == 1]
      .reset_index(drop=True)
)
random_text.tail()

Unnamed: 0,id,document,label
99995,3793074,귀신보다 사람이 얼마나 무서운가를 보여주는.. 메시지까지 담고있는 드라마~최고!,1
99996,3025658,이라크 및 아랍과의 전쟁을 그린 모든 영화 중에서 가장 최고!!,1
99997,7698359,값으로 환산할 수 없을 만큼 귀엽고 황홀한 캐릭터 ㅠㅠ,1
99998,7068653,짱,1
99999,3206900,파괴지왕에서 장학우 콘서트 티켓을 얻기위한... ㅋ,1


In [15]:
# Frame of the random documents, all labelled 1.
# NOTE(review): the earlier dataset-building cell labels random text 0 and
# hate speech 1; here the labels are the other way round -- confirm which
# convention is intended.
random_sentences = random_text['document'].tolist()
data = {
    '문장': random_sentences,
    '혐오 여부': [1 for _ in random_sentences],
}
df_1 = pd.DataFrame(data)
df_1.tail()

Unnamed: 0,문장,혐오 여부
99995,귀신보다 사람이 얼마나 무서운가를 보여주는.. 메시지까지 담고있는 드라마~최고!,1
99996,이라크 및 아랍과의 전쟁을 그린 모든 영화 중에서 가장 최고!!,1
99997,값으로 환산할 수 없을 만큼 귀엽고 황홀한 캐릭터 ㅠㅠ,1
99998,짱,1
99999,파괴지왕에서 장학우 콘서트 티켓을 얻기위한... ㅋ,1


In [19]:
# Stack both labelled frames. The original row labels are kept in df; the
# index is renumbered only for the preview below.
labelled_frames = [df_0, df_1]
df = pd.concat(labelled_frames)
preview = df.reset_index(drop=True)
preview.tail()

Unnamed: 0,문장,혐오 여부
189995,귀신보다 사람이 얼마나 무서운가를 보여주는.. 메시지까지 담고있는 드라마~최고!,1
189996,이라크 및 아랍과의 전쟁을 그린 모든 영화 중에서 가장 최고!!,1
189997,값으로 환산할 수 없을 만큼 귀엽고 황홀한 캐릭터 ㅠㅠ,1
189998,짱,1
189999,파괴지왕에서 장학우 콘서트 티켓을 얻기위한... ㅋ,1


In [21]:
import pandas as pd 
from sklearn.utils import shuffle

# Randomly reorder the combined rows and renumber them from 0.
shuffled_rows = shuffle(df)
dataframe = shuffled_rows.reset_index(drop=True)
dataframe.tail()

Unnamed: 0,문장,혐오 여부
189995,원작을 읽을 때 이런 건 절대 영상화하기 힘들다고 생각했는데 벤휘쇼의 연기와 더불어...,1
189996,케석대 어깨 올라간거봐라 ㅋㅋ,0
189997,@김짜꾸 day and night\n\nround the clock\n\nwitho...,0
189998,로버트다우니주니어를 좋아해서 봤는데너무재밌게 봤던영화생각없이 볼때 딱좋음,1
189999,@익명_146173 개지랄병 병신좌좀새끼ㅋㅋㅋㅋ,0


In [22]:
# Number of rows per label in the shuffled frame.
label_counts = dataframe['혐오 여부'].value_counts()
label_counts

1    100000
0     90000
Name: 혐오 여부, dtype: int64

In [23]:
# Persist the shuffled binary dataset without the row index.
binary_dataset_path = 'hate_speech_binary_dataset.csv'
dataframe.to_csv(binary_dataset_path, index=False)

In [24]:
import pandas as pd 
from sklearn.utils import shuffle


# Reshuffle the binary dataset on disk and report the label balance.
# The target path is set here explicitly: relying on a dataset_dir left over
# from another cell would write the result over a different dataset.
dataset_dir = 'hate_speech_binary_dataset.csv'

# The file is saved without an index column, so it must be read without
# index_col=0; otherwise the sentence column becomes the index and is thrown
# away by reset_index(drop=True).
df = shuffle(pd.read_csv(dataset_dir)).reset_index(drop=True)
df.to_csv(dataset_dir, index=False)
df['혐오 여부'].value_counts()

1    100000
0     90000
Name: 혐오 여부, dtype: int64

In [None]:
import pandas as pd

# Build the 5-class dataset: labels 0-3 are the four hate-speech topics and
# label 4 is the random (non hate-speech) text.
hate_text = pd.read_csv('hate_speech_data_cleaned.csv')
hate_text_0 = hate_text[hate_text['topic'] == 0].reset_index(drop=True)
hate_text_1 = hate_text[hate_text['topic'] == 1].reset_index(drop=True)
hate_text_2 = hate_text[hate_text['topic'] == 2].reset_index(drop=True)
hate_text_3 = hate_text[hate_text['topic'] == 3].reset_index(drop=True)
random_text = random_text[random_text['label'] == 1].reset_index(drop=True)

dataset_dir = 'hate_speech_dataset.csv'

# (sentences, label) pairs in the order they are written. Topics 0 and 1 are
# capped at 6000 rows and the random text at 8000; topics 2 and 3 are used in
# full.
labelled_sources = [
    (hate_text_0['hate_speech'][:6000], 0),
    (hate_text_1['hate_speech'][:6000], 1),
    (hate_text_2['hate_speech'], 2),
    (hate_text_3['hate_speech'], 3),
    (random_text['document'][:8000], 4),
]

# The frame is assembled with a single concat: DataFrame.append is not
# available in recent pandas, and appending row by row while rewriting the
# CSV on every row made the cell quadratic in the number of rows.
dataframe = pd.concat(
    [pd.DataFrame({'문장': sentences, '혐오 여부': label}) for sentences, label in labelled_sources],
    ignore_index=True,
)

# Only the saved copy is de-duplicated and stripped of missing rows; the
# in-memory frame keeps every row.
dataframe.drop_duplicates().dropna().reset_index(drop=True).to_csv(dataset_dir)

count = len(dataframe)
print(count)


In [5]:
import pandas as pd 
from sklearn.utils import shuffle

# Shuffle the multi-class dataset in place on disk and preview the last rows.
dataset_dir = 'hate_speech_dataset.csv'
ordered_rows = pd.read_csv(dataset_dir, index_col=0)
df = shuffle(ordered_rows).reset_index(drop=True)
df.to_csv(dataset_dir)
df.tail()

Unnamed: 0,문장,혐오 여부
31502,일게이 퀄리티 살아잇노,1
31503,"제가 서부 영화를 좋아하는 데, 짱이네요.",4
31504,최고의 영화!! 말이 필요없습니다.,4
31505,관심주지마라,1
31506,아무도 부정할 수 없는 우리네 모습,4


In [6]:
import pandas as pd

# Reload the ratings file, keep the rows labelled 1 and preview the last rows.
random_text = pd.read_csv('ratings.txt', sep='\t', quoting=3)
random_text = random_text[random_text['label'] == 1].reset_index(drop=True)
# The former `random_text = random_text[]` was a syntax error; the rendered
# output of this cell is the tail of the frame.
random_text.tail()

Unnamed: 0,id,document,label
99995,3793074,귀신보다 사람이 얼마나 무서운가를 보여주는.. 메시지까지 담고있는 드라마~최고!,1
99996,3025658,이라크 및 아랍과의 전쟁을 그린 모든 영화 중에서 가장 최고!!,1
99997,7698359,값으로 환산할 수 없을 만큼 귀엽고 황홀한 캐릭터 ㅠㅠ,1
99998,7068653,짱,1
99999,3206900,파괴지왕에서 장학우 콘서트 티켓을 얻기위한... ㅋ,1


In [2]:
import pandas as pd

# Export the rows labelled 0 of the topic dataset to their own file.
data = pd.read_csv('hate_speech_topic_dataset.csv', index_col=0)
label_zero_rows = data.loc[data['혐오 여부'] == 0].reset_index(drop=True)
label_zero_rows.to_csv('hate_speech_topic_region.csv', index=False)

### 3.2. Recurrent Neural Network

In [None]:
from keras import Input, Model
from keras.layers import Embedding, Dense, Dropout, LSTM


class TextRNN:
    """Configuration holder and factory for an Embedding -> LSTM -> Dense model.

    Args:
        maxlen: length of every input sequence.
        max_features: size of the embedding vocabulary.
        embedding_dims: dimension of each embedding vector.
        class_num: number of units in the output layer.
        last_activation: activation of the output layer.
    """

    def __init__(self, maxlen, max_features, embedding_dims,
                 class_num=2,
                 last_activation='softmax'):
        self.maxlen, self.max_features = maxlen, max_features
        self.embedding_dims = embedding_dims
        self.class_num, self.last_activation = class_num, last_activation

    def get_model(self):
        """Build and return the (uncompiled) Keras model."""
        tokens = Input((self.maxlen,))

        embedded = Embedding(
            self.max_features,
            self.embedding_dims,
            input_length=self.maxlen,
        )(tokens)
        # Recurrent encoder with 128 units; a GRU could be used here instead.
        encoded = LSTM(128)(embedded)

        scores = Dense(self.class_num, activation=self.last_activation)(encoded)
        return Model(inputs=tokens, outputs=scores)


In [2]:
import pandas as pd
import numpy as np
from keras.callbacks import EarlyStopping
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Hyper-parameters.
max_features = 5000   # vocabulary size shared by the tokenizer and the embedding
maxlen = 128          # every sequence is padded/truncated to this length
batch_size = 64
embedding_dims = 50
epochs = 100

data = pd.read_csv('hate_speech_binary_dataset.csv').dropna().reset_index(drop=True)
data["문장"] = data["문장"].astype('string')

x_train, x_test, y_train, y_test = train_test_split(data["문장"], data["혐오 여부"], test_size=0.15)

# The vocabulary size comes from max_features so tokenizer and embedding layer
# cannot drift apart, and the vocabulary is learned from the training split
# only so that no information from the held-out split leaks into it.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

encoder = OneHotEncoder()

# The encoder is fitted on the training labels only; the held-out labels are
# transformed with that same fitted mapping instead of re-fitting on them.
# toarray() turns the encoder's sparse output into a dense array.
y_train = encoder.fit_transform(np.asarray(y_train).reshape(-1, 1)).toarray()
y_test = encoder.transform(np.asarray(y_test).reshape(-1, 1)).toarray()

model = TextRNN(maxlen, max_features, embedding_dims).get_model()
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 128, 50)           250000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 341,906
Trainable params: 341,906
Non-trainable params: 0
_________________________________________________________________


In [3]:
# Compile with Adam and categorical cross-entropy, tracking accuracy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Stop once validation accuracy has not improved for 3 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')

fit_options = {
    'batch_size': batch_size,
    'epochs': epochs,
    'callbacks': [early_stopping],
    'validation_data': (x_test, y_test),
}
model.fit(x_train, y_train, **fit_options)


Train on 161495 samples, validate on 28500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


<keras.callbacks.callbacks.History at 0x1c001983188>