# Chapter 4.1 - 영어 텍스트 분류, P145 

## Load data

In [1]:
%reset

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

%matplotlib inline

DATA_PATH = "~/Workspace/Git_Repos/korean-hate-speech-detection/study/word2vec-nlp-tutorial/"

In [2]:
df_train = pd.read_csv(DATA_PATH + "labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
df_test = pd.read_csv(DATA_PATH + "testData.tsv", header=0, delimiter="\t", quoting=3)

df_train.head()
df_test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


## Preprocessing

In [3]:
import re
import json
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
# Regular Expression
# Remove HTML tag
# Lower case and split to each words
# Remove stopwords
# Make splited words to sentences

def preprocessing(text):
    regular_expression1 = "^[a-zA-Z0-9]"

    text = BeautifulSoup(text, 'html5lib').get_text()

    text = re.sub(regular_expression1, " ", text)

    words = text.lower().split()

    stop_words = stopwords.words('english')

    words = [w for w in words if not w in stop_words]

    clean_text = ' '.join(words)

    return clean_text

In [5]:
for i in tqdm(range(0, df_train.shape[0])):
    df_train['review'][i] = preprocessing(df_train['review'][i])

for i in tqdm(range(0, df_test.shape[0])):
    df_test['review'][i] = preprocessing(df_test['review'][i])

df_train.head()
df_test.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['review'][i] = preprocessing(df_train['review'][i])
100%|██████████| 25000/25000 [00:36<00:00, 680.79it/s]
100%|██████████| 25000/25000 [00:29<00:00, 846.92it/s]


Unnamed: 0,id,review
0,"""12311_10""","""naturally film who's main themes mortality, n..."
1,"""8348_2""","""this movie disaster within disaster film. ful..."
2,"""5828_4""","""all all, movie kids. saw tonight child loved ..."
3,"""7186_2""","""afraid dark left impression several different..."
4,"""12128_7""","""a accurate depiction small time mob life film..."


In [6]:
# Integer index

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['review'])
text_sequences = tokenizer.texts_to_sequences(df_train['review'])

print(text_sequences[0])

[354, 454, 78, 479, 10068, 111, 564, 2570, 132, 60, 956, 580, 149, 205, 19492, 205, 11560, 206, 190, 85, 18, 733, 2571, 138, 104, 12, 565, 4224, 190, 26, 240, 645, 2465, 1290, 11560, 79, 5005, 79, 710, 2, 288, 78, 13, 350, 1758, 541, 1237, 3394, 27613, 465, 872, 3474, 20, 496, 672, 1603, 19, 51972, 1953, 1087, 170, 403, 1823, 819, 2543, 4, 10068, 471, 78, 702, 77, 269, 108, 602, 10068, 27614, 27615, 136, 1, 10068, 364, 10, 52, 24, 364, 191, 12, 237, 176, 6, 699, 710, 2, 131, 329, 434, 815, 139, 16102, 3501, 1614, 639, 840, 11289, 1005, 11908, 899, 1329, 1574, 408, 10068, 261, 19, 643, 133, 10068, 23366, 2399, 14348, 840, 30551, 34, 30552, 386, 21, 45, 19493, 1603, 439, 11561, 190, 4107, 27613, 132, 695, 565, 87, 4, 10068, 1524, 436, 2315, 130, 2066, 2674, 639, 20, 75, 119, 5006, 5529, 297, 1359, 27616, 19, 639, 548, 831, 702, 700, 3, 472, 299, 504, 130, 683, 3395, 1248, 758, 53, 1265, 258, 1, 21, 4, 10068, 3, 566, 67, 464, 29, 21, 229, 709, 151, 264, 107, 7779, 672, 3572, 27613, 40808,

In [7]:
# Make vocabulary

vocab = tokenizer.word_index
vocab["<PAD>"] = 0

datas = {}
datas['vocab'] = vocab
datas['vocab_size'] = len(vocab)

print(datas['vocab'])
print(datas['vocab_size'])

": 88081, 'sarat': 88082, "chandra's": 88083, "shekar's": 88084, 'piyu': 88085, 'nris': 88086, 'ita': 88087, "rufus'": 88088, 'blabla': 88089, '230mph': 88090, 'idiosyncracies': 88091, 'assestment': 88092, "vulkin'": 88093, 'shure': 88094, '820': 88095, 'cloyingly': 88096, 'volvo': 88097, 'priuses': 88098, 'haavard': 88099, 'lilleheie': 88100, 'eazy': 88101, 'flava': 88102, "'rap'": 88103, 'rosenstraße': 88104, 'cameron´s': 88105, 'we´ve': 88106, 'girl´s': 88107, 'riemann´s': 88108, 'ritterkreuz': 88109, 'luftens': 88110, 'helte': 88111, 'nutz': 88112, 'beautifulest': 88113, 'principaly': 88114, 'translater': 88115, 'albertine': 88116, 'forme': 88117, 'signification': 88118, 'rousset': 88119, 'charlus': 88120, 'crystallizes': 88121, 'fallowing': 88122, 'pucking': 88123, "sarte's": 88124, 'clot': 88125, 'retrouvé': 88126, 'genette': 88127, 'holobrothel': 88128, 'planetscapes': 88129, 'charterers': 88130, 'tapers': 88131, 'connerey': 88132, 'psp': 88133, "till's": 88134, 'dyad': 88135, '

In [8]:
# Padding for train data 

MAX_SEQUENCE_LENGTH = 180

train_inputs = pad_sequences(text_sequences, maxlen = MAX_SEQUENCE_LENGTH, padding='post')
train_labels = np.array(df_train['sentiment'])

print(train_inputs.shape)
print(type(train_inputs))
print(train_inputs)

print(train_labels.shape)
print(type(train_labels))
print(train_labels)

(25000, 180)
<class 'numpy.ndarray'>
[[  872  3474    20 ... 19495   353  1501]
 [    6   267   234 ...     0     0     0]
 [12983 40811  3626 ...   778  1265  5484]
 ...
 [  138  3316    97 ...     0     0     0]
 [   17  1014   707 ...     0     0     0]
 [   14   122     1 ...     0     0     0]]
(25000,)
<class 'numpy.ndarray'>
[1 1 0 ... 0 0 1]


In [9]:
# Padding for test data 

tokenizer.fit_on_texts(df_test['review'])

text_sequences = tokenizer.texts_to_sequences(df_test['review'])
test_inputs = pad_sequences(text_sequences, maxlen = MAX_SEQUENCE_LENGTH, padding='post')
test_id = np.array(df_test['id'])

print(test_inputs.shape)
print(type(test_inputs))
print(test_inputs)

print(test_id.shape)
print(type(test_id))
print(test_id)

(25000, 180)
<class 'numpy.ndarray'>
[[1856    2  806 ...    0    0    0]
 [  17    1 1504 ...    0    0    0]
 [  72   72    1 ...    0    0    0]
 ...
 [  14  601    1 ...    0    0    0]
 [ 889  536  631 ...    0    0    0]
 [  17   20   91 ...    0    0    0]]
(25000,)
<class 'numpy.ndarray'>
['"12311_10"' '"8348_2"' '"5828_4"' ... '"2531_1"' '"7772_8"' '"11465_10"']


## RNN Model

In [11]:
# Seed 

import tensorflow as tf

SEED_NUM = 1234
tf.random.set_seed(SEED_NUM)

### Define hyperparameter

In [12]:
model_name = 'RNN_classifier_en'
BATCH_SIZE = 128
NUM_EPOCHS = 5
VALID_SPLIT = 0.1
MAX_LEN = train_inputs.shape[1]

kargs = {'model_name': model_name,
        'vocab_size': datas['vocab_size'],
        'embedding_dimension': 100,
        'dropout_rate': 0.2,
        'lstm_dimension': 150,
        'dense_dimension': 150,
        'output_dimension': 1}

### Model implementation