## Data Preprocessing

In [1]:
import pandas as pd
df = pd.read_csv("../Data/lstm.csv")
df.head()

Unnamed: 0,paragraph,category
0,dishplace is located in sunnyvale downtown the...,food
1,service can be slower during busy hours but ou...,food
2,portions are huge both french toast and their ...,food
3,we started with apps going the chicken and waf...,food
4,the biscuits and gravy was too salty two peopl...,food


In [2]:
df.category.unique()

array(['food', 'sports'], dtype=object)

In [3]:
# 데이터에 사용된 중복 없는 전체 단어 갯수 파악 
results = set()
df['paragraph'].str.lower().str.split().apply(results.update)
vocab_size = len(results)
vocab_size

536

In [4]:
# 단어를 숫자로 인코딩 
paragraphs = df['paragraph'].to_list()
paragraphs[0:5]

['dishplace is located in sunnyvale downtown there is parking around the area but it can be difficult to find during peak business hours my sisters and i came to this place for dinner on a weekday they were really busy so i highly recommended making reservations unless you have the patience to wait',
 'service can be slower during busy hours but our waiter was courteous and help gave some great entree recommendations',
 'portions are huge both french toast and their various omelettes are really good their french toast is probably 1.5x more than other brunch places great place to visit if you are hungry and dont want to wait 1 hour for a table',
 'we started with apps going the chicken and waffle slides and chicken nachos the sliders were amazing and the nachos were good too maybe by themselves the nachos would have scored better but after those sliders they were up against some tough competition',
 'the biscuits and gravy was too salty two people in my group had the gravy and all thoug

In [5]:
from tensorflow import keras 
keras.utils.set_random_seed(1) # keras에서 set seed 하기 

In [6]:
# One Hot Encoding (단어로 되어있는 걸 숫자로 바꾸기)
encoded_paragraphs = [keras.preprocessing.text.one_hot(paragraph, vocab_size) for paragraph in paragraphs]
encoded_paragraphs

[[532,
  439,
  377,
  331,
  313,
  496,
  139,
  439,
  146,
  337,
  77,
  1,
  396,
  349,
  290,
  111,
  268,
  380,
  270,
  163,
  349,
  457,
  478,
  45,
  350,
  68,
  493,
  496,
  380,
  191,
  250,
  275,
  376,
  446,
  278,
  228,
  380,
  89,
  382,
  403,
  217,
  493,
  319,
  469,
  64,
  320,
  186,
  526,
  398,
  77,
  71,
  380,
  383],
 [20,
  290,
  111,
  534,
  163,
  403,
  478,
  396,
  300,
  254,
  311,
  320,
  68,
  257,
  455,
  479,
  448,
  129,
  83],
 [212,
  343,
  274,
  113,
  124,
  469,
  68,
  254,
  454,
  22,
  343,
  382,
  411,
  254,
  124,
  469,
  439,
  75,
  160,
  421,
  169,
  468,
  52,
  218,
  235,
  448,
  250,
  380,
  270,
  132,
  526,
  343,
  166,
  68,
  208,
  136,
  380,
  383,
  160,
  335,
  275,
  278,
  314],
 [530,
  196,
  35,
  223,
  72,
  77,
  67,
  68,
  160,
  465,
  68,
  67,
  186,
  77,
  173,
  89,
  78,
  68,
  77,
  186,
  89,
  411,
  283,
  22,
  509,
  389,
  77,
  186,
  164,
  398,
  472,
  458,


In [7]:
# 데이터에서 가장 긴 문장의 단어 갯수 확인 (최대값 찾는 로직)
max_length = 0 
for row in df['paragraph'] :
    if len(row.split(" "))  > max_length :
        max_length = len(row.split(" "))
print(max_length)

91


In [8]:
# zero padding(sequence padding) 넣어서 문장별 길이 맞추기 
padded_paragraphs_encoding = keras.preprocessing.sequence.pad_sequences(encoded_paragraphs, maxlen = max_length, padding = 'post')
# - padding :'post' - 0값 뒤로 빼는 것 

padded_paragraphs_encoding

array([[532, 439, 377, ...,   0,   0,   0],
       [ 20, 290, 111, ...,   0,   0,   0],
       [212, 343, 274, ...,   0,   0,   0],
       ...,
       [355, 287, 275, ...,   0,   0,   0],
       [396,  77,  29, ...,   0,   0,   0],
       [311, 496,  77, ...,   0,   0,   0]], dtype=int32)

In [9]:
# 분류항목(food, sports)를 수치로 변경
categories = df['category'].to_list()

def category_encode(category) :
    if category == "food":
        return [1,0]
    else : 
        return [0,1]

In [10]:
encoded_category = [category_encode(category) for category in categories]
encoded_category

[[1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1]]

In [11]:
# Feature 확인 
print(encoded_paragraphs[0])

[532, 439, 377, 331, 313, 496, 139, 439, 146, 337, 77, 1, 396, 349, 290, 111, 268, 380, 270, 163, 349, 457, 478, 45, 350, 68, 493, 496, 380, 191, 250, 275, 376, 446, 278, 228, 380, 89, 382, 403, 217, 493, 319, 469, 64, 320, 186, 526, 398, 77, 71, 380, 383]


In [12]:
len(encoded_paragraphs[19])

73

--- 
## RNN Model

In [13]:
model = keras.Sequential()
# 문맥 생성 단계 

model.add(keras.layers.Embedding(vocab_size, 5, input_length = max_length))
# Embedding : One Hot Encoding 하기 (지금 다 숫자로 되어있으니까)
# vocab_size : 전체 데이터 피쳐 갯수 
# 5 : embedding layer(인덱스를 받아 5차원 벡터의 임베딩을 출력). Hyper Parameter. 
# ---> 단어 하나를 5개 차원으로 나눠서 임베딩 해보겠다는 것 (근데 그냥 5개 품사 정도로 크게 나눠서 (5형식?) 그걸로 한거임. 컴퓨터는 뭔 의민지 당연히 모르고)
# max_length : 한 문장당 단어 최대 갯수 (아까 맞춘거.)

# LSTM 
model.add(keras.layers.LSTM(64)) 

# 분류단계 
model.add(keras.layers.Dense(32, activation = 'relu')) # LSTM 반값 쓰기 
model.add(keras.layers.Dense(2, activation = 'softmax')) # 문장의 확률값을 구하고자 함 

In [14]:
# 컴파일 
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = 'accuracy')

In [15]:
# 데이터셋 리스트 -> 넘파이 배열 변환 (길이 맞췄으니까 이제 넘파이 사용 간ㅇ)
import numpy as np 
train_X = np.array(padded_paragraphs_encoding)
train_Y = np.array(encoded_category)

In [16]:
# 학습하기 
model.fit(train_X, train_Y, batch_size=10, epochs=50)

Epoch 1/50


2022-08-09 16:04:33.039080: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x177d08be0>