In [3]:
import pandas as pd

# Compute the mean character length of the `text` column; this is the basis
# for picking a padding length for the model input later on.
data = pd.read_csv('sentiment_data_v1.csv')

# NOTE(review): every value goes through str(), so a missing value (NaN) is
# counted as the 3-character string "nan" rather than being skipped.
temp = list(map(lambda ele: len(str(ele)), data.text.tolist()))

if len(temp) == 0:
    # Empty file: report 0 instead of dividing by zero.
    res = 0
else:
    res = float(sum(temp)) / len(temp)
print(res)

93.2101


In [4]:
from keras import Input, Model
from keras.layers import Embedding, Dense, Dropout, LSTM


class TextRNN(object):
    """Builder for a single-layer LSTM text classifier.

    The constructor only stores hyperparameters; the Keras graph is created
    each time ``get_model`` is called.
    """

    def __init__(self, maxlen, max_features, embedding_dims,
                 class_num=2,
                 last_activation='sigmoid'):
        """Store the hyperparameters used by ``get_model``.

        Args:
            maxlen: Length of each input sequence (size of the input axis).
            max_features: First argument passed to the ``Embedding`` layer
                (its input dimension).
            embedding_dims: Second argument passed to the ``Embedding`` layer
                (its output dimension).
            class_num: Number of units in the final ``Dense`` layer.
            last_activation: Activation of the final ``Dense`` layer.
        """
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation

    def get_model(self):
        """Build and return an (uncompiled) Keras ``Model``.

        Architecture: Input(maxlen) -> Embedding -> LSTM(128) -> Dense.
        """
        # Create the layers first, then wire them together below.
        embedding_layer = Embedding(self.max_features, self.embedding_dims,
                                    input_length=self.maxlen)
        recurrent_layer = LSTM(128)  # LSTM or GRU
        classifier_layer = Dense(self.class_num, activation=self.last_activation)

        tokens_in = Input((self.maxlen,))
        scores_out = classifier_layer(recurrent_layer(embedding_layer(tokens_in)))

        return Model(inputs=tokens_in, outputs=scores_out)

In [5]:
import pandas as pd
import numpy as np
from keras.callbacks import EarlyStopping
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# --- Hyperparameters --------------------------------------------------------
max_features = 5000   # vocabulary cap shared by the Tokenizer and the Embedding layer
maxlen = 128          # every sequence is padded/truncated to this many tokens
batch_size = 64
embedding_dims = 50
epochs = 100          # NOTE(review): not used by the training call, which passes epochs=10
random_seed = 42      # fixes the train/test split so reruns are comparable

# --- Load data --------------------------------------------------------------
# Rows with any missing value are dropped before splitting.
data = pd.read_csv('sentiment_data_v1.csv').dropna().reset_index(drop=True)
data["text"] = data["text"].astype('string')

# --- Split ------------------------------------------------------------------
x_train, x_test, y_train, y_test = train_test_split(
    data["text"], data["polarity"],
    test_size=0.15,
    random_state=random_seed)

# --- Tokenize ---------------------------------------------------------------
# The vocabulary is learned from the training texts only, so the held-out
# split does not influence which words receive an index. num_words reuses
# max_features so it cannot drift from the Embedding input dimension.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)

x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=maxlen)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=maxlen)

# --- Encode labels ----------------------------------------------------------
# Fit the encoder once on the training labels and only *transform* the test
# labels; re-fitting on the test labels could produce a different column
# layout if the two splits do not contain the same set of classes.
# .toarray() converts the encoder's sparse output into a dense array.
encoder = OneHotEncoder()
y_train = encoder.fit_transform(np.asarray(y_train).reshape(-1, 1)).toarray()
y_test = encoder.transform(np.asarray(y_test).reshape(-1, 1)).toarray()

# --- Build model ------------------------------------------------------------
model = TextRNN(maxlen, max_features, embedding_dims).get_model()
model.summary()




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 128, 50)           250000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 341,906
Trainable params: 341,906
Non-trainable params: 0
_________________________________________________________________


In [6]:
# The metric is requested under the name 'acc' so that the logged keys are
# 'acc' / 'val_acc'. With metrics=['accuracy'] newer Keras releases log
# 'val_accuracy' instead, and EarlyStopping(monitor='val_acc') would then
# find nothing to monitor and never stop training early.
model.compile('adam', 'binary_crossentropy', metrics=['acc'])

# Stop once validation accuracy has not improved for 3 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')

# NOTE(review): the held-out test split doubles as the validation set that
# drives early stopping, so its score is not an unbiased final estimate.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=10,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 8493 samples, validate on 1499 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


<keras.callbacks.History at 0x119e064a508>

In [7]:
import pickle

# Persist the fitted tokenizer so inference can reuse the same word index.
with open('sentiment_tokenizer_v1.pickle', mode='wb') as tokenizer_file:
    tokenizer_file.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))

In [8]:
# Write the trained model to disk; the inference cell loads it back by this name.
model.save(filepath="sentiment_model_v1.h5")

In [28]:
# Take one sentence from the rows whose polarity is 1 to use as a prediction sample.
# The index is re-numbered from 0 after filtering, so 1070 is a position within
# that subset (it raises if the subset has fewer rows than that).
polarity_one_texts = data.loc[data['polarity'] == 1, 'text'].reset_index(drop=True)
input_sentences = [polarity_one_texts[1070]]
print(input_sentences)

[' 그는 “ 문 정부의 코로나 대응을 폭주라고 이야기하면 어떻게 하자는 것이냐 ” 며 “ 야당이 생각하는 코로나19 대처는 지금보다 더 물렁물렁하게 , 느슨하게 하겠다는 이야기 아니냐 ” 고 비판했다 ']


In [29]:
from keras.preprocessing import sequence
from keras.models import load_model
import pickle 

# Reload the saved artifacts from disk and score `input_sentences`.
# NOTE(review): unpickling can execute arbitrary code; only load tokenizer
# files produced by this notebook or another trusted source.
with open('sentiment_tokenizer_v1.pickle', 'rb') as handle:
    tokenizer = pickle.loads(handle.read())

model = load_model('sentiment_model_v1.h5')

# Text -> integer ids -> fixed-length sequences.
# NOTE(review): 128 is hard-coded here and must match the length the model was trained with.
token_ids = tokenizer.texts_to_sequences(input_sentences)
padded_ids = sequence.pad_sequences(token_ids, maxlen=128)

predict = model.predict(padded_ids)
# Print each sentence with the index of its highest score and the raw scores.
for sentence, pred in zip(input_sentences, predict):
    print(sentence, pred.argmax(), pred)

 그는 “ 문 정부의 코로나 대응을 폭주라고 이야기하면 어떻게 하자는 것이냐 ” 며 “ 야당이 생각하는 코로나19 대처는 지금보다 더 물렁물렁하게 , 느슨하게 하겠다는 이야기 아니냐 ” 고 비판했다  1 [9.3366508e-04 9.9906987e-01]
