In [1]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import os
os.environ['KERAS_BACKEND']='theano'
from keras.preprocessing.text import Tokenizer,text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model
from keras.callbacks import ModelCheckpoint,Callback
import matplotlib.pyplot as plt
plt.switch_backend('agg')
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers
import nltk
from nltk import tokenize
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix,accuracy_score
# from sklearn import metrics 
# %matplotlib inline

def Get_Accuracy(y_true, y_pred):
    """Return the accuracy of ``y_pred`` against ``y_true``.

    Accuracy is the number of correctly classified samples divided by the
    total number of samples (scikit-learn's ``accuracy_score`` with its
    default normalisation).
    """
    return accuracy_score(y_true, y_pred)

def Get_Precision_score(y_true, y_pred):
    """Return the precision of ``y_pred`` against ``y_true``.

    Precision is TP / (TP + FP): the share of samples predicted positive
    that really are positive. Per-label scores are combined with
    ``average='weighted'``.
    """
    return precision_score(y_true, y_pred, average='weighted')

def Get_Recall(y_true, y_pred):
    """Return the recall of ``y_pred`` against ``y_true``.

    Recall is TP / (TP + FN): the share of truly positive samples that were
    predicted positive. Per-label scores are combined with
    ``average='weighted'``.
    """
    return recall_score(y_true, y_pred, average='weighted')
 
def Get_f1_score(y_true, y_pred):
    """Return the F1 score of ``y_pred`` against ``y_true``.

    F1 is the harmonic mean of precision and recall. Per-label scores are
    combined with ``average='weighted'``.
    """
    return f1_score(y_true, y_pred, average='weighted')

class HierarchicalAttentionNetwork(Layer):
    """Attention pooling over the time axis of a 3-D tensor.

    For an input ``x`` of shape ``(batch, steps, features)`` the layer computes

        u_t = tanh(x_t . W + b)
        a_t = exp(u_t . u) / sum_s exp(u_s . u)
        out = sum_t a_t * x_t

    and returns a tensor of shape ``(batch, features)``.

    Args:
        attention_dim: Size of the hidden projection used to score each step.
        **kwargs: Forwarded to the base ``Layer`` (e.g. ``name``); needed so
            the layer can be re-created from its config.
    """

    def __init__(self, attention_dim, **kwargs):
        # Call the base initialiser first: in Keras 2.x it assigns its own
        # defaults (including ``supports_masking``), so attributes set before
        # this call would be overwritten.
        super(HierarchicalAttentionNetwork, self).__init__(**kwargs)
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create W, b and u for an input of shape (batch, steps, features)."""
        assert len(input_shape) == 3
        # ``add_weight`` registers each variable with the layer exactly once
        # as a trainable weight. Creating them with ``K.variable`` and then
        # assigning ``trainable_weights`` by hand bypasses that bookkeeping.
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], self.attention_dim),
                                 initializer=self.init,
                                 trainable=True)
        self.b = self.add_weight(name='b',
                                 shape=(self.attention_dim,),
                                 initializer=self.init,
                                 trainable=True)
        self.u = self.add_weight(name='u',
                                 shape=(self.attention_dim, 1),
                                 initializer=self.init,
                                 trainable=True)
        super(HierarchicalAttentionNetwork, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # The time axis is summed away in ``call``, so there is no
        # per-timestep mask left to hand to downstream layers.
        return None

    def call(self, x, mask=None):
        # x   : (batch, steps, features)
        # W   : (features, attention_dim), b: (attention_dim,)
        # u   : (attention_dim, 1)
        # uit : (batch, steps, attention_dim) = tanh(xW + b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))

        # Unnormalised attention score per step: (batch, steps)
        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting
            ait *= K.cast(mask, K.floatx())
        # Normalise over the step axis; epsilon guards against an all-masked row.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_input = x * K.expand_dims(ait)
        # Attention-weighted sum over steps: (batch, features)
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so a saved model can be rebuilt."""
        config = super(HierarchicalAttentionNetwork, self).get_config()
        config['attention_dim'] = self.attention_dim
        return config

class Metrics(Callback):
    """Record weighted F1 / precision / recall on validation data after each epoch.

    Results are appended to ``val_f1s``, ``val_precisions`` and
    ``val_recalls`` (one entry per epoch) and printed.

    Args:
        validation_data: Optional ``(x_val, y_val)`` pair. If omitted, the
            callback falls back to a ``validation_data`` attribute set on the
            instance (older Keras releases assign it during ``fit``; newer
            ones appear not to, so pass it here or set the attribute).
    """

    def __init__(self, validation_data=None):
        super(Metrics, self).__init__()
        self.predict = []
        self.target = []
        if validation_data is not None:
            self.validation_data = validation_data

    def on_train_begin(self, logs=None):
        # Reset the per-epoch histories at the start of every fit() call.
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        val_data = getattr(self, 'validation_data', None)
        if val_data is None:
            raise ValueError(
                'Metrics callback has no validation data: pass '
                'validation_data=(x_val, y_val) to Metrics() or set '
                'metrics.validation_data before calling fit().')

        val_scores = np.asarray(self.model.predict(val_data[0]))
        val_targ = np.asarray(val_data[1])

        if val_scores.ndim > 1 and val_scores.shape[-1] > 1:
            # Multi-column (softmax) output: take the most probable class.
            # Rounding each column independently can yield rows with no
            # predicted class (all probabilities < 0.5) or with ties.
            val_predict = val_scores.argmax(axis=-1)
            if val_targ.ndim > 1 and val_targ.shape[-1] > 1:
                # One-hot targets -> class ids, to match the predictions.
                val_targ = val_targ.argmax(axis=-1)
        else:
            # Single-column output: threshold by rounding, as before.
            val_predict = val_scores.round()

        _val_f1 = f1_score(val_targ, val_predict, average='weighted')
        _val_recall = recall_score(val_targ, val_predict, average='weighted')
        _val_precision = precision_score(val_targ, val_predict, average='weighted')
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print('- val_f1: %.4f - val_precision: %.4f - val_recall: %.4f'%(_val_f1, _val_precision, _val_recall))

def clean_str(string):
    """Strip backslashes and quote characters, then trim and lowercase.

    Every backslash, single quote and double quote is deleted from
    ``string``; the result has surrounding whitespace removed and is
    converted to lower case.
    """
    # One translation table deletes all three characters in a single pass.
    unwanted = str.maketrans("", "", "\\'\"")
    stripped = string.translate(unwanted).strip()
    return stripped.lower()

def split_sentence(sentence):
    """Split ``sentence`` on every "." and "," and return the pieces in order.

    The text is first cut at periods and each part is then cut at commas.
    Pieces are returned exactly as found: whitespace is not trimmed and
    empty pieces (e.g. after a trailing period) are kept.
    """
    return [
        fragment
        for clause in sentence.split(".")
        for fragment in clause.split(",")
    ]

# --- Hyper-parameters -------------------------------------------------------
MAX_SENT_LENGTH = 500    # max number of word ids kept per sentence fragment
MAX_SENTS = 50           # max number of sentence fragments kept per document
MAX_NB_WORDS = 20000     # vocabulary cap handed to the Keras Tokenizer
EMBEDDING_DIM = 300      # width of the word-embedding vectors
VALIDATION_SPLIT = 0.2   # not referenced by the visible code
DROP_OUT_LAYER = 0.2     # not referenced by the visible code


def _read_split(path):
    """Read one Excel split, drop rows with missing values, renumber rows."""
    return pd.read_excel(path).dropna().reset_index(drop=True)


# --- Reading data -----------------------------------------------------------
# The three splits are stacked in train / valid / test order. The row counts
# of the first two are remembered so the combined arrays can later be cut
# back into the same splits by position.
df1 = _read_split('train.xls')
SPLIT_LINE = df1.Deal_editorial.shape[0]   # number of training rows
df2 = _read_split('valid.xls')
TEST_LINE = df2.Deal_editorial.shape[0]    # number of validation rows
df3 = _read_split('test.xls')

# pd.concat replaces the chained DataFrame.append calls (append was removed
# in pandas 2.0). ignore_index gives the combined frame a clean 0..N-1 index
# instead of three overlapping ones.
df = pd.concat([df1, df2, df3], ignore_index=True)

# Map every distinct Deal_status value to an integer id (sorted order).
macronum = sorted(set(df['Deal_status']))
macro_to_id = dict((note, number) for number, note in enumerate(macronum))


def fun(i):
    """Return the integer class id assigned to the Deal_status value ``i``."""
    return macro_to_id[i]


df['Deal_status'] = df['Deal_status'].apply(fun)

reviews = []   # per document: list of sentence fragments (see split_sentence)
labels = []    # per document: integer class id
texts = []     # per document: normalised text used to fit the tokenizer

# Iterate over the column directly; rebuilding list(df[...]) on every
# iteration made this loop quadratic in the number of documents.
for raw_text in df['Deal_editorial']:
    # Normalise once and use the SAME string for the tokenizer vocabulary and
    # for sentence splitting. Previously ";" was deleted for the tokenizer but
    # turned into "," for the sentences, so "a;b" became the token "ab" in the
    # vocabulary while the sentences contained "a" and "b", which could then
    # be missing from word_index.
    cleaned = raw_text.replace("\n", "").replace(";", ",")
    texts.append(cleaned)
    reviews.append(split_sentence(cleaned))

labels = list(df['Deal_status'])

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# data[i, j, k] = id of the k-th kept word of the j-th fragment of document i;
# unused positions stay 0.
data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences[:MAX_SENTS]):
        k = 0
        for word in text_to_word_sequence(sent):
            if k >= MAX_SENT_LENGTH:
                break
            # .get() guards against any word absent from the vocabulary
            # instead of aborting the whole run with a KeyError.
            token_id = word_index.get(word)
            if token_id is not None and token_id < MAX_NB_WORDS:
                data[i, j, k] = token_id
                k += 1

print('No. of %s unique tokens.' % len(word_index))

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Rows are deliberately NOT shuffled: the splits below rely on the rows still
# being in train / valid / test order. (The former data[indices] with an
# unshuffled arange only produced a full copy of the tensor.)
assert SPLIT_LINE + TEST_LINE <= data.shape[0], 'split offsets exceed dataset size'

x_train = data[:SPLIT_LINE]
y_train = labels[:SPLIT_LINE]
x_val = data[SPLIT_LINE:SPLIT_LINE+TEST_LINE]
y_val = labels[SPLIT_LINE:SPLIT_LINE+TEST_LINE]
x_test = data[SPLIT_LINE+TEST_LINE:]
y_test = labels[SPLIT_LINE+TEST_LINE:]

# Load the pre-trained word vectors: one "<word> <v1> ... <vD>" entry per line.
embeddings_index = {}
skipped_lines = 0
# ``with`` closes the file even if parsing raises part-way through.
with open('../GloVe/vectors.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        # Skip blank lines and entries whose vector width is not
        # EMBEDDING_DIM. A wrong-sized vector would either crash the
        # assignment below or, for width 1, silently broadcast across the row.
        if len(values) != EMBEDDING_DIM + 1:
            skipped_lines += 1
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

if skipped_lines:
    print('Skipped %s malformed lines in the embedding file.' % skipped_lines)
if not embeddings_index:
    raise ValueError('No %d-dimensional vectors found in ../GloVe/vectors.txt' % EMBEDDING_DIM)

print('Total %s word vectors in Glove 6B 300d.' % len(embeddings_index))

# Row i holds the vector for the word with tokenizer id i; row 0 is the
# padding id. Rows start as uniform random values in [0, 1).
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index keep their random
        # initialisation (they are NOT zeroed).
        embedding_matrix[i] = embedding_vector

# --- Word level: encode one sentence fragment -------------------------------
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)

sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
lstm_word = Bidirectional(GRU(50, return_sequences=True))(embedded_sequences)
attn_word = HierarchicalAttentionNetwork(50)(lstm_word)
sentenceEncoder = Model(sentence_input, attn_word)

# --- Sentence level: encode a document as a sequence of fragments -----------
# The data tensor is (documents, MAX_SENTS, MAX_SENT_LENGTH), and
# TimeDistributed feeds each step to sentenceEncoder, which expects
# (MAX_SENT_LENGTH,). The input must therefore be (MAX_SENTS, MAX_SENT_LENGTH);
# the transposed shape only worked when both constants were equal.
review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentenceEncoder)(review_input)
lstm_sentence = Bidirectional(GRU(50, return_sequences=True))(review_encoder)
attn_sentence = HierarchicalAttentionNetwork(50)(lstm_sentence)
preds = Dense(len(macronum), activation='softmax')(attn_sentence)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['acc'])

print("Hierachical LSTM")
model.summary()

# --- Training ---------------------------------------------------------------
# Keep the weights of the epoch with the best validation accuracy.
cp = ModelCheckpoint('model_han_.hdf5', monitor='val_acc', verbose=1, save_best_only=True)
metrics = Metrics()
# Give the callback its validation data explicitly. Older Keras releases
# assign this attribute during fit(); newer ones appear not to, which would
# make the callback fail at the end of the first epoch.
metrics.validation_data = (x_val, y_val)
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                    epochs=3, batch_size=8, callbacks=[cp, metrics])

print(history)
print("f1_score-->>", metrics.val_f1s)
print("precision---->>", metrics.val_precisions)
print("recalls----->>", metrics.val_recalls)

# --- Evaluation on the held-out test split ----------------------------------
y_predict1 = model.predict(x_test)   # class probabilities, one row per sample

# Pick the most probable class per row. Thresholding each column at 0.5 can
# leave a row with no predicted class when no probability exceeds 0.5.
y_predict = np.argmax(y_predict1, axis=1)
y_true = np.argmax(y_test, axis=1)   # one-hot targets -> class ids

accuracy = Get_Accuracy(y_true, y_predict)
print("HAN Accuracy_Score = %f"%accuracy)
precision = Get_Precision_score(y_true, y_predict)
print("HAN Precision = %f"%precision)
recall = Get_Recall(y_true, y_predict)
print("HAN Recall = %f"%recall)
f1_score1 = Get_f1_score(y_true, y_predict)
print("HAN F1-Score  = %f"%f1_score1)

Using Theano backend.


No. of 80404 unique tokens.
Shape of data tensor: (12688, 500, 500)
Shape of label tensor: (12688, 2)
Total 71291 word vectors in Glove 6B 300d.
Hierachical LSTM
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 500, 500)          0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 500, 100)          24231900  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 500, 100)          45300     
_________________________________________________________________
hierarchical_attention_netwo (None, 100)               5100      
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
Total params: 24,292,702
Trainable params: 24,282,502
Non-trainable params: 10,200
____________

KeyboardInterrupt: 

In [2]:
# Training vs. validation accuracy per epoch, drawn on an explicit Axes and
# written to accuracy_han.png. Requires ``history`` from a completed fit().
fig2, ax = plt.subplots()
for series, colour in (('acc', 'r'), ('val_acc', 'b')):
    ax.plot(history.history[series], colour, linewidth=3.0)
ax.legend(['Training Accuracy', 'Validation Accuracy'], fontsize=18)
ax.set_xlabel('Epochs ', fontsize=16)
ax.set_ylabel('Accuracy', fontsize=16)
ax.set_title('Accuracy Curves : HAN', fontsize=16)
fig2.savefig('accuracy_han.png')
plt.show()

NameError: name 'history' is not defined

<Figure size 432x288 with 0 Axes>