In [21]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import os
os.environ['KERAS_BACKEND']='theano' # Why theano why not
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint,Callback
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix,accuracy_score
# from sklearn import metrics 
import matplotlib.pyplot as plt
plt.switch_backend('agg')
#%matplotlib inline

def Get_Accuracy(y_true, y_pred):
    """Return the accuracy of ``y_pred`` against ``y_true``.

    Accuracy is the number of correctly classified samples divided by the
    total number of samples. This is a thin wrapper around
    ``accuracy_score`` called with its default arguments.
    """
    return accuracy_score(y_true, y_pred)

def Get_Precision_score(y_true, y_pred):
    """Return the weighted-average precision of ``y_pred`` against ``y_true``.

    Precision is the share of samples predicted positive (TP + FP) that are
    truly positive (TP). ``average='weighted'`` is passed to
    ``precision_score`` to combine the per-class values.
    """
    return precision_score(y_true, y_pred, average='weighted')

def Get_Recall(y_true, y_pred):
    """Return the weighted-average recall of ``y_pred`` against ``y_true``.

    Recall is the share of truly positive samples (TP + FN) that were
    predicted positive (TP). ``average='weighted'`` is passed to
    ``recall_score`` to combine the per-class values.
    """
    return recall_score(y_true, y_pred, average='weighted')
 
def Get_f1_score(y_true, y_pred):
    """Return the weighted-average F1 score of ``y_pred`` against ``y_true``.

    F1 is the harmonic mean of precision and recall. ``average='weighted'``
    is passed to ``f1_score`` to combine the per-class values.
    """
    return f1_score(y_true, y_pred, average='weighted')

class Metrics(Callback):
    """Keras callback that scores the validation set after every epoch.

    At the end of each epoch the model predicts on the validation inputs,
    the predictions are rounded to 0/1 per output unit, and the weighted
    F1, recall and precision are computed, stored and printed.

    Parameters
    ----------
    validation_data : sequence, optional
        ``(x_val, y_val)`` to evaluate on. When omitted, the callback falls
        back to the ``validation_data`` attribute that some Keras versions
        set on callbacks during ``fit``; pass it explicitly if your Keras
        version does not do so.

    Attributes
    ----------
    val_f1s, val_recalls, val_precisions : list of float
        One entry per finished epoch; reset at the start of each ``fit``.
    predict, target : list
        Rounded predictions and targets of every epoch, accumulated over
        the lifetime of the callback instance.
    """

    def __init__(self, validation_data=None):
        # Let the base class create its own attributes first.
        super().__init__()
        self._explicit_validation_data = validation_data
        self.predict = []
        self.target = []

    def on_train_begin(self, logs=None):
        """Reset the per-epoch score histories."""
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def _resolve_validation_data(self):
        """Return the validation data to score on, or raise if none is available."""
        val_data = self._explicit_validation_data
        if val_data is None:
            val_data = getattr(self, 'validation_data', None)
        if val_data is None:
            raise ValueError(
                "Metrics callback has no validation data: pass "
                "validation_data=(x_val, y_val) to Metrics(...)."
            )
        return val_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute, store and print weighted F1 / precision / recall."""
        val_data = self._resolve_validation_data()
        # Round each output unit to 0/1 so the result can be compared with the targets.
        val_predict = np.asarray(self.model.predict(val_data[0])).round()
        val_targ = val_data[1]

        _val_f1 = f1_score(val_targ, val_predict, average='weighted')
        _val_recall = recall_score(val_targ, val_predict, average='weighted')
        _val_precision = precision_score(val_targ, val_predict, average='weighted')

        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        self.predict.append(val_predict)
        self.target.append(val_targ)
        print('- val_f1: %.4f - val_precision: %.4f - val_recall: %.4f'%(_val_f1, _val_precision, _val_recall))

def clean_str(string):
    """Normalise a text fragment.

    Removes every backslash, single quote and double quote, trims leading
    and trailing whitespace, and lower-cases the result.
    """
    without_marks = re.sub(r"[\\'\"]", "", string)
    return without_marks.strip().lower()

# --- Hyper-parameters ---------------------------------------------------------
MAX_SEQUENCE_LENGTH = 1024  # every document is padded/truncated to this many tokens
MAX_NB_WORDS = 20000        # passed to the tokenizer as num_words
EMBEDDING_DIM = 300         # width of one word vector
VALIDATION_SPLIT = 0.2      # only referenced by commented-out code in the visible notebook
DROP_OUT_LAYER = 0.2        # rate used by every Dropout layer


# --- Reading data ---------------------------------------------------------------
def _read_split(path):
    """Load one Excel sheet, drop rows with any missing value, renumber rows from 0."""
    return pd.read_excel(path).dropna().reset_index(drop=True)


df1 = _read_split('train.xls')
SPLIT_LINE = df1.Deal_editorial.shape[0]  # number of training rows
df2 = _read_split('valid.xls')
TEST_LINE = df2.Deal_editorial.shape[0]   # despite its name: number of validation rows
df3 = _read_split('test.xls')

# Stack train / valid / test in that order; later code slices the combined
# data by position using SPLIT_LINE and TEST_LINE, so the order matters.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df1, df2, df3], ignore_index=True)

# Map every distinct status value to a stable integer id (sorted order).
macronum = sorted(set(df['Deal_status']))
macro_to_id = {note: number for number, note in enumerate(macronum)}

def fun(i):
    """Look up the integer class id of status value ``i`` in ``macro_to_id``.

    Raises ``KeyError`` when ``i`` is not a known status value.
    """
    class_id = macro_to_id[i]
    return class_id

# Replace every status value by its integer class id.
df['Deal_status'] = df['Deal_status'].apply(fun)

# Strip newlines from every document. The column is iterated once; the
# previous loop rebuilt a full Python list from the column on every
# iteration, which made this step quadratic in the number of rows.
texts = [document.replace("\n", "") for document in df['Deal_editorial']]

# Alternative preprocessing (HTML stripping + clean_str), currently disabled:
# for idx in range(df.Deal_editorial.shape[0]):
#     text = BeautifulSoup(df.Deal_editorial[idx])
#     texts.append(clean_str(str(text.get_text().encode())))

# Integer class ids, in the same row order as `texts`.
labels = list(df['Deal_status'])

# --- Text -> integer sequences ---------------------------------------------------
# NOTE(review): the tokenizer is fitted on train + valid + test texts together;
# confirm that sharing the vocabulary across the splits is intended.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Number of Unique Tokens',len(word_index))

# --- Fixed-size model inputs and one-hot targets ----------------------------------
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
print('Shape of Data Tensor:', data.shape)
print('Shape of Label Tensor:', labels.shape)

# Identity ordering: shuffling is deliberately disabled because the split
# below relies on rows still being in train / valid / test order.
indices = np.arange(len(data))
data, labels = data[indices], labels[indices]

# --- Positional split: [0, SPLIT_LINE) train, next TEST_LINE rows valid, rest test
split_points = [SPLIT_LINE, SPLIT_LINE + TEST_LINE]
x_train, x_val, x_test = np.split(data, split_points)
y_train, y_val, y_test = np.split(labels, split_points)

# --- Load pre-trained word vectors: one "<word> <v1> <v2> ..." entry per line -----
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open('../GloVe/vectors.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to parse (previously raised IndexError).
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

print('Total %s word vectors in Glove 6B 300d.' % len(embeddings_index))

# --- Build the initial weights of the Embedding layer -----------------------------
# One row per tokenizer index plus one extra row (index 0 is not assigned by
# word_index — presumably reserved for padding; TODO confirm).
# Rows start as uniform random values in [0, 1); rows of words found in the
# embedding file are overwritten with the pre-trained vector. Words NOT found
# therefore keep their random initial values (they are not all-zeros).
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# --- Model definition: Embedding -> 3 x (Conv1D + MaxPooling1D + Dropout) -> Dense --
KERNEL_SIZE = 5  # width of every Conv1D kernel
POOL_SIZE = 5    # window (and stride) of the first two pooling layers

# Length of the time axis after each stage. A 'valid' convolution shortens the
# sequence by KERNEL_SIZE - 1 and a pooling layer divides it by POOL_SIZE
# (rounding down). The last pooling layer must span the WHOLE remaining
# sequence to act as global max pooling; the previously hard-coded window of
# 35 only matches MAX_SEQUENCE_LENGTH == 1000, and with 1024 (36 remaining
# steps) it silently ignored the final step.
_len_after_block1 = (MAX_SEQUENCE_LENGTH - KERNEL_SIZE + 1) // POOL_SIZE
_len_after_block2 = (_len_after_block1 - KERNEL_SIZE + 1) // POOL_SIZE
FINAL_POOL_SIZE = _len_after_block2 - KERNEL_SIZE + 1

# Embedding initialised from the pre-trained matrix and fine-tuned during training.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, KERNEL_SIZE, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(POOL_SIZE)(l_cov1)
l_pool1 = Dropout(DROP_OUT_LAYER)(l_pool1)
l_cov2 = Conv1D(128, KERNEL_SIZE, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(POOL_SIZE)(l_cov2)
l_pool2 = Dropout(DROP_OUT_LAYER)(l_pool2)
l_cov3 = Conv1D(128, KERNEL_SIZE, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(FINAL_POOL_SIZE)(l_cov3)  # global max pooling
l_pool3 = Dropout(DROP_OUT_LAYER)(l_pool3)
l_flat = Flatten()(l_pool3)
l_flat = Dropout(DROP_OUT_LAYER)(l_flat)
l_dense = Dense(128, activation='relu')(l_flat)
# One softmax unit per distinct class label.
preds = Dense(len(macronum), activation='softmax')(l_dense)

model = Model(sequence_input, preds)

model.compile(loss='categorical_crossentropy',
#                   optimizer='rmsprop',
                  optimizer='adamax',
                  metrics=['acc'])

# --- Training -----------------------------------------------------------------------
print("Simplified convolutional neural network")

model.summary()
# Write the model to model_cnn.hdf5 only when val_acc improves.
cp=ModelCheckpoint('model_cnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)
# Per-epoch weighted F1 / precision / recall on the validation data.
metrics = Metrics()
history=model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=5, batch_size=8,callbacks=[cp,metrics])

# Print the recorded per-epoch values; printing the History object itself
# only shows its repr, not the training curves.
print(history.history)
#print("confusion--->>",metrics.confusion_matrixs)
print("f1_score-->>",metrics.val_f1s)
print("precision---->>",metrics.val_precisions)
print("recalls----->>",metrics.val_recalls)

# Rounded validation predictions and targets collected by the callback, one entry per epoch.
print(metrics.predict)
print(metrics.target)

# --- Evaluation on the held-out test split ---------------------------------------------
# Per-class probabilities from the softmax output, one row per test sample.
y_predict1 = model.predict(x_test)

# Pick the most probable class of every row and re-encode it as a boolean
# one-hot matrix. A fixed 0.5 threshold can select no class at all (or, on an
# exact tie, not exactly one) when no probability exceeds 0.5; argmax always
# yields exactly one predicted class per sample.
predicted_class = np.argmax(y_predict1, axis=1)
y_predict = np.eye(y_predict1.shape[1], dtype=bool)[predicted_class]

accuracy = Get_Accuracy(y_test,y_predict)
print("CNN Accuracy_Score = %f"%accuracy) 
precision = Get_Precision_score(y_test,y_predict)
print("CNN Precision = %f"%precision)
recall = Get_Recall(y_test,y_predict)
print("CNN Recall = %f"%recall) 
f1_score1 = Get_f1_score(y_test,y_predict)
print("CNN F1-Score  = %f"%f1_score1)

Number of Unique Tokens 80403
Shape of Data Tensor: (12688, 1024)
Shape of Label Tensor: (12688, 2)
Total 71291 word vectors in Glove 6B 300d.
Simplified convolutional neural network
Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 1024)              0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 1024, 300)         24121200  
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 1020, 128)         192128    
_________________________________________________________________
max_pooling1d_22 (MaxPooling (None, 204, 128)          0         
_________________________________________________________________
dropout_29 (Dropout)         (None, 204, 128)          0         
_________________________________________________________________
conv1d_2

In [18]:
# Display the boolean per-class prediction matrix computed for the test split.
y_predict

array([[False,  True],
       [False,  True],
       [False,  True],
       ...,
       [False,  True],
       [False,  True],
       [False,  True]])