In [1]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
import time
import sklearn.datasets
import random
import re
import pandas as pd
import feather

In [2]:
def separate_dataset(trainset, ratio = 0.5):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        data_ = list(filter(None, data_))
        data_ = random.sample(data_, int(len(data_) * ratio))
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        datastring += data_
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget

def clearstring(string):
    string = re.sub('[^A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = ' '.join(string)
    return string.lower()

trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset,1.0)

train = pd.DataFrame(trainset.data,columns=['sequence'])
label = pd.DataFrame(trainset.target,columns=['labels'])

In [3]:
num_words = 500000
column_name="word_seg"
word_seq_len = 1800
embedding_vector=200 

In [4]:
from keras.preprocessing import text,sequence
from gensim.models import Word2Vec
num_words = 500000
import os
import numpy as np
#词向量
def w2v_pad(df_train,col, maxlen_,victor_size):
    
    tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="")
    tokenizer.fit_on_texts(list(df_train[col].values))
    #padding sequence
    train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_)
    
    word_index = tokenizer.word_index
    
    count = 0
    nb_words = len(word_index)
    print(nb_words)
    all_data=df_train[col]
    file_name = './Word2Vec_' + col  +"_"+ '200' + '.model'
    if not os.path.exists(file_name):
        #word2vec建模
        model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values],
                         size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2)
        model.save(file_name)
    else:
        model = Word2Vec.load(file_name)
    print("add word2vec finished....")    



    embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size))
    for word, i in word_index.items():
        embedding_vector = model[word] if word in model else None
        if embedding_vector is not None:
            count += 1
            embedding_word2vec_matrix[i] = embedding_vector
        else:
            unk_vec = np.random.random(victor_size) * 0.5
            unk_vec = unk_vec - unk_vec.mean()
            embedding_word2vec_matrix[i] = unk_vec

    
    return train_, word_index, embedding_word2vec_matrix

Using TensorFlow backend.


In [5]:
train_, word2idx, word_embedding = w2v_pad(train, 'sequence', 40, 200)

20465
add word2vec finished....




In [6]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

lb = LabelEncoder()
train_label = lb.fit_transform(label['labels'].values)
train_label = to_categorical(train_label)#变为one-hot向量

In [16]:
class Model:
    def __init__(self, size_layer, num_layers, embedding_matrix, 
                 dimension_output, learning_rate):
        def cells(size=128, reuse = False):
            return tf.nn.rnn_cell.BasicLSTMCell(size, reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        
        encoder_embeddings = tf.Variable(embedding_matrix)
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        
        rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
        drop = tf.contrib.rnn.DropoutWrapper(rnn_cells, output_keep_prob=0.5)
        outputs, _ = tf.nn.dynamic_rnn(drop, encoder_embedded, dtype=tf.float64)
        
        W = tf.get_variable('w',shape=(128, dimension_output),dtype=tf.float64,initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b',shape=(dimension_output),dtype=tf.float64,initializer=tf.zeros_initializer())
        self.logits = tf.matmul(outputs[:, -1], W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [17]:
size_layer = 128
num_layers = 1
embedded_size = 128
dimension_output = len(trainset.target_names)
learning_rate = 1e-3
maxlen = 50
batch_size = 128

In [18]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,word_embedding,dimension_output,learning_rate)
sess.run(tf.global_variables_initializer())

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

Instructions for updating:
Use tf.cast instead.


In [19]:
train_X, test_X, train_Y, test_Y = train_test_split(train_, train_label, test_size = 0.2)

In [20]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = train_X[i:i+batch_size]
        batch_y = train_Y[i:i+batch_size]
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : batch_y})
        train_loss += loss
        train_acc += acc
    
    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = test_X[i:i+batch_size]
        batch_y = test_Y[i:i+batch_size]
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : batch_y})
        test_loss += loss
        test_acc += acc
    
    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1

epoch: 0, pass acc: 0.000000, current acc: 0.602051
time taken: 4.527921915054321
epoch: 0, training loss: 0.682814, training acc: 0.559896, valid loss: 0.663147, valid acc: 0.602051

epoch: 1, pass acc: 0.602051, current acc: 0.711914
time taken: 3.640389919281006
epoch: 1, training loss: 0.575495, training acc: 0.701468, valid loss: 0.601964, valid acc: 0.711914

epoch: 2, pass acc: 0.711914, current acc: 0.748047
time taken: 3.5011260509490967
epoch: 2, training loss: 0.288120, training acc: 0.885653, valid loss: 0.575888, valid acc: 0.748047

time taken: 3.4591550827026367
epoch: 3, training loss: 0.111530, training acc: 0.961648, valid loss: 0.857007, valid acc: 0.748047

time taken: 3.3996968269348145
epoch: 4, training loss: 0.055373, training acc: 0.982481, valid loss: 0.879291, valid acc: 0.742676

time taken: 3.6910409927368164
epoch: 5, training loss: 0.025955, training acc: 0.991832, valid loss: 1.195721, valid acc: 0.726074

time taken: 3.7799019813537598
epoch: 6, trainin