In [6]:
# Mount Google Drive at /content/drive (Colab-only helper) so that the project
# folder used by the following cells is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Switch the working directory to the project folder on the mounted Drive.
# NOTE(review): this absolute path is specific to one Drive layout; the relative
# dataset path and the os.getcwd()-based parent_path used later depend on it.
%cd '/content/drive/My Drive/Colab Notebooks/mini project 4/Sentence Pair Classification'

/content/drive/My Drive/Colab Notebooks/mini project 4/Sentence Pair Classification


In [8]:
# Echo the current working directory to confirm the %cd above took effect.
%pwd

'/content/drive/My Drive/Colab Notebooks/mini project 4/Sentence Pair Classification'

# Data Preprocessing


In [11]:
import re
import jieba
import random
import csv
from tensorflow.contrib import learn


class data_prepare(object):
    """Load sentence-pair TSV data, tokenise it per character and build a vocabulary.

    Each input row is expected to look like ``label<TAB>sentence_a<TAB>sentence_b``
    (column 0 is the integer class label, columns 1 and 2 are the two sentences).
    """

    def readfile(self, filename, seed=None):
        """Read a TSV file, pre-process both sentences and one-hot encode the labels.

        The three parallel lists are shuffled with one shared permutation so
        that row ``i`` of every returned list still belongs to the same example.
        The per-label counts are printed as a dict.

        :param filename: path of the tab-separated file to read (UTF-8).
        :param seed: optional seed for a private ``random.Random`` so the shuffle
            is reproducible. With the default ``None`` the module-level
            ``random.shuffle`` is used, exactly as before.
        :return: ``(texta, textb, tags_vec)`` where the first two are lists of
            space-separated character strings and ``tags_vec`` is a list of
            one-hot ``list[int]`` label vectors.
        :raises ValueError: if a label cannot be converted with ``int()``.
        :raises IndexError: if a non-empty row has fewer than three columns.
        """
        texta = []
        textb = []
        tag = []
        # Decode explicitly instead of relying on the platform default: the
        # CJK range filter in pre_processing only works on correctly decoded
        # text. 'utf-8-sig' also swallows an optional BOM. newline='' is what
        # the csv module asks for on files handed to csv.reader.
        with open(filename, encoding='utf-8-sig', newline='') as tsv_f:
            reader = csv.reader(tsv_f, delimiter='\t')
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing
                    # empty line); there is nothing to parse in them.
                    continue
                texta.append(self.pre_processing(row[1]))
                textb.append(self.pre_processing(row[2]))
                tag.append(row[0])

        # Shuffle all three lists with the same permutation.
        order = list(range(len(texta)))
        if seed is None:
            random.shuffle(order)
        else:
            random.Random(seed).shuffle(order)
        texta_new = [texta[i] for i in order]
        textb_new = [textb[i] for i in order]
        tag_new = [tag[i] for i in order]

        label_ids = [int(label) for label in tag_new]
        # Width of the one-hot vectors. For labels 0..k-1 this equals the
        # number of distinct labels (the previous behaviour); taking the
        # maximum with (largest label + 1) keeps a file that happens to miss
        # a class, e.g. only label "1", from indexing past the end.
        num_classes = max(len(set(tag_new)), max(label_ids, default=-1) + 1)

        label_counts = {}
        tags_vec = []
        for label, label_id in zip(tag_new, label_ids):
            label_counts[label] = label_counts.get(label, 0) + 1
            one_hot = [0] * num_classes
            one_hot[label_id] = 1
            tags_vec.append(one_hot)
        print(label_counts)
        return texta_new, textb_new, tags_vec

    def pre_processing(self, text):
        """Strip parenthesised asides and non-CJK characters, then split per character.

        Text enclosed in full-width parentheses is removed first, then only
        characters in the range U+4E00..U+9FA5 are kept. The result is the
        remaining characters joined by single spaces.

        The earlier implementation ran ``jieba.cut`` on the filtered text and
        then immediately re-joined the segments and split them into single
        characters, so the segmentation never influenced the result. That call
        is dropped here; the returned string is unchanged.

        :param text: raw sentence.
        :return: space-separated string of the kept characters ('' if none).
        """
        text = re.sub('（[^（.]*）', '', text)
        kept_chars = [ch for ch in text if '\u4e00' <= ch <= '\u9fa5']
        return ' '.join(kept_chars)

    def build_vocab(self, sentences, path):
        """Fit a ``VocabularyProcessor`` on ``sentences`` and save it to ``path``.

        The processor's maximum document length is the longest sentence,
        measured in space-separated tokens.

        :param sentences: list of space-separated token strings.
        :param path: file the fitted processor is saved to.
        :raises ValueError: if ``sentences`` is empty.
        """
        if not sentences:
            raise ValueError("build_vocab needs at least one sentence")
        max_length = max(len(sentence.split(" ")) for sentence in sentences)
        vocab_processor = learn.preprocessing.VocabularyProcessor(max_length)
        vocab_processor.fit(sentences)
        vocab_processor.save(path)


if __name__ == '__main__':
    # Smoke run on the BQ training split (path is relative to the working
    # directory). readfile prints the per-label counts; its return value is
    # discarded here.
    data_pre = data_prepare()
    data_pre.readfile('dataset/sent_pair/bq/train.tsv')

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.722 seconds.
Prefix dict has been built succesfully.


{'0': 43054, '1': 43146}


# Model: Text CNN


In [0]:
import tensorflow as tf
import numpy as np
import math


class TextCNN(object):
    """Two-branch TextCNN for sentence-pair classification (TensorFlow 1.x graph mode).

    Both sentences are looked up in one shared embedding matrix and convolved
    with the same filters; each branch has its own batch-normalisation layer.
    The max-pooled features of both branches are concatenated and passed
    through a fully connected layer, one highway layer, dropout and a linear
    output layer.

    Only the forward graph, the loss and the metrics are built here. No
    optimizer / train op is created. Whoever builds the train op must also run
    the ``tf.GraphKeys.UPDATE_OPS`` collection, otherwise the moving statistics
    of ``tf.layers.batch_normalization`` are never updated.

    Placeholders to feed: ``text_a``, ``text_b``, ``y`` (one-hot), ``dropout``
    (used as the *keep* probability of ``tf.nn.dropout``) and ``is_training``.
    """

    def __init__(
        self, sent_len, l2_reg, vocabulary_size,  num_filters, filter_sizes, embedding_size=300, di=50,
        num_classes=2, num_layers=2, fc_hidden_size=1024):
        """
        Build the graph.

        :param sent_len: sentence length (both inputs are padded to this length)
        :param l2_reg: L2 regularization coefficient
        :param vocabulary_size: number of rows of the embedding matrix
        :param num_filters: number of filters per filter size
        :param filter_sizes: filter heights; an iterable of ints or a
            comma-separated string such as "3,4,5"
        :param embedding_size: dimensionality of word embedding(default: 300)
        :param di: accepted for interface compatibility; not used by this model
        :param num_classes: The number of classes for answers.
        :param num_layers: accepted for interface compatibility; not used by
            this model (the highway block below always has one layer)
        :param fc_hidden_size: width of the fully connected / highway layers
            (default: 1024)
        :raises ValueError: if no filter size is given or a filter size does
            not fit into ``sent_len``.
        """
        # Accept the "3,4,5" string form as well as a list/tuple of ints.
        if isinstance(filter_sizes, str):
            filter_sizes = [int(size) for size in filter_sizes.split(",") if size.strip()]
        else:
            filter_sizes = [int(size) for size in filter_sizes]
        if not filter_sizes:
            raise ValueError("filter_sizes must contain at least one filter size")
        for filter_size in filter_sizes:
            if filter_size < 1 or filter_size > sent_len:
                raise ValueError(
                    "filter size {0} does not fit sentence length {1}".format(filter_size, sent_len))

        self.text_a = tf.placeholder(tf.int32, shape=[None, sent_len], name="text_a")
        self.text_b = tf.placeholder(tf.int32, shape=[None, sent_len], name="text_b")
        self.y = tf.placeholder(tf.int32, shape=[None, num_classes], name="y")
        self.dropout = tf.placeholder(tf.float32, name="dropout")
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        self.global_step = tf.Variable(0, trainable=False, name="Global_Step")

        def _linear(input_, output_size, scope="SimpleLinear"):
            """Affine map ``input_ @ W + b`` with variables created under ``scope``."""
            shape = input_.get_shape().as_list()
            if len(shape) != 2:
                raise ValueError("Linear is expecting 2D arguments: {0}".format(str(shape)))
            if not shape[1]:
                raise ValueError("Linear expects shape[1] of arguments: {0}".format(str(shape)))
            input_size = shape[1]

            # Now the computation.
            with tf.variable_scope(scope):
                W = tf.get_variable("W", [input_size, output_size], dtype=input_.dtype)
                b = tf.get_variable("b", [output_size], dtype=input_.dtype)

            return tf.nn.xw_plus_b(input_, W, b)

        def _highway_layer(input_, size, num_layers=1, bias=-2.0, f=tf.nn.relu):
            """
            Highway Network (cf. http://arxiv.org/abs/1505.00387).
            t = sigmoid(Wy + b)
            z = t * g(Wy + b) + (1 - t) * y
            where g is nonlinearity, t is transform gate, and (1 - t) is carry gate.
            """
            # With zero layers the input is passed through unchanged.
            output = input_
            for idx in range(num_layers):
                g = f(_linear(input_, size, scope=("highway_lin_{0}".format(idx))))
                t = tf.sigmoid(_linear(input_, size, scope=("highway_gate_{0}".format(idx))) + bias)
                output = t * g + (1. - t) * input_
                input_ = output

            return output

        # Embedding Layer (one matrix shared by both sentences)
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            # The variable name string (including its spelling) is kept as-is.
            self.vocab_matrix = tf.Variable(tf.truncated_normal(shape=[vocabulary_size, embedding_size],
                                                                stddev=1.0 / math.sqrt(embedding_size)),
                                            name='vacab_matrix')
            self.x1 = tf.nn.embedding_lookup(self.vocab_matrix, self.text_a)
            self.x2 = tf.nn.embedding_lookup(self.vocab_matrix, self.text_b)

        # conv2d expects a channel axis: [batch, sent_len, embedding_size, 1]
        self.expand_x1 = tf.expand_dims(self.x1, axis=-1)
        self.expand_x2 = tf.expand_dims(self.x2, axis=-1)

        # Create a convolution and maxpool layer for each filter size
        pooled_outputs_front = []
        pooled_outputs_behind = []

        for filter_size in filter_sizes:
            with tf.name_scope("conv-filter{0}".format(filter_size)):
                # Convolution Layer; W and b are shared by both branches.
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(shape=filter_shape, stddev=0.1, dtype=tf.float32), name="W")
                b = tf.Variable(tf.constant(value=0.1, shape=[num_filters], dtype=tf.float32), name="b")
                conv_front = tf.nn.conv2d(
                    self.expand_x1,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv_front")

                conv_behind = tf.nn.conv2d(
                    self.expand_x2,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv_behind")

                # Batch Normalization Layer (separate statistics per branch)
                conv_bn_x1 = tf.layers.batch_normalization(tf.nn.bias_add(conv_front, b), training=self.is_training)
                conv_bn_x2 = tf.layers.batch_normalization(tf.nn.bias_add(conv_behind, b), training=self.is_training)

                # Apply nonlinearity
                conv_out_x1 = tf.nn.relu(conv_bn_x1, name="relu_front")
                conv_out_x2 = tf.nn.relu(conv_bn_x2, name="relu_behind")

            with tf.name_scope("pool-filter{0}".format(filter_size)):
                # Max-pool over every position the VALID convolution produced.
                pool_height = sent_len - filter_size + 1
                pooled_front = tf.nn.max_pool(
                    conv_out_x1,
                    ksize=[1, pool_height, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="pool_front")

                pooled_behind = tf.nn.max_pool(
                    conv_out_x2,
                    ksize=[1, pool_height, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="pool_behind")

            pooled_outputs_front.append(pooled_front)
            pooled_outputs_behind.append(pooled_behind)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.pool_front = tf.concat(pooled_outputs_front, axis=3)
        self.pool_behind = tf.concat(pooled_outputs_behind, axis=3)
        self.pool_flat_front = tf.reshape(self.pool_front, shape=[-1, num_filters_total])
        self.pool_flat_behind = tf.reshape(self.pool_behind, shape=[-1, num_filters_total])

        self.pool_flat_combine = tf.concat([self.pool_flat_front, self.pool_flat_behind], axis=1)

        # Fully Connected Layer
        with tf.name_scope("fc"):
            W = tf.Variable(tf.truncated_normal(shape=[num_filters_total * 2, fc_hidden_size],
                                                stddev=0.1, dtype=tf.float32), name="W")
            b = tf.Variable(tf.constant(value=0.1, shape=[fc_hidden_size], dtype=tf.float32), name="b")
            self.fc = tf.nn.xw_plus_b(self.pool_flat_combine, W, b)

            # Batch Normalization Layer
            self.fc_bn = tf.layers.batch_normalization(self.fc, training=self.is_training)

            # Apply nonlinearity
            self.fc_out = tf.nn.relu(self.fc_bn, name="relu")

        # Highway Layer
        with tf.name_scope("highway"):
            self.highway = _highway_layer(self.fc_out, fc_hidden_size, num_layers=1, bias=0)

        # Add dropout (self.dropout is the keep probability)
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.highway, self.dropout)

        # Final scores and predictions
        with tf.name_scope("output"):
            W = tf.Variable(tf.truncated_normal(shape=[fc_hidden_size, num_classes],
                                                stddev=0.1, dtype=tf.float32), name="W")
            b = tf.Variable(tf.constant(value=0.1, shape=[num_classes], dtype=tf.float32), name="b")
            self.logits = tf.nn.xw_plus_b(self.h_drop, W, b, name="logits")
            self.softmax_scores = tf.nn.softmax(self.logits, name="softmax_scores")
            self.predictions = tf.argmax(self.logits, 1, name="predictions")
            self.topKPreds = tf.nn.top_k(self.softmax_scores, k=1, sorted=True, name="topKPreds")

        # Calculate mean cross-entropy loss, L2 loss
        with tf.name_scope("loss"):
            # The placeholder is int32; the cross-entropy op needs labels of
            # the same float dtype as the logits.
            float_labels = tf.cast(self.y, tf.float32)
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(labels=float_labels, logits=self.logits)
            losses = tf.reduce_mean(losses, name="softmax_losses")
            l2_losses = tf.add_n([tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()],
                                 name="l2_losses") * l2_reg
            self.loss = tf.add(losses, l2_losses, name="loss")

        true_classes = tf.argmax(self.y, 1)

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, true_classes)
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name="accuracy")

        # Number of correct predictions
        with tf.name_scope("num_correct"):
            self.num_correct = tf.reduce_sum(tf.cast(correct_predictions, tf.float32), name="num_correct")

        # Per-batch confusion counts. Class index 1 is treated as the positive
        # class, so precision / recall / F1 below describe that class only.
        predicted_positive = tf.equal(self.predictions, 1)
        actual_positive = tf.equal(true_classes, 1)

        # Calculate Tp
        with tf.name_scope("tp"):
            self.tp = tf.reduce_sum(
                tf.cast(tf.logical_and(predicted_positive, actual_positive), tf.float32), name="tp")

        # Calculate Fp
        with tf.name_scope("fp"):
            self.fp = tf.reduce_sum(
                tf.cast(tf.logical_and(predicted_positive, tf.logical_not(actual_positive)), tf.float32),
                name="fp")

        # Calculate Fn
        with tf.name_scope("fn"):
            self.fn = tf.reduce_sum(
                tf.cast(tf.logical_and(tf.logical_not(predicted_positive), actual_positive), tf.float32),
                name="fn")

        # Keeps the ratios finite when a batch has no positive predictions/labels.
        epsilon = 1e-8

        # Calculate Recall
        with tf.name_scope("recall"):
            self.recall = self.tp / (self.tp + self.fn + epsilon)

        # Calculate Precision
        with tf.name_scope("precision"):
            self.precision = self.tp / (self.tp + self.fp + epsilon)

        # Calculate F1
        with tf.name_scope("F1"):
            self.F1 = (2 * self.precision * self.recall) / (self.precision + self.recall + epsilon)

        # Calculate AUC
        with tf.name_scope("AUC"):
            # tf.metrics.auc takes (labels, predictions) and returns the pair
            # (auc_value, update_op). It is a streaming metric backed by local
            # variables, so run tf.local_variables_initializer() before use.
            self.AUC = tf.metrics.auc(labels=self.y, predictions=self.softmax_scores, name="AUC")



# if __name__ == '__main__':
#     cnn = TextCNN(20, 0.001, vocabulary_size=1000, num_filters=128, filter_sizes="3,4,5")

# Hyperparameters


In [14]:
# Leftover inspection cell: only displays the function object, nothing is called.
data_prepare.readfile

<function __main__.data_prepare.readfile>

In [0]:
import tensorflow as tf
import data_prepare
from tensorflow.contrib import learn
import numpy as np
import abcnn_mdoel
import abcnn_model_pre
import config as config
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn import metrics
import os

# Module-level objects shared by TrainModel below.
con = config.Config()
# Parent of the current working directory; every data/ and save_model/ path
# used by TrainModel is built from it.
parent_path = os.path.dirname(os.getcwd())
# NOTE(review): the class defined earlier in this notebook is called
# `data_prepare`; this line instead needs an importable `data_prepare` module
# that exposes `Data_Prepare` - confirm which of the two is intended.
data_pre = data_prepare.Data_Prepare()


class TrainModel(object):
    """Train the ABCNN sentence-pair model and keep the checkpoint with the best dev F1.

    Relies on the module-level ``con`` (configuration), ``parent_path`` and
    ``data_pre`` objects.
    """

    def pre_processing(self):
        """Read train/dev data, build the vocabulary and map sentences to id arrays.

        The vocabulary is fitted on both sentence columns of the training set,
        saved under ``save_model/abcnn`` and restored into
        ``self.vocab_processor``.

        :return: ``(train_a, train_b, train_tag, dev_a, dev_b, dev_tag)`` as
            numpy arrays.
        """
        model_dir = parent_path + '/save_model/abcnn'
        vocab_path = model_dir + '/vocab.pickle'
        # The vocabulary (here) and the checkpoint (in trainModel) are written
        # into this directory, so make sure it exists first.
        os.makedirs(model_dir, exist_ok=True)

        train_texta, train_textb, train_tag = data_pre.readfile(parent_path+'/data/train.txt')
        data = []
        data.extend(train_texta)
        data.extend(train_textb)
        data_pre.build_vocab(data, vocab_path)
        # Load the vocabulary
        self.vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
        train_texta_embedding = np.array(list(self.vocab_processor.transform(train_texta)))
        train_textb_embedding = np.array(list(self.vocab_processor.transform(train_textb)))

        dev_texta, dev_textb, dev_tag = data_pre.readfile(parent_path+'/data/dev.txt')
        dev_texta_embedding = np.array(list(self.vocab_processor.transform(dev_texta)))
        dev_textb_embedding = np.array(list(self.vocab_processor.transform(dev_textb)))
        return train_texta_embedding, train_textb_embedding, np.array(train_tag), \
               dev_texta_embedding, dev_textb_embedding, np.array(dev_tag)

    def get_batches(self, texta, textb, tag):
        """Yield consecutive ``(texta, textb, tag)`` slices of ``con.Batch_Size`` rows.

        Only full batches are produced; a trailing remainder smaller than the
        batch size is not yielded.
        """
        batch_size = con.Batch_Size
        num_batch = len(texta) // batch_size
        for i in range(num_batch):
            start = i * batch_size
            stop = start + batch_size
            yield texta[start:stop], textb[start:stop], tag[start:stop]

    def _dev_step(self, sess, model, dev_texta_embedding, dev_textb_embedding, dev_tag):
        """Evaluate ``model`` on the dev set, print a report and return the weighted F1."""
        loss_all = []
        accuracy_all = []
        predictions = []
        for texta, textb, tag in tqdm(
                self.get_batches(dev_texta_embedding, dev_textb_embedding, dev_tag)):
            feed_dict = {
                model.text_a: texta,
                model.text_b: textb,
                model.y: tag
            }
            dev_cost, dev_accuracy, prediction = sess.run([model.loss, model.accuracy,
                                                           model.prediction], feed_dict)
            loss_all.append(dev_cost)
            accuracy_all.append(dev_accuracy)
            predictions.extend(prediction)
        # One-hot rows -> class indices.
        y_true = [np.nonzero(x)[0][0] for x in dev_tag]
        # get_batches drops the incomplete last batch, so trim the labels to
        # the examples that were actually predicted.
        y_true = y_true[0:len(loss_all)*con.Batch_Size]
        f1 = f1_score(np.array(y_true), np.array(predictions), average='weighted')
        print('分类报告:\n', metrics.classification_report(np.array(y_true), predictions))
        print("验证集：loss {:g}, acc {:g}, f1 {:g}\n".format(np.mean(np.array(loss_all)),
                                                          np.mean(np.array(accuracy_all)), f1))
        return f1

    def trainModel(self):
        """Train for ``con.epoch`` epochs; after each one evaluate on dev and save on a new best F1."""
        train_texta_embedding, train_textb_embedding, train_tag, \
        dev_texta_embedding, dev_textb_embedding, dev_tag = self.pre_processing()

        # Define the network model used for training (ABCNN)
        model = abcnn_model_pre.ABCNN(True, len(train_texta_embedding[0]), 3, con.l2_lambda, 'ABCNN3',
                                      vocabulary_size=len(self.vocab_processor.vocabulary_), d0=con.embedding_size,
                                      di=50, num_classes=2, num_layers=1)

        # Train the model
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        # Using the Session itself as the context manager both installs it as
        # the default session and closes it on exit; `sess.as_default()` alone
        # leaves the session (and its resources) open.
        with tf.Session(config=session_conf) as sess:
            tf.global_variables_initializer().run()
            saver = tf.train.Saver()
            best_f1 = 0.0
            for epoch in range(con.epoch):
                print("training " + str(epoch + 1) + ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
                # NOTE(review): this sets a plain Python attribute (spelled
                # `is_trainning`); whether the ABCNN graph reads it at run
                # time is not visible from here - confirm.
                model.is_trainning = True
                loss_all = []
                accuracy_all = []
                for texta, textb, tag in tqdm(
                        self.get_batches(train_texta_embedding, train_textb_embedding, train_tag)):
                    feed_dict = {
                        model.text_a: texta,
                        model.text_b: textb,
                        model.y: tag
                    }
                    _, cost, accuracy = sess.run([model.train_op, model.loss, model.accuracy], feed_dict)
                    loss_all.append(cost)
                    accuracy_all.append(accuracy)

                print("第" + str((epoch + 1)) + "次迭代的损失为：" + str(np.mean(np.array(loss_all))) + ";准确率为：" +
                      str(np.mean(np.array(accuracy_all))))

                model.is_trainning = False
                f1 = self._dev_step(sess, model, dev_texta_embedding, dev_textb_embedding, dev_tag)

                if f1 > best_f1:
                    best_f1 = f1
                    saver.save(sess, parent_path + "/save_model/abcnn/model.ckpt")
                    print("Saved model success\n")


if __name__ == '__main__':
    # Entry point: run the whole pipeline (pre-processing, training and
    # per-epoch dev evaluation) via TrainModel.trainModel().
    train = TrainModel()
    train.trainModel()