In [None]:
# Mount Google Drive at /content/drive; every data/model path used below lives under it.
from google.colab import drive
drive.mount('/content/drive') 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# pandas is used as `pd` throughout this notebook but was never imported; import it here
# so a fresh kernel ("Restart & Run All") does not stop with a NameError.
import pandas as pd

# NOTE: unpickling can execute code stored in the file -- only load pickles you trust.
df = pd.read_pickle("/content/drive/My Drive/Project/EIbinary.pkl")

In [None]:
import nltk
import string

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of `text` and re-join them with single spaces.

    Punctuation is not stripped here, so it stays attached to the tokens it touches.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(str(lemmatizer.lemmatize(token)))
    return ' '.join(lemmas)


# Replace the raw posts with their lemmatized form (no lower-casing at this stage).
df['post'] = df['post'].apply(lemmatize_text)

In [None]:
from bs4 import BeautifulSoup
def cleanReview(subject):
    """Strip markup and a fixed set of punctuation from one post and lower-case it.

    :param subject: raw post text, possibly containing HTML markup.
    :return: the cleaned text, lower-cased, with leading/trailing whitespace removed.
    """
    # Name the parser explicitly: without it bs4 uses whichever parser happens to be
    # installed (and warns), so the extracted text can differ between environments.
    text = BeautifulSoup(subject, "html.parser").get_text()
    # Same removals in the same order as before: single characters first, "|||" last.
    for token in ("\\", "'", "/", '"', ",", ".", "?", "(", ")", "|||"):
        text = text.replace(token, "")
    # Lower-casing the whole string is equivalent to splitting on " ", lower-casing
    # every piece and joining on " " again.
    return text.strip().lower()

In [None]:
# Run cleanReview over every post.
train_x = df['post'].apply(cleanReview)
# NOTE(review): axis=0 stacks the cleaned posts and the `type` values end-to-end into one
# Series (it does not build a two-column frame), and `df` is rebound to that Series, so
# the cells above cannot be re-run afterwards -- confirm this is the intended corpus layout.
df=pd.concat([train_x, df['type']], axis=0) 



KeyboardInterrupt: ignored

In [None]:
# Dump the concatenated Series to Drive without the index column.
# NOTE(review): presumably this is the corpus read by LineSentence in the next section,
# which spells the folder "MyDrive" instead of "My Drive" -- confirm both resolve to the same place.
df.to_csv("/content/drive/My Drive/Project/wordEmbedding.txt", index=False)

### Word2Vec and Bi-LSTM

In [None]:
import logging
import gensim
from gensim.models import word2vec

# Emit INFO-level log records with a timestamp prefix.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Line-per-sentence view over the exported text corpus; only consumed by the (disabled) training call below.
sentences = word2vec.LineSentence("/content/drive/MyDrive/Project/wordEmbedding.txt")
# Training and saving are disabled; the vectors are loaded from the previously saved binary file instead.
# model = gensim.models.Word2Vec(sentences, size=1000, sg=1, iter=8)  
# model.wv.save_word2vec_format("/content/drive/My Drive/Project/word2Vec" + ".bin", binary=True) 
wordVec = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/MyDrive/Project/word2Vec.bin", binary=True)

In [None]:
import os
import csv
import time
import datetime
import random
import json

import warnings
from collections import Counter
from math import sqrt

import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
# Suppress every Python warning from here on (this also hides deprecation notices).
warnings.filterwarnings("ignore")

In [None]:
class TrainingConfig:
    """Settings of the training loop."""

    # Number of passes over the training set.
    epoches = 5
    # Run the evaluation loop whenever the global step is a multiple of this value.
    evaluateEvery = 100
    # Save a checkpoint whenever the global step is a multiple of this value.
    checkpointEvery = 100
    # Learning rate handed to the Adam optimizer.
    learningRate = 0.001
    
class ModelConfig:
    """Settings of the Bi-LSTM network."""

    # Length of one word vector (also used to size the PAD/UNK rows).
    embeddingSize = 1000
    # One stacked Bi-LSTM layer per entry; each value is the LSTM cell size.
    hiddenSizes = [256, 256]
    # Keep probability fed to the dropout wrappers during training.
    dropoutKeepProb = 0.5
    # Weight of the L2 penalty added to the loss.
    l2RegLambda = 0.0
    
class Config:
    """Top-level settings: data locations, batching, and the nested training/model configs."""

    # Every review is truncated or padded to this many token ids.
    sequenceLength = 200
    batchSize = 64
    dataSource = "/content/drive/MyDrive/Project/EIonly.csv"
    stopWordSource = "/content/drive/MyDrive/Project/english.txt"
    # 1 selects the sigmoid (binary) branch of the model; >1 selects the softmax branch.
    numClasses = 1
    # Fraction of the rows that goes to the training split.
    rate = 0.9
    training = TrainingConfig()
    model = ModelConfig()


config = Config()

In [None]:
class Dataset(object):
    """Turns the labelled CSV into padded id matrices plus a matching embedding matrix.

    Call :meth:`dataGen` once; afterwards ``trainReviews``/``trainLabels``,
    ``evalReviews``/``evalLabels``, ``wordEmbedding`` and ``labelList`` are populated.
    """

    def __init__(self, config):
        """Copy the settings this class needs from `config` and create empty result slots."""
        self.config = config
        self._dataSource = config.dataSource
        self._stopWordSource = config.stopWordSource
        self._sequenceLength = config.sequenceLength
        self._embeddingSize = config.model.embeddingSize
        self._batchSize = config.batchSize
        self._rate = config.rate
        # Initialised under the name that _readStopWord/_genVocabulary actually use, so
        # building a vocabulary before loading stop words no longer raises AttributeError.
        self.stopWordDict = {}
        self.trainReviews = []
        self.trainLabels = []
        self.evalReviews = []
        self.evalLabels = []
        self.wordEmbedding = None
        self.labelList = []

    def _readData(self, filePath):
        """Read the CSV at `filePath`.

        :return: (reviews, labels) -- whitespace-split token lists from the "post"
                 column and the raw values of the "type" column.
        """
        df = pd.read_csv(filePath)
        labels = df["type"].tolist()
        reviews = [line.strip().split() for line in df["post"].tolist()]
        return reviews, labels

    def _labelToIndex(self, labels, label2idx):
        """Map every label to its integer id (KeyError for an unknown label)."""
        return [label2idx[label] for label in labels]

    def _wordToIndex(self, reviews, word2idx):
        """Map every token to its id; tokens missing from `word2idx` get the "UNK" id."""
        unkId = word2idx["UNK"]
        return [[word2idx.get(token, unkId) for token in review] for review in reviews]

    def _genTrainEvalData(self, x, y, word2idx, rate):
        """Pad/truncate the id sequences and split them into train and eval arrays.

        :param x: list of token-id lists.
        :param y: list of label ids, aligned with `x`.
        :param word2idx: vocabulary mapping; only its "PAD" entry is used here.
        :param rate: fraction of the rows (taken from the front, no shuffling) used for training.
        :return: (trainReviews, trainLabels, evalReviews, evalLabels) as numpy arrays;
                 reviews are int64, labels float32.
        """
        padId = word2idx["PAD"]
        reviews = []
        for review in x:
            if len(review) >= self._sequenceLength:
                reviews.append(review[:self._sequenceLength])
            else:
                reviews.append(review + [padId] * (self._sequenceLength - len(review)))
        trainIndex = int(len(x) * rate)
        trainReviews = np.asarray(reviews[:trainIndex], dtype="int64")
        trainLabels = np.array(y[:trainIndex], dtype="float32")
        evalReviews = np.asarray(reviews[trainIndex:], dtype="int64")
        evalLabels = np.array(y[trainIndex:], dtype="float32")
        return trainReviews, trainLabels, evalReviews, evalLabels

    def _genVocabulary(self, reviews, labels):
        """Build the word and label vocabularies and store the embedding matrix.

        Stop words are dropped, words occurring fewer than 5 times are dropped, and only
        words that have a pre-trained vector end up in the vocabulary. Both mappings
        are also written as JSON next to the project data.

        :return: (word2idx, label2idx)
        """
        allWords = [word for review in reviews for word in review]
        subWords = [word for word in allWords if word not in self.stopWordDict]
        wordCount = Counter(subWords)
        sortWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)
        words = [item[0] for item in sortWordCount if item[1] >= 5]
        vocab, wordEmbedding = self._getWordEmbedding(words)
        self.wordEmbedding = wordEmbedding
        word2idx = dict(zip(vocab, list(range(len(vocab)))))
        # Sort the labels: iterating a bare set gives an order that can change from one
        # interpreter run to the next, which would silently change label ids between
        # sessions that restore the same checkpoint.
        uniqueLabel = sorted(set(labels))
        label2idx = dict(zip(uniqueLabel, list(range(len(uniqueLabel)))))
        self.labelList = list(range(len(uniqueLabel)))
        with open("/content/drive/MyDrive/Project/wordJson/word2idx.json", "w", encoding="utf-8") as f:
            json.dump(word2idx, f)
        with open("/content/drive/MyDrive/Project/wordJson/label2idx.json", "w", encoding="utf-8") as f:
            json.dump(label2idx, f)
        return word2idx, label2idx

    def _getWordEmbedding(self, words):
        """Look up the pre-trained vector of every word in `words`.

        :return: (vocab, wordEmbedding) -- vocab starts with "PAD" (all-zero vector) and
                 "UNK" (random normal vector), followed by the words that have a vector;
                 wordEmbedding is the row-aligned numpy matrix.
        """
        wordVec = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/MyDrive/Project/word2Vec.bin", binary=True)
        vocab = ["PAD", "UNK"]
        wordEmbedding = [np.zeros(self._embeddingSize), np.random.randn(self._embeddingSize)]
        for word in words:
            try:
                # Index the KeyedVectors object directly. Going through `.wv` fails on
                # gensim >= 4, and the former bare `except` hid that failure, leaving a
                # vocabulary of only PAD/UNK.
                vector = wordVec[word]
            except KeyError:
                # No pre-trained vector for this word: leave it out (it maps to UNK later).
                continue
            vocab.append(word)
            wordEmbedding.append(vector)
        return vocab, np.array(wordEmbedding)

    def _readStopWord(self, stopWordPath):
        """Load the stop-word file (one word per line) into ``self.stopWordDict``."""
        with open(stopWordPath, "r") as f:
            stopWordList = f.read().splitlines()
        self.stopWordDict = dict(zip(stopWordList, list(range(len(stopWordList)))))

    def dataGen(self):
        """Run the whole pipeline and store the train/eval arrays on the instance."""
        self._readStopWord(self._stopWordSource)
        reviews, labels = self._readData(self._dataSource)
        word2idx, label2idx = self._genVocabulary(reviews, labels)
        labelIds = self._labelToIndex(labels, label2idx)
        reviewIds = self._wordToIndex(reviews, word2idx)
        trainReviews, trainLabels, evalReviews, evalLabels = self._genTrainEvalData(reviewIds, labelIds, word2idx, self._rate)
        self.trainReviews = trainReviews
        self.trainLabels = trainLabels
        self.evalReviews = evalReviews
        self.evalLabels = evalLabels
        
# Build the dataset object and run the full pipeline (stop words, CSV, vocabulary, id matrices).
data = Dataset(config)
data.dataGen()

In [None]:
def nextBatch(x, y, batchSize):
    """Yield shuffled (batchX, batchY) mini-batches of exactly `batchSize` rows.

    `x` and `y` are numpy arrays indexed by the same random permutation; rows that do
    not fill a complete final batch are not yielded. batchX is int64, batchY float32.
    """
    order = np.arange(len(x))
    np.random.shuffle(order)
    shuffledX, shuffledY = x[order], y[order]
    fullBatches = len(shuffledX) // batchSize
    for start in range(0, fullBatches * batchSize, batchSize):
        stop = start + batchSize
        yield (np.array(shuffledX[start:stop], dtype="int64"),
               np.array(shuffledY[start:stop], dtype="float32"))

In [None]:
class BiLSTM(object):
    """Stacked bidirectional LSTM text classifier built with the TF1 graph API.

    Exposes the placeholders ``inputX``, ``inputY`` and ``dropoutKeepProb`` and the
    tensors ``logits``, ``predictions`` and ``loss``.
    """
    def __init__(self, config, wordEmbedding):
        """Build the graph.

        :param config: object providing sequenceLength, numClasses, model.hiddenSizes
                       and model.l2RegLambda.
        :param wordEmbedding: initial embedding matrix (cast to float32 and stored in
                              the variable ``W``).
        """
        # Token ids, label ids and the dropout keep probability are fed at run time.
        self.inputX = tf.compat.v1.placeholder(tf.int32, [None, config.sequenceLength], name="inputX")
        self.inputY = tf.compat.v1.placeholder(tf.int32, [None], name="inputY")
        self.dropoutKeepProb = tf.compat.v1.placeholder(tf.float32, name="dropoutKeepProb")
        # Accumulates the L2 penalty of the output layer.
        l2Loss = tf.constant(0.0)
        with tf.name_scope("embedding"):
            # The embedding matrix is a tf.Variable initialised from the pre-trained vectors.
            self.W = tf.Variable(tf.cast(wordEmbedding, dtype=tf.float32, name="word2vec") ,name="W")
            self.embeddedWords = tf.nn.embedding_lookup(self.W, self.inputX)
            
        with tf.name_scope("Bi-LSTM"):
            # One bidirectional layer per entry of hiddenSizes; each layer consumes the
            # previous layer's concatenated forward/backward outputs.
            for idx, hiddenSize in enumerate(config.model.hiddenSizes):
                with tf.name_scope("Bi-LSTM" + str(idx)):
                    lstmFwCell = tf.compat.v1.nn.rnn_cell.DropoutWrapper(tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                                                                 output_keep_prob=self.dropoutKeepProb)
                    lstmBwCell = tf.compat.v1.nn.rnn_cell.DropoutWrapper(tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                                                                 output_keep_prob=self.dropoutKeepProb)

                    outputs, self.current_state = tf.compat.v1.nn.bidirectional_dynamic_rnn(lstmFwCell, lstmBwCell, 
                                                                                  self.embeddedWords, dtype=tf.float32,
                                                                                  scope="bi-lstm" + str(idx))
                    # Join forward and backward outputs along the feature axis.
                    self.embeddedWords = tf.concat(outputs, 2)
        
        # Use the output at time step 0 as the sequence representation.
        # NOTE(review): at step 0 only the backward direction has read the whole
        # sequence -- confirm index 0 (rather than -1 or the final states) is intended.
        finalOutput = self.embeddedWords[:, 0, :]
        # Forward + backward outputs of the last layer.
        outputSize = config.model.hiddenSizes[-1] * 2 
        output = tf.reshape(finalOutput, [-1, outputSize])  
        
        with tf.name_scope("output"):
            outputW = tf.compat.v1.get_variable(
                "outputW",
                shape=[outputSize, config.numClasses],
                initializer=tf.keras.initializers.glorot_normal)
            
            outputB= tf.Variable(tf.constant(0.1, shape=[config.numClasses]), name="outputB")
            l2Loss += tf.nn.l2_loss(outputW)
            l2Loss += tf.nn.l2_loss(outputB)
            self.logits = tf.compat.v1.nn.xw_plus_b(output, outputW, outputB, name="logits")
            if config.numClasses == 1:
                # Binary case: a logit >= 0 corresponds to a sigmoid probability >= 0.5.
                self.predictions = tf.cast(tf.greater_equal(self.logits, 0.0), tf.float32, name="predictions")
            elif config.numClasses > 1:
                self.predictions = tf.argmax(self.logits, axis=-1, name="predictions")
        
        with tf.name_scope("loss"):    
            if config.numClasses == 1:
                # Labels are reshaped to a column and cast to float to match the logits.
                losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.cast(tf.reshape(self.inputY, [-1, 1]), 
                                                                                                    dtype=tf.float32))
            elif config.numClasses > 1:
                losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.inputY)
            # Mean cross-entropy plus the weighted L2 penalty of the output layer.
            self.loss = tf.reduce_mean(losses) + config.model.l2RegLambda * l2Loss

In [None]:
def mean(item: list) -> float:
    """Arithmetic mean of `item`; 0 when `item` is empty."""
    if len(item) == 0:
        return 0
    return sum(item) / len(item)


def accuracy(pred_y, true_y):
    """Fraction of positions where the prediction equals the true label.

    :param pred_y: predictions; a list of one-element lists is flattened first.
    :param true_y: true labels, aligned with `pred_y`.
    :return: accuracy in [0, 1]; 0 when `pred_y` is empty.
    """
    total = len(pred_y)
    # Check for empty input before touching pred_y[0]; previously an empty batch
    # raised IndexError even though the final expression guarded against length 0.
    if total == 0:
        return 0
    if isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]
    corr = 0
    for i in range(total):
        if pred_y[i] == true_y[i]:
            corr += 1
    return corr / total


def binary_precision(pred_y, true_y, positive=1):
    """Precision for the class `positive`.

    Of the positions predicted as `positive`, the fraction whose true label matches;
    0 when nothing was predicted as `positive`.
    """
    predictedPositive = [i for i in range(len(pred_y)) if pred_y[i] == positive]
    if not predictedPositive:
        return 0
    truePositive = sum(1 for i in predictedPositive if pred_y[i] == true_y[i])
    return truePositive / len(predictedPositive)


def binary_recall(pred_y, true_y, positive=1):
    """Recall for the class `positive`.

    Of the positions whose true label is `positive`, the fraction predicted correctly;
    0 when no true label equals `positive`.
    """
    actualPositive = [i for i in range(len(pred_y)) if true_y[i] == positive]
    if not actualPositive:
        return 0
    found = sum(1 for i in actualPositive if pred_y[i] == true_y[i])
    return found / len(actualPositive)


def binary_f_beta(pred_y, true_y, beta=1.0, positive=1):
    """F-beta score for the class `positive`.

    :param beta: weight of recall relative to precision (1.0 gives F1).
    :return: (1 + beta^2) * P * R / (beta^2 * P + R), or 0 when that denominator is 0.
    """
    precision = binary_precision(pred_y, true_y, positive)
    recall = binary_recall(pred_y, true_y, positive)
    denominator = beta * beta * precision + recall
    # Test the zero denominator explicitly instead of using a bare `except`, which
    # would also swallow unrelated errors (and KeyboardInterrupt).
    if denominator == 0:
        return 0
    return (1 + beta * beta) * precision * recall / denominator


def multi_precision(pred_y, true_y, labels):
    """
    Macro-averaged precision over several classes.
    :param pred_y: predictions (a list of one-element lists is flattened first)
    :param true_y: true labels
    :param labels: list of class labels to average over
    :return: mean of the per-class precisions; 0 for empty input
    """
    # Guard the pred_y[0] probe so an empty batch yields 0 instead of IndexError.
    if len(pred_y) > 0 and isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]

    precisions = [binary_precision(pred_y, true_y, label) for label in labels]
    return mean(precisions)


def multi_recall(pred_y, true_y, labels):
    """
    Macro-averaged recall over several classes.
    :param pred_y: predictions (a list of one-element lists is flattened first)
    :param true_y: true labels
    :param labels: list of class labels to average over
    :return: mean of the per-class recalls; 0 for empty input
    """
    # Guard the pred_y[0] probe so an empty batch yields 0 instead of IndexError.
    if len(pred_y) > 0 and isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]

    recalls = [binary_recall(pred_y, true_y, label) for label in labels]
    return mean(recalls)


def multi_f_beta(pred_y, true_y, labels, beta=1.0):
    """
    Macro-averaged F-beta score over several classes.
    :param pred_y: predictions (a list of one-element lists is flattened first)
    :param true_y: true labels
    :param labels: list of class labels to average over
    :param beta: beta value passed to the per-class F-beta
    :return: mean of the per-class F-beta scores; 0 for empty input
    """
    # Guard the pred_y[0] probe so an empty batch yields 0 instead of IndexError.
    if len(pred_y) > 0 and isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]

    f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels]
    return mean(f_betas)


def get_binary_metrics(pred_y, true_y, f_beta=1.0):
    """Return (accuracy, recall, precision, f_beta) for binary predictions.

    Note the order of the tuple: recall comes before precision.
    """
    return (
        accuracy(pred_y, true_y),
        binary_recall(pred_y, true_y),
        binary_precision(pred_y, true_y),
        binary_f_beta(pred_y, true_y, f_beta),
    )


def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0):
    """Return (accuracy, recall, precision, f_beta), macro-averaged over `labels`.

    Note the order of the tuple: recall comes before precision.
    """
    return (
        accuracy(pred_y, true_y),
        multi_recall(pred_y, true_y, labels),
        multi_precision(pred_y, true_y, labels),
        multi_f_beta(pred_y, true_y, labels, f_beta),
    )

In [None]:
# Train (or resume training) the Bi-LSTM, evaluate and checkpoint periodically, and
# finally export a SavedModel.
trainReviews = data.trainReviews
trainLabels = data.trainLabels
evalReviews = data.evalReviews
evalLabels = data.evalLabels

wordEmbedding = data.wordEmbedding
labelList = data.labelList

with tf.Graph().as_default():
    session_conf = tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9
    sess = tf.compat.v1.Session(config=session_conf)

    with sess.as_default():
        lstm = BiLSTM(config, wordEmbedding)

        globalStep = tf.compat.v1.train.get_or_create_global_step()
        optimizer = tf.compat.v1.train.AdamOptimizer(config.training.learningRate)
        gradsAndVars = optimizer.compute_gradients(lstm.loss)
        trainOp = optimizer.apply_gradients(gradsAndVars, global_step=globalStep)

        # Gradient summaries, kept exactly as before.
        for g, v in gradsAndVars:
            if g is not None:
                tf.summary.histogram("{}/grad/hist".format(v.name), g)
                tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))

        outDir = os.path.abspath("/content/drive/MyDrive/Project/Bi-LSTM/summarys")
        print("Writing to {}\n".format(outDir))

        lossSummary = tf.compat.v1.summary.scalar("loss", lstm.loss)
        summaryOp = tf.compat.v1.summary.merge_all()

        trainSummaryDir = os.path.join(outDir, "train")
        trainSummaryWriter = tf.compat.v1.summary.FileWriter(trainSummaryDir, sess.graph)

        evalSummaryDir = os.path.join(outDir, "eval")
        evalSummaryWriter = tf.compat.v1.summary.FileWriter(evalSummaryDir, sess.graph)

        # One Saver over this graph's variables handles both restoring and saving.
        # Importing the stored meta graph on top of the freshly built graph (as was done
        # before) duplicates the graph and pins the cell to one hard-coded step file.
        checkpointDir = "/content/drive/MyDrive/Project/Bi-LSTM/model"
        os.makedirs(checkpointDir, exist_ok=True)
        saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables(), max_to_keep=5)

        # Initialise everything first so the cell also works when no checkpoint exists
        # yet; a restore then overwrites the initial values.
        sess.run(tf.compat.v1.global_variables_initializer())
        latestCheckpoint = tf.compat.v1.train.latest_checkpoint(checkpointDir)
        if latestCheckpoint is not None:
            saver.restore(sess, latestCheckpoint)
            print("Restored model checkpoint from {}\n".format(latestCheckpoint))

        def trainStep(batchX, batchY):
            """
            Run one optimisation step on a batch.

            Returns (loss, acc, prec, recall, f_beta) measured on that batch and writes
            the merged summary to the training summary writer.
            """
            feed_dict = {
              lstm.inputX: batchX,
              lstm.inputY: batchY,
              lstm.dropoutKeepProb: config.model.dropoutKeepProb
            }
            _, summary, step, loss, predictions = sess.run([trainOp, summaryOp, globalStep, lstm.loss, lstm.predictions], feed_dict)

            # The metric helpers return (acc, recall, precision, f_beta) in that order.
            if config.numClasses == 1:
                acc, recall, prec, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)

            elif config.numClasses > 1:
                acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY,
                                                              labels=labelList)

            trainSummaryWriter.add_summary(summary, step)

            return loss, acc, prec, recall, f_beta

        def devStep(batchX, batchY):
            """
            Evaluate one batch without dropout and without updating the weights.

            Returns (loss, acc, precision, recall, f_beta) and writes the merged summary
            to the evaluation summary writer.
            """
            feed_dict = {
              lstm.inputX: batchX,
              lstm.inputY: batchY,
              lstm.dropoutKeepProb: 1.0
            }
            summary, step, loss, predictions = sess.run(
                [summaryOp, globalStep, lstm.loss, lstm.predictions],
                feed_dict)

            # The helpers return recall BEFORE precision; unpacking them the other way
            # round (as this function used to) swapped the two reported values.
            if config.numClasses == 1:
                acc, recall, precision, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)
            elif config.numClasses > 1:
                acc, recall, precision, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY, labels=labelList)

            evalSummaryWriter.add_summary(summary, step)

            return loss, acc, precision, recall, f_beta

        for i in range(config.training.epoches):
            print("start training model")
            for batchTrain in nextBatch(trainReviews, trainLabels, config.batchSize):
                loss, acc, prec, recall, f_beta = trainStep(batchTrain[0], batchTrain[1])

                currentStep = tf.compat.v1.train.global_step(sess, globalStep)
                print("train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
                    currentStep, loss, acc, recall, prec, f_beta))
                if currentStep % config.training.evaluateEvery == 0:
                    print("\nEvaluation:")

                    losses = []
                    accs = []
                    f_betas = []
                    precisions = []
                    recalls = []

                    for batchEval in nextBatch(evalReviews, evalLabels, config.batchSize):
                        loss, acc, precision, recall, f_beta = devStep(batchEval[0], batchEval[1])
                        losses.append(loss)
                        accs.append(acc)
                        f_betas.append(f_beta)
                        precisions.append(precision)
                        recalls.append(recall)

                    time_str = datetime.datetime.now().isoformat()
                    print("{}, step: {}, loss: {}, acc: {},precision: {}, recall: {}, f_beta: {}".format(time_str, currentStep, mean(losses), 
                                                                                                       mean(accs), mean(precisions),
                                                                                                       mean(recalls), mean(f_betas)))

                if currentStep % config.training.checkpointEvery == 0:
                    path = saver.save(sess, "/content/drive/MyDrive/Project/Bi-LSTM/model/my-model", global_step=currentStep)
                    print("Saved model checkpoint to {}\n".format(path))

        # Export for serving. `builder` was never created before, so this section failed
        # with NameError. SavedModelBuilder refuses to write into an existing directory,
        # hence a fresh timestamped sub-directory per export.
        savedModelPath = os.path.join("/content/drive/MyDrive/Project/Bi-LSTM/savedModel", str(int(time.time())))
        builder = tf.compat.v1.saved_model.builder.SavedModelBuilder(savedModelPath)

        inputs = {"inputX": tf.compat.v1.saved_model.utils.build_tensor_info(lstm.inputX),
                  "keepProb": tf.compat.v1.saved_model.utils.build_tensor_info(lstm.dropoutKeepProb)}

        outputs = {"predictions": tf.compat.v1.saved_model.utils.build_tensor_info(lstm.predictions)}

        prediction_signature = tf.compat.v1.saved_model.signature_def_utils.build_signature_def(
            inputs=inputs, outputs=outputs,
            method_name=tf.compat.v1.saved_model.signature_constants.PREDICT_METHOD_NAME)
        legacy_init_op = tf.group(tf.compat.v1.tables_initializer(), name="legacy_init_op")
        builder.add_meta_graph_and_variables(sess, [tf.compat.v1.saved_model.tag_constants.SERVING],
                                             signature_def_map={"predict": prediction_signature},
                                             legacy_init_op=legacy_init_op)

        builder.save()

Writing to /content/drive/MyDrive/Project/Bi-LSTM/summarys

start training model
train: step: 13101, loss: 0.5577394962310791, acc: 0.625, recall: 0.35294117647058826, precision: 0.8571428571428571, f_beta: 0.5
train: step: 13102, loss: 0.5218223333358765, acc: 0.6875, recall: 0.45714285714285713, precision: 0.9411764705882353, f_beta: 0.6153846153846154
train: step: 13103, loss: 0.4933256506919861, acc: 0.75, recall: 0.5862068965517241, precision: 0.8095238095238095, f_beta: 0.68
train: step: 13104, loss: 0.5491538643836975, acc: 0.640625, recall: 0.4782608695652174, precision: 0.5, f_beta: 0.4888888888888889
train: step: 13105, loss: 0.5740712285041809, acc: 0.703125, recall: 0.5757575757575758, precision: 0.7916666666666666, f_beta: 0.6666666666666667
train: step: 13106, loss: 0.5120251774787903, acc: 0.734375, recall: 0.5555555555555556, precision: 0.75, f_beta: 0.6382978723404256
train: step: 13107, loss: 0.5779334306716919, acc: 0.671875, recall: 0.6296296296296297, precision: 0.

Instructions for updating:
Use standard file APIs to delete files with this prefix.


Saved model checkpoint to /content/drive/MyDrive/Project/Bi-LSTM/model/my-model-13700

train: step: 13701, loss: 0.5855984687805176, acc: 0.71875, recall: 0.6666666666666666, precision: 0.7142857142857143, f_beta: 0.689655172413793
train: step: 13702, loss: 0.48769986629486084, acc: 0.75, recall: 0.5, precision: 0.75, f_beta: 0.6
train: step: 13703, loss: 0.5835672616958618, acc: 0.71875, recall: 0.5483870967741935, precision: 0.8095238095238095, f_beta: 0.6538461538461537
train: step: 13704, loss: 0.566433310508728, acc: 0.625, recall: 0.5142857142857142, precision: 0.72, f_beta: 0.6
train: step: 13705, loss: 0.611892580986023, acc: 0.734375, recall: 0.5666666666666667, precision: 0.8095238095238095, f_beta: 0.6666666666666666
train: step: 13706, loss: 0.5776333808898926, acc: 0.640625, recall: 0.48, precision: 0.5454545454545454, f_beta: 0.5106382978723404
train: step: 13707, loss: 0.5132969617843628, acc: 0.71875, recall: 0.6785714285714286, precision: 0.6785714285714286, f_beta: 0.