# CoreNLP server

In [7]:
# init and start CoreNLP server
import os

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

The system cannot find the path specified.


In [2]:
# !wget "https://nlp.stanford.edu/software/stanford-corenlp-latest.zip"
# !unzip "stanford-corenlp-latest.zip"

In [8]:
cd ./stanford-corenlp-4.5.1/
!nohup java -mx5g -cp "./*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 10000 &

# shut down CoreNLP server

# !ps aux | grep java
# !kill 719

SyntaxError: invalid syntax (<ipython-input-8-84431c963d7b>, line 1)

In [16]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xuzhi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\xuzhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\xuzhi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\xuzhi\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\xuzhi\AppData\Roaming\nltk_data...


True

# Code of extracting metrics

In [17]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.parse.corenlp import CoreNLPServer, CoreNLPParser

import pickle
import string
import numpy as np
import pandas as pd
import os 
from scipy import stats
from sklearn.preprocessing import LabelEncoder
import spacy

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

nlp_pipeline = spacy.load("en_core_web_sm")
parser = CoreNLPParser(url='http://localhost:9001')



## Input feature metrics

In [18]:
def lexical(sentence):
    doc = nlp_pipeline(sentence)
    oov = 0
    # verb
    VB = 0
    # noun
    NN = 0
    # adj
    JJ = 0
    # adv
    RB = 0
    # conj
    CONJ = 0

    length = 0

    for token in doc:
        length += 1
        if token.tag_.startswith("J"):
            JJ += 1
        if token.tag_.startswith("N"):
            NN += 1
        if token.tag_.startswith("R"):
            RB += 1
        if token.tag_.startswith("V"):
            VB += 1
        if token.tag_.startswith("C"):
            CONJ += 1

    return VB, NN, JJ, RB, CONJ, len(list(doc.sents)), length

# Degree of polysemy
def polysemy(clean_line):
    polysemyCount = 0
    words = clean_line.split(" ")
    for w in words:
        polysemyCount += len(wn.synsets(w))
    return float(polysemyCount) / float(len(words))


# dependency distance
def depend_dist(sentence):
    doc = nlp_pipeline(sentence)
    sum_dist = 0
    for sent in doc.sents:
        sent_dist = 0
        for token in sent:
            if not token.is_punct:
                for child in token.children:
                    sent_dist += abs(token.i - child.i)
        sum_dist += sent_dist
    return float(sum_dist) / float(len(list(doc.sents)))


# height of constituency parsing tree
def const_parse(sentences):
    doc = nlp_pipeline(sentences)
    sum_height = 0
    non_term_count = 0
    term_count = 0
    for sent in doc.sents:
        if sent:
            try:
                res = next(parser.raw_parse(sent.text))
            except Exception as e:
                print(e)
                print(sent)
            else:
                sum_height += res.height()
                non_term_count += len(list(res.leaves()))
                term_count += len(list(res.subtrees()))

    return (
        float(sum_height) / float(len(list(doc.sents))),
        float(non_term_count) / float(term_count),
    )


# process sentiment score
def penn_to_wn(tag):
    """
    Convert between the PennTreebank tags to simple Wordnet tags
    """
    if tag.startswith("J"):
        return wn.ADJ
    elif tag.startswith("N"):
        return wn.NOUN
    elif tag.startswith("R"):
        return wn.ADV
    elif tag.startswith("V"):
        return wn.VERB
    return None


def get_sentiment(word, tag):
    """ 
    returns list of pos neg and objective score. But returns empty list 
    if not present in senti wordnet. 
    """

    wn_tag = penn_to_wn(tag)

    synsets = wn.synsets(word, pos=wn_tag)
    if not synsets:
        return [0, 1, 0]

    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    return [
            swn_synset.pos_score(), 
            swn_synset.neg_score(), 
            swn_synset.obj_score(),
    ]


def senti_pro(clean_sent):
    flip_count = 0
    words = clean_sent.split(" ")
    pos_val = nltk.pos_tag(words)
    senti_ret = np.array([get_sentiment(x, y) for (x, y) in pos_val])
    senti_val = senti_ret.T[0] - senti_ret.T[1]
    senti_score = abs(senti_ret.T[0].sum() - senti_ret.T[1].sum())
    
    # sentiment flip count
    for i in range(senti_val.shape[0] - 1):
        if senti_val[i] * senti_val[i + 1] < 0:
            flip_count += 1

    return senti_score, flip_count

## Fetch&Process data

In [19]:
def modi_data(obj_df):
    obj_df = obj_df[obj_df["label"] != 2]
    # obj_df["label"] = obj_df["label"].astype(string)
    obj_df.label = obj_df.label.replace(0, "neg")
    obj_df.label = obj_df.label.replace(1, "neg")
    obj_df.label = obj_df.label.replace(3, "pos")
    obj_df.label = obj_df.label.replace(4, "pos")
    obj_df = obj_df.fillna(0)
    # obj_df["label"] = obj_df["label"].astype(int)
    return obj_df


def clean_sent(sentence):
    # split into tokens by white space
    tokens = sentence.split()
    # remove punctuation from each token
    table = str.maketrans("", "", string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words("english"))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens


def pre_proc(sentence):
    words = sentence.split(" ")
    if len(words) > max_len:
        sentence = " ".join(words[:max_len])
    return sentence

In [20]:
# set global var

data_path = "./SST-2/"
tokenizer_path = "./tokenizer/49in_SST.pickle"
max_len = 49
remove_threshold = 1e-5

In [21]:
train_df = pd.read_csv(
    data_path + "train.csv", header=None, sep="\t", names=["label", "text"]
)

test_df = pd.read_csv(
    data_path + "test.csv", header=None, sep="\t", names=["label", "text"]
)


with open(tokenizer_path, "rb") as handle:
    tokenizer = pickle.load(handle)

test_df = modi_data(test_df)
test_sentence = test_df.text.tolist()
# test_clean_tokens = [clean_sent(i) for i in test_sentence]
test_x = tokenizer.texts_to_sequences(test_sentence)
test_x = pad_sequences(test_x, maxlen=max_len, padding="post")
test_label = test_df.label.tolist()
test_sentence = list(map(pre_proc, test_sentence))

test_y = np.array(test_label)
test_y[test_y == 'pos'] = 1
test_y[test_y == 'neg'] = 0
test_y = test_y.reshape(-1,1).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [23]:
# clean_line = [" ".join(n) for n in test_clean_tokens]
inp_feature_df = pd.DataFrame(
    {"Sentence": test_sentence, 
    #  "CleanLine": clean_line, 
     "TrueRes": test_y.reshape(-1)}
)

inp_feature_df["VBCount"], inp_feature_df["NNCount"], \
inp_feature_df["JJCount"], inp_feature_df["RBCount"], \
inp_feature_df["ConjCount"], inp_feature_df["SentCount"], \
inp_feature_df["Length"] = zip(
    *inp_feature_df["Sentence"].map(lexical)
)

inp_feature_df["Polysemy"] = inp_feature_df["Sentence"].map(polysemy)

# dependency distance
inp_feature_df["DependDist"] = inp_feature_df["Sentence"].map(depend_dist)

# sentiment process
inp_feature_df["SentiScore"], inp_feature_df["SentiFlip"] = zip(
    *inp_feature_df["Sentence"].map(senti_pro)
)

# constituency parsing tree
inp_feature_df["ConstHeight"], inp_feature_df["TerminalRatio"] = \
zip(*inp_feature_df["Sentence"].map(const_parse))

In [24]:
inp_feature_df.to_csv("./Metrics/SST_inp.csv", index=False)

## Output feature metrics

In [25]:
def get_last_layer_model(model):
    layer_names = [layer.name for layer in model.layers]
    layer_output = model.get_layer(layer_names[-2]).output
    ret = Model(model.input, layer_output)

    return ret

def get_train_at(train_df, last_layer_model):
    train_df = modi_data(train_df)
    pos_df = train_df[train_df["label"] == "pos"]
    neg_df = train_df[train_df["label"] == "neg"]

    pos_sentence = pos_df.text.tolist()
    pos_x = tokenizer.texts_to_sequences(pos_sentence)
    pos_x = pad_sequences(pos_x, maxlen = max_len, padding="post")

    neg_sentence = neg_df.text.tolist()
    neg_x = tokenizer.texts_to_sequences(neg_sentence)
    neg_x = pad_sequences(neg_x, maxlen = max_len, padding="post")
    
    ret = {}
    ret["pos"] = last_layer_model.predict(pos_x)
    ret["neg"] = last_layer_model.predict(neg_x)
    return ret

def get_kernels(train_at):
    removed_cols={'pos': [], 'neg': []}

    for i in range(train_at["pos"].T.shape[0]):
        if np.var(train_at["pos"].T[i]) < remove_threshold:
            removed_cols['pos'].append(i)
    for i in range(train_at["neg"].T.shape[0]):
        if np.var(train_at["neg"].T[i]) < remove_threshold:
            removed_cols['neg'].append(i)

    pos_vals = np.delete(train_at["pos"].T, removed_cols['pos'], axis=0)
    neg_vals = np.delete(train_at["neg"].T, removed_cols['neg'], axis=0)

    kernels={}
    kernels["pos"] = stats.gaussian_kde(pos_vals)
    kernels["neg"] = stats.gaussian_kde(neg_vals)

    return kernels, removed_cols
    

def get_lsa(kernels, removed_cols, test_pred, test_label):
    lsa=[]
    
    for i in range(len(test_pred)):
        value = np.delete(test_pred[i], removed_cols[test_label[i]])
        temp = np.negative(np.log(kernels[test_label[i]](value)))
        
        lsa.append(temp[0])
    
    return lsa

def find_closest_at(at, train_at):
    """The closest distance between subject AT and training ATs.
    Args:
        at (list): List of activation traces of an input.        
        train_at (list): List of activation traces in training set (filtered)
        
    Returns:
        dist (int): The closest distance.
        at (list): Training activation trace that has the closest distance.
    """

    dist = np.linalg.norm(at - train_at, axis=1)
    return (min(dist), train_at[np.argmin(dist)])


def get_dsa(test_pred, test_label, train_at):
    ret = []
    
    for i in range(len(test_pred)):
        label = test_label[i]
        at = test_pred[i]
        a_dist, a_dot = find_closest_at(at, train_at[label])
        b_dist, _ = find_closest_at(
            a_dot, train_at[list(set(["pos", "neg"]) - set([label]))[0]]
        )
        ret.append(a_dist / b_dist)
    return ret

def val_to_res(val):
    if val > 0.5:
        return 1
    else:
        return 0

def deep_gini(val):
    ret = float(1 - pow(val, 2) - pow(1 - val, 2))
    return ret


In [31]:
def Process(model_path, out_path):
    inp_feature_df = pd.read_csv(
        "./Metrics/SST_inp.csv",
    )

    dirs = os.listdir(model_path)
    for i in dirs:
        if os.path.splitext(i)[1] == ".hdf5":
            model = get_Trans(max_len)
            model.load_weights(model_path + i)
            # model = load_model(model_path + i)

            # verify
            test_scores = model.evaluate(test_x, test_y, verbose=0)
            print(i, test_scores)
            if test_scores[1] < float("0." + i.split('_')[1][:4]):
                raise Exception("Invalid acc! " + i, test_scores[1])
            
            pred_val = model.predict(test_x)
            pred_res = [val_to_res(n) for n in pred_val]
            output_feature_df = pd.DataFrame(
                {"PredVal": pred_val[:, 0], "PredRes": pred_res}
            )
            output_feature_df["DeepGini"] = \
            output_feature_df["PredVal"].map(deep_gini)
            
            last_layer_model = get_last_layer_model(model)
            test_pred = last_layer_model.predict(test_x)
            train_at = get_train_at(train_df, last_layer_model)
            kernels, removed_cols = get_kernels(train_at)
            lsa = get_lsa(kernels, removed_cols, test_pred, test_label)
            dsa = get_dsa(test_pred, test_label, train_at)
            output_feature_df["LSA"] = lsa
            output_feature_df["DSA"] = dsa
            
            res_df = pd.concat([inp_feature_df, output_feature_df], axis=1)

            res_df["isRight"] = res_df.apply(
                lambda x: x["TrueRes"] == x["PredRes"], axis=1
            )
            res_df.to_csv(out_path + os.path.splitext(i)[0] + ".csv")
            print("model " + os.path.splitext(i)[0] + " is saved")
    print("done!")

In [32]:
model_path = "./Model/Trans_SST/"
out_path = "./Metrics/Trans_SST/"

Process(model_path, out_path)

4484_8132.hdf5 [0.44842463731765747, 0.8127402663230896]


Exception: ('Invalid acc! 4484_8132.hdf5', 0.8127402663230896)

In [30]:
from tensorflow.keras import layers
from tensorflow.keras.layers import (
    Input,
    Dense,
    Embedding,
    Flatten,
    Conv1D,
    MaxPooling1D,
    Add,
    Lambda,
    Dropout,
    concatenate,
)

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
        })
        return config

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.vocab_size = vocab_size
        self.maxlen = maxlen
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'vocab_size': self.vocab_size,
            'maxlen': self.maxlen,
            'embed_dim': self.embed_dim,
        })
        return config

def get_Trans(max_len):
    embed_dim = 32  # Embedding size for each token
    num_heads = 2  # Number of attention heads
    ff_dim = 32  # Hidden layer size in feed forward network inside transformer


    embedding_layer = TokenAndPositionEmbedding(max_len, len(tokenizer.word_index) + 1, embed_dim)

    inputs = Input(shape=(max_len,), dtype="int32", name="input")
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(20, activation="relu")(x)
    # x = layers.Dropout(0.1)(x)

    o = Dense(1, activation="sigmoid", name="output")(x)

    model = Model(inputs=inputs, outputs=o)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])

    return model