In [1]:
import numpy as np
import pandas as pd
import sys
import random
import progressbar
import torch
import pickle
from mytree import *
from utils import *
from treeUtil import *
import tqdm
import argparse
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
import functools
from sklearn.linear_model import LogisticRegression
import gensim
from gensim.models import KeyedVectors
import string
from stanfordcorenlp import StanfordCoreNLP
import nltk.tree
from ast import literal_eval
import pptree
import logging

In [2]:
from nltk.tokenize import word_tokenize
import re
import os
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from logger import Logger

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  return f(*args, **kwds)


In [3]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from spacy.pipeline import merge_entities
nlp = en_core_web_sm.load()
nlp.add_pipe(merge_entities)

In [4]:
import copy
from sklearn.utils import shuffle
from sklearn import metrics

In [5]:
# False if in-domain; True if general
proanti = True
w2vec = True
ner = False
blackout = False
balanced = True
undersample = False
non_trainable = False
economic = True

In [6]:
dic = {0:'ProAnti', 1:'General', 2:'NER', 3:'Blackout', 4:'Balance', 5:'Undersample', 6:'Fixed'}
bool_list = [proanti, w2vec, ner, blackout, balanced, undersample, non_trainable]
corpus_path = '../data/new/economic'
namecode = 'economic_w2v'
for index, bb in enumerate(bool_list):
    namecode += '_'
    if bb:
        namecode += '1'
    else:
        namecode += '0'
    namecode += dic[index]
namecode

'economic_w2v_1ProAnti_1General_0NER_0Blackout_1Balance_0Undersample_0Fixed'

In [7]:
CUDA=False
def Var(v):
    if CUDA: return Variable(v.cuda())
    else: return Variable(v)
widgets = [progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()]

In [8]:
class RecursiveNN(nn.Module):
    def __init__(self, word_embeddings, vocab, embedSize=300, numClasses=2, beta = 0.3, use_weight = True, non_trainable = non_trainable):
        super(RecursiveNN, self).__init__()
#             if (w2vec):
#                 self.embedding = nn.Embedding(len(vocab), embedSize)
#                 self.embedding.load_state_dict({'weight': w2vec_weights})
        self.embedding = nn.Embedding.from_pretrained(word_embeddings)
        self.embedding.weight.requires_grad = True
        if non_trainable:
            self.embedding.weight.requires_grad = False
        else:
            self.embedding = nn.Embedding(len(vocab), embedSize)
        self.embedding = nn.Embedding(len(vocab), embedSize)
        self.W = nn.Linear(2*embedSize, embedSize, bias=True)
        self.nonLinear = torch.tanh
        self.projection = nn.Linear(embedSize, numClasses, bias=True)
        self.nodeProbList = []
        self.labelList = []
        self.loss = Var(torch.FloatTensor([0]))
        self.V = vocab
        self.beta = beta
        self.use_weight = use_weight
        self.total_rep = None #
        self.count_rep = 0 #
        self.numClasses = numClasses

    def traverse(self, node):
        if node.isLeaf:
            if node.getLeafWord() in self.V:  # check if right word is in vocabulary
                word = node.getLeafWord()
            else:  # otherwise use the unknown token
                word = 'UNK'
            # print(self.V[word],len(self.V),word,(torch.LongTensor([int(self.V[word])])))
            currentNode = (self.embedding(Var(torch.LongTensor([int(self.V[word])]))))
        else: currentNode = self.nonLinear(self.W(torch.cat((self.traverse(node.left),self.traverse(node.right)),1)))
        currentNode = currentNode/(torch.norm(currentNode))

        assert node.label!=None
        self.nodeProbList.append(self.projection(currentNode))
        # print (node.label)
        self.labelList.append(torch.LongTensor([node.label]))
        loss_weight = 1-self.beta if node.annotated else self.beta
        self.loss += (loss_weight*F.cross_entropy(input=torch.cat([self.projection(currentNode)]),target=Var(torch.cat([torch.LongTensor([node.label])]))))

        #
        if not node.isRoot():
            if self.total_rep is None:
                self.total_rep = currentNode.data.clone()
            else:
                self.total_rep += currentNode.data.clone()
            self.count_rep += 1
        #

        return currentNode        

    def forward(self, x):
        self.nodeProbList = []
        self.labelList = []
        self.loss = Var(torch.FloatTensor([0]))
        self.traverse(x)
        self.labelList = Var(torch.cat(self.labelList))
        return torch.cat(self.nodeProbList)

    def getLoss(self, tree):
        nodes = self.forward(tree)
        predictions = nodes.max(dim=1)[1]
        loss = self.loss
        return predictions,loss

    def getRep(self, tree):
        self.count_rep = 0
        self.total_rep = None
        self.nodeProbList = []
        self.labelList = []
        self.loss = Var(torch.FloatTensor([0]))

        root_rep = self.traverse(tree)

        return (torch.cat((root_rep,self.total_rep/self.count_rep),1)).data.numpy().T.flatten()


    def evaluate(self, trees):
            pbar = progressbar.ProgressBar(widgets=widgets, maxval=len(trees)).start()
            n = nAll = correctRoot = correctAll = 0.0
            tp = [1e-2]*self.numClasses
            fp = [1e-2]*self.numClasses
            fn = [1e-2]*self.numClasses
            f1 = [0.]*self.numClasses
            for j, tree in enumerate(trees):
                predictions,_ = self.getLoss(tree.root)
#                     print((predictions.cpu().data).numpy(),(predictions.cpu().data).numpy().shape)
#                     print((self.labelList.cpu().data).numpy(), (self.labelList.cpu().data).numpy().shape)
                correct = ((predictions.cpu().data).numpy()==(self.labelList.cpu().data).numpy())
#                     print(correct)
                correctAll += correct.sum()
                nAll += np.shape(correct.squeeze())[0] if np.size(correct)!=1 else 1 
                correctRoot += correct.squeeze()[-1] if np.size(correct)!=1 else correct[-1]
#                     print(correct.squeeze()[-1] if np.size(correct)!=1 else correct[-1])
#                     print('actual: {}'.format(tree.root.label))
                for i in range(self.numClasses):
                    size = np.size((predictions.cpu().data).numpy())
                    if size!=1:
                        pred = (predictions.cpu().data).numpy().squeeze()[-1]
                        actual = (self.labelList.cpu().data).numpy().squeeze()[-1]
                    else:
                        pred = (predictions.cpu().data).numpy()[-1]
                        actual = (self.labelList.cpu().data).numpy()[-1]
                    if pred==i and actual==i:
                        tp[i]+=1
                    elif pred==i and actual!=i:
                        fn[i]+=1
                    elif pred==i and actual!=i:
                        fp[i]+=1
                n += 1
                pbar.update(j)
#             print(tp,fp,fn)
            for i in range(self.numClasses):
                p =(1.0*tp[i]/(tp[i]+fp[i]))
                r =(1.0*tp[i]/(tp[i]+fn[i]))
                f1[i] = (2*p*r)/(p+r)
            pbar.finish()
            return correctRoot / n, correctAll/nAll, f1

    def eval_sent_lvl(self,trees,clf):
        pbar = progressbar.ProgressBar(widgets=widgets, maxval=len(trees)).start()
        n = nAll = correctRoot = correctAll = 0.0
        X_predict = []
        Y_gold = []
        for j, tree in enumerate(trees):
            tree_rep = model.getRep(tree.root)
            X_predict.append(tree_rep)
            Y_gold.append(tree.root.label)
        acc = clf.score(np.array(X_predict),np.array(Y_gold))
        return acc

In [9]:
file_trees = 'economic_balanced_over.pkl'

In [10]:
def make_tree(text):
#     print(text)
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    output = literal_eval(output)
    try:
        tree = str(output['sentences'][0]['parse'])
    except:
        print(output,text)
        return
    # print (tree)
    parse_string = ' '.join(str(tree).split())
    # print(parse_string)
    # print ("\n\n")
    tree = nltk.tree.Tree.fromstring(parse_string)
    tree.chomsky_normal_form()
    tree.collapse_unary(collapseRoot=True,collapsePOS=True)
    nt = convertNLTK_tree(tree)
    return nt

def printLabelTree(tree):
    def inorder(node,nnode):
        if node.isLeaf:
            newnode = pptree.Node('H',nnode)
            wnode = pptree.Node(node.word,newnode)
        elif nnode is not None:
            newnode = pptree.Node('H',nnode)
            inorder(node.left,newnode)
            inorder(node.right,newnode)
        elif node.isRoot():
            newnode = pptree.Node('H')
            inorder(node.left,newnode)
            inorder(node.right,newnode)
            return newnode
        return None
    pptree.print_tree(inorder(tree.root,None))

def create_trees_using_df(df):
    tree = []
    for tokens in list(df['tokens']):
        if len(tokens)==0:
            continue
        line = ' '.join(tokens)
        line += '\n'
        tree.append(make_tree(line))
    return tree

def printlabel(root,l):
    if root:
        l.append(root.label)
#         print(root.label)
        if root.left:
            l+=printlabel(root.left,[])
#             print(printlabel(root.left))
        if root.right:
            l+=printlabel(root.right,[])
#             print(printlabel(root.right))
    return l

In [11]:
[pro_test,anti_test] = pickle.load(open(os.path.join('./trees',file_trees+"_test"),'rb'))

In [12]:
for pro_tree in pro_test:
    pro_tree.root.set_label('pro')
for anti_tree in anti_test:
    anti_tree.root.set_label('anti')

In [13]:
def combine(neutral,non_neutral):
    trees = []
    trees.extend(neutral)
    trees.extend(non_neutral)
    random.shuffle(trees)
    return trees

In [14]:
from utils import *

In [15]:
from mytree import *
from treeUtil import *
# from pycorenlp import StanfordCoreNLP

val = {'pro':0,'anti':1,'default':-1}

# pro:1, anti:0, neutral:2
val_all = {'pro':1,'anti':0,'default':-1,'neutral':2}
# neutral:0, non-neutral:1
val_neutral = {'neutral':0, 'non_neutral':1, 'default':-1}


def convert(T):
    label = val[T.label] if (hasattr(T,'label')) else None
    print(label)
    newTree = convert_primary_new(T,label)
    annotate_all(newTree)

    return newTree

def convert_neutral(T):
    label = val_neutral[T.label] if (hasattr(T,'label')) else None
    print(label)
    newTree = convert_primary_new(T,label)
    annotate_all(newTree)

    return newTree

def convert_all(T):
    label = val_all[T.label] if (hasattr(T,'label')) else None
    print(label)
    newTree = convert_primary_new(T,label)
    annotate_all(newTree)

    return newTree


def convert_primary(T):
    if (hasattr(T,'label')):
        print(T.label) 
    label = val[T.label] if (hasattr(T,'label')) else None
    # label = val[T.label] if (hasattr(T,'label') ) else None # changed for ignoring neutral

    if isinstance(T,leafObj):
        newTree = Node(label,T.word,T.pos)
        newTree.isLeaf = True
        return newTree
    else:
        newTree = Node(label)
    
    leftChild = convert_primary(T.c1)
    rightChild = convert_primary(T.c2)
    leftChild.parent = newTree
    rightChild.parent = newTree

    newTree.left = leftChild
    newTree.right = rightChild

    return newTree

def convert_primary_new(T,label):
    # from IPython import embed; embed()
    if T is None:
        return None
    # label = val[T.label] if (hasattr(T,'label') ) else None # changed for ignoring neutral
    T.set_label(label)
    # if (T.isLeaf) : print (T.word)

    T.left = convert_primary_new(T.left,label)
    T.right = convert_primary_new(T.right,label)

    return T

    # if T.isLeaf:
    #     newTree = Node(label,T.word,T.pos)
    #     newTree.isLeaf = True
    #     return newTree
    # else:
    #     newTree = Node(label)
    
    # leftChild = convert_primary_new(T.left)
    # rightChild = convert_primary_new(T.right)
    # leftChild.parent = newTree
    # rightChild.parent = newTree

    # newTree.left = leftChild
    # newTree.right = rightChild

    # return newTree

def convertNLTK_tree_primary(tree):
    if tree.height()==2:
        newTree = Node('default',tree[0],None)
        newTree.isLeaf = True
        return newTree
    newTree = Node('default')
    leftChild = convertNLTK_tree_primary(tree[0])
    rightChild = convertNLTK_tree_primary(tree[1])
    
    leftChild.parent = newTree
    rightChild.parent = newTree

    newTree.left = leftChild
    newTree.right = rightChild

    return newTree

def convertNLTK_tree(tree):
    return Tree(convertNLTK_tree_primary(tree))




def annotate_all(T):
    if T == None: return
    if T.label != None : 
        T.annotated = True
    else:
        T.annotated = False
        T.set_label(T.parent.label)
    annotate_all(T.left)
    annotate_all(T.right)

def buildBalTree(sent):
    words = sent.split(' ')

    nodes = words

    while len(nodes)>1:
        temp = []
        for i in range(0,len(nodes),2):
            lChild = Node(None,nodes[i],None) if isinstance(nodes[i],str) else nodes[i]
            if i+1<len(nodes):
                rChild = Node(None,nodes[i+1],None) if isinstance(nodes[i+1],str) else nodes[i+1]
            else:
                rChild = None
            if isinstance(nodes[i],str):
                lChild.isLeaf = True
                if rChild is not None:
                    rChild.isLeaf = True
            newNode = Node(None)
            lChild.parent = newNode
            newNode.left = lChild
            newNode.right = rChild
            if rChild is not None:
                rChild.parent = newNode
            temp.append(newNode)
        nodes=temp
    return Tree(nodes[0])

def readFile2Trees(filename):
    trees = []
    with open(filename,'r') as file:
        for line in file:
            if line=='\n':
                continue
            else:
                [labelname,sent] = line.split(': ',1)
                tree = buildBalTree(sent)
                tree.root.set_label(val[labelname])
                if val[labelname]!=2:
                    trees.append(tree)
    return trees

In [16]:
trees_test = combine(pro_test,anti_test)
data_test = []
for i in range(len(trees_test)):
    data_test.append(Tree(convert(trees_test[i].root)))

0
0
1
0
0
0
0
1
0
0
1
1
0
1
0
1
0
0
0
1
0
0
0
0
1
0
0
0
0
0
0
1
1
0
1
1
0
0
0
1
1
0
0
0
0
1
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
1
0
0
1
0
0
0
1
0
1
0
0
0
0
0
1
0
0
0
1
0
0
1
1
0
0
0
0
1
1
0
0
0
0
0
1
0
1
0
0
0
0
1
0
0
0
0
0
1
0
0
0
1
1
1
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
1
1
1
1
0
1
1
0
0
0
1
0
1
1
0
0
1
1
0
0
1
0
0
0
1
1
0
0
1
1
1
1
1
1
0
1
0
0
0
1
0
0
1
0
0
1
1
0
0
0
1
1
1
0
0
1
0
0
0
0
1
0
1
1
0
0
0
0
0
0
0
0
1
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
1
1
0
0
1
1
0
0
1
0
0
0
0
0
1
0
0
0
1
0
0
1
0
0
0
0
0
1
0
0
1
1
1
1
0
1
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
1
0
1
0
0
1
0
1
1
0
0
1
0
1
0
1
0
0
0
1
1
1
0
0
0
1
1
1
1
0
1
0
1
0
1
1
1
1
1
1
0
0
0
0
0
1
1
1
1
1
0
0
1
0
1
0
0
1
0
0
1
0
1
0
1
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
1
0
0
1
0
0
1
1
0
1
0
0
0
0
0
1
1
1
0
1
1
0
0
0
0
0
1
0
0
1
0
1
1
0
0
0
0
0
0
1
0
1
1
0
0
0
1
0
1
0
1
0
0
1
0
1
0
1
0
0
0
1
0
0
0
1
0
0
0


In [17]:
model = pickle.load(open("/Users/navreetkaur/MTP/tech_classifier/models/"+namecode+'.pkl','rb'))

In [19]:
correctRoot, correctAll, f1 = model.evaluate(data_test)

100% |##########################################################| Time: 0:00:07


In [20]:
print("Validation all nodes accuracy: ", correctAll)

Validation all nodes accuracy:  0.5488566217732884


In [21]:
print("Validation root accuracy: ", correctRoot)

Validation root accuracy:  0.6431818181818182


In [22]:
print("Validation root accuracy: ", f1)

Validation root accuracy:  [0.8433459159906835, 0.6488624244578742]
