In [29]:
import numpy as np
import pandas as pd
import sys
import random
import progressbar
import torch
import pickle
from mytree import *
from utils import *
from treeUtil import *
import tqdm
import argparse
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
import functools
from sklearn.linear_model import LogisticRegression
import gensim
from gensim.models import KeyedVectors
import string
from stanfordcorenlp import StanfordCoreNLP
import nltk.tree
from ast import literal_eval
import pptree
import logging

In [2]:
from nltk.tokenize import word_tokenize
import re
import os
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder

In [3]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from spacy.pipeline import merge_entities
nlp = en_core_web_sm.load()
nlp.add_pipe(merge_entities)

In [4]:
import copy
from sklearn.utils import shuffle
from sklearn import metrics

In [5]:
proanti = True # True if doing pro/anti (ideology classification); False if doing nuetral/non-neutral classification (Step1: Stance Detection)
ner = True # if replacing named entities with their entity tags
blackout = False # if blacking out the named entities
balanced = True # if dataset has to be balanced or not
undersample = True # True if undersampling, False if oversampling
test = False # False if proccessing data for training, True otherwise
two_step = True # True if two-step classification (pro/anti or neutral/non-neutral); false if three-label classification (pro/anti/neutral)

In [6]:
# set name of the pickle file where trees will be stored 
file_trees = 'trees.pkl'

## Data Preprocessing

In [7]:
corpus_path = '/Users/navreetkaur/MTP/data/'
if not test:
    if not two_step:
        pro_file = os.path.join(corpus_path,'pro.txt')
        anti_file = os.path.join(corpus_path,'anti.txt')
        neutral_file = os.path.join(corpus_path,'neutral.txt')
    else:
        if proanti:
            pro_file = os.path.join(corpus_path,'pro.txt')
            anti_file = os.path.join(corpus_path,'anti.txt')
        else:
            neutral_file = os.path.join(corpus_path,'neutral.txt')
            non_neutral_file = os.path.join(corpus_path,'non_neutral.txt')
else:
    policy_file = os.path.join(corpus_path,'test.txt')

In [8]:
def replace_entity(sent):
    doc = nlp(sent)
    for ent in doc.ents:
        if ent.label_=='PERSON':
            sent = sent.replace(ent.text,ent.label_)
    return sent

def blackout_entity(sent):
    doc = nlp(sent)
    for ent in doc.ents:
        if ent.label_=='PERSON':
            sent = sent.replace(ent.text,"")
    return sent

def read_into_string(filename):
    text_file = open(filename, 'r')
    lines = text_file.read().split('\n')
    if ner:
        lines = [replace_entity(sent) for sent in lines]
    if blackout:
        lines = [blackout_entity(sent) for sent in lines]
    text_file.close()
    return lines

def process(file):
    sents = read_into_string(file)
    sents = [s for s in sents if len(s)>14 and len(s)<500]
    return sents

In [9]:
pro = anti = neutral = non_neutral = policy = []
if test:
    policy = process(policy_file)
else:
    if not two_step:
        pro = process(pro_file) #1
        anti = process(anti_file) #0
        neutral = process(neutral_file) #2
    if proanti:
        pro = process(pro_file) #1
        anti = process(anti_file) #0
    else:
        neutral = process(neutral_file) #0
        non_neutral = process(non_neutral_file) #1

In [10]:
len(pro), len(anti), len(neutral), len(non_neutral), len(policy)

(10, 3, 0, 0, 0)

Run the following command on terminal in the stanford-corenlp-full-2018-10-05 directory

java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 90000

In [11]:
nlp = StanfordCoreNLP('http://localhost', port=9000,timeout=90000)

In [12]:
def make_tree(text):
#     print(text)
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    output = literal_eval(output)
    try:
        tree = str(output['sentences'][0]['parse'])
    except:
        print(output,text)
        return
    # print (tree)
    parse_string = ' '.join(str(tree).split())
    # print(parse_string)
    # print ("\n\n")
    tree = nltk.tree.Tree.fromstring(parse_string)
    tree.chomsky_normal_form()
    tree.collapse_unary(collapseRoot=True,collapsePOS=True)
    nt = convertNLTK_tree(tree)
    return nt

def printLabelTree(tree):
    def inorder(node,nnode):
        if node.isLeaf:
            newnode = pptree.Node('H',nnode)
            wnode = pptree.Node(node.word,newnode)
        elif nnode is not None:
            newnode = pptree.Node('H',nnode)
            inorder(node.left,newnode)
            inorder(node.right,newnode)
        elif node.isRoot():
            newnode = pptree.Node('H')
            inorder(node.left,newnode)
            inorder(node.right,newnode)
            return newnode
        return None
    pptree.print_tree(inorder(tree.root,None))

def create_trees_using_df(df):
    tree = []
    for tokens in list(df['tokens']):
        if len(tokens)==0:
            continue
        line = ' '.join(tokens)
        line += '\n'
        tree.append(make_tree(line))
    return tree

def printlabel(root,l):
    if root:
        l.append(root.label)
#         print(root.label)
        if root.left:
            l+=printlabel(root.left,[])
#             print(printlabel(root.left))
        if root.right:
            l+=printlabel(root.right,[])
#             print(printlabel(root.right))
    return l

In [14]:
if test:
    df = pd.DataFrame({'statement':policy})
    df['tokens'] = [word_tokenize(re.sub(r'[^\w\s]|[\d]+',' ',sent)) for sent in df['statement']]
    pickle.dump(df,open("./trees/test/df_test",'wb'))
    tree = create_trees_using_df(df)
    fout = open("./trees/test",'wb')
    pickle.dump([tree],fout)
    fout.close()

Splitting and balancing dataset for training

In [15]:
def split(dataset, label, train_test_split = 0.85):
    random.shuffle(dataset)
    idx = int(len(dataset) * train_test_split)
    train = dataset[: idx]
    test = dataset[idx + 1:]
    y_train = [label for x in range(len(train))]
    y_test = [label for x in range(len(test))]
    return (train, y_train), (test, y_test)

def balance_classes(X_train, y_train):
    dic = {}
    minm = 10000000
    for x, y in zip(X_train, y_train):
        if y in dic.keys():
            dic[y] = (1 + dic[y][0], [x] + dic[y][1])
        else:
            dic[y] = (1, [x])
    for k in dic.keys():
        if dic[k][0] < minm:
            minm = dic[k][0]
    X_train = []
    y_train = []
    for k in dic.keys():
        X_train += dic[k][1][: minm]
        y_train += [k for x in range(minm)]
    return shuffle(X_train, y_train)

def upsample(X_train, y_train):
    dic = {}
    minm = 10000000
    maxm = -minm
    X_final = []
    y_final = []
    # create the dictionary
    for x, y in zip(X_train, y_train):
        if y in dic.keys():
            dic[y] = (1 + dic[y][0], [x] + dic[y][1])
        else:
            dic[y] = (1, [x])
    # find maximum and minimum
    for k in dic.keys():
        count = dic[k][0]
        if dic[k][0] < minm:
            minm = count
        if dic[k][0] > maxm:
            maxm = count
    # Upsample all the non-majority classes
    for k in dic.keys():
        count = dic[k][0]
        examples = dic[k][1]
        if count < maxm:
            examples = resample(examples, 
                                          replace=True, # sample with replacement
                                          n_samples=maxm, # match number in majority class
                                          random_state=27) # reproducible results
            assert len(examples) == maxm
        X_final += examples
        y_final += [k for x in range(len(examples))]
    return shuffle(X_final, y_final)

In [16]:
train_test_split = 0.85

In [18]:
if not two_step:
    if proanti:
        (pro_X_train, pro_y_train), (pro_X_test, pro_y_test) = split(copy.deepcopy(pro), 1, train_test_split=train_test_split)
        (anti_X_train, anti_y_train), (anti_X_test, anti_y_test) = split(copy.deepcopy(anti), 0, train_test_split=train_test_split)
        (neutral_X_train, neutral_y_train), (neutral_X_test, neutral_y_test) = split(copy.deepcopy(neutral), 2, train_test_split=train_test_split)
else:
    if proanti:
        (pro_X_train, pro_y_train), (pro_X_test, pro_y_test) = split(copy.deepcopy(pro), 1, train_test_split=train_test_split)
        (anti_X_train, anti_y_train), (anti_X_test, anti_y_test) = split(copy.deepcopy(anti), 0, train_test_split=train_test_split)
    else:
        (neutral_X_train, neutral_y_train), (neutral_X_test, neutral_y_test) = split(copy.deepcopy(neutral), 0, train_test_split=train_test_split)
        (non_neutral_X_train, non_neutral_y_train), (non_neutral_X_test, non_neutral_y_test) = split(copy.deepcopy(non_neutral), 1, train_test_split=train_test_split)

In [19]:
if not two_step:
    # pro v/s anti v/s neutral classification
    # pro:1, anti:0, neutral:2
    X_train = pro_X_train + anti_X_train + neutral_X_train
    y_train = pro_y_train + anti_y_train + neutral_y_train
    X_test = pro_X_test + anti_X_test + neutral_X_test
    y_test = pro_y_test + anti_y_test + neutral_y_test
else:
    if proanti:
        # pro v/s anti classification
        # pro:1, anti:0
        X_train = pro_X_train + anti_X_train
        y_train = pro_y_train + anti_y_train
        X_test = pro_X_test + anti_X_test
        y_test = pro_y_test + anti_y_test
    else:
        # neutral v/s non_neutral classification
        # non_neutral:1, neutral:0
        X_train = neutral_X_train + non_neutral_X_train
        y_train = neutral_y_train + non_neutral_y_train
        X_test = neutral_X_test + non_neutral_X_test
        y_test = neutral_y_test + non_neutral_y_test
X_train, y_train = shuffle(X_train, y_train)
X_test, y_test = shuffle(X_test, y_test)

In [20]:
if balanced:
    if undersample:
        X_train, y_train = balance_classes(X_train, y_train)
    else:
        X_train, y_train = upsample(X_train, y_train)

In [21]:
df_train = pd.DataFrame({'statement':X_train, 'target':y_train})
df_test = pd.DataFrame({'statement':X_test, 'target':y_test})

In [22]:
df_train.groupby('target').count(), df_test.groupby('target').count() # check counts of each class

(        statement
 target           
 0               2
 1               2,         statement
 target           
 1               1)

In [23]:
df_train['tokens'] = [word_tokenize(re.sub(r'[^\w\s]|[\d]+',' ',sent)) for sent in df_train['statement']]
df_train = df_train[['tokens','target']]
df_test['tokens'] = [word_tokenize(re.sub(r'[^\w\s]|[\d]+',' ',sent)) for sent in df_test['statement']]
df_test = df_test[['tokens','target']]

In [24]:
pickle.dump(df_train,open("./trees/df_"+file_trees,'wb'))

In [30]:
if not two_step:
    pro_trees = create_trees_using_df(df_train[df_train.target == 1])
    anti_trees = create_trees_using_df(df_train[df_train.target == 0])
    neutral_trees = create_trees_using_df(df_train[df_train.target == 2])
    pro_test_trees = create_trees_using_df(df_test[df_test.target == 1])
    anti_test_trees = create_trees_using_df(df_test[df_test.target == 0])
    neutral_test_trees = create_trees_using_df(df_test[df_test.target == 2])
    fout = open("./trees/"+file_trees,'wb')
    pickle.dump([pro_trees, anti_trees, neutral_trees],fout)
    fout.close()
    fout = open("./trees/"+file_trees+"_test",'wb')
    pickle.dump([pro_test_trees, anti_test_trees, neutral_test_trees],fout)
    fout.close()
else:
    if proanti:
        pro_trees = create_trees_using_df(df_train[df_train.target == 1])
        anti_trees = create_trees_using_df(df_train[df_train.target == 0])
        pro_test_trees = create_trees_using_df(df_test[df_test.target == 1])
        anti_test_trees = create_trees_using_df(df_test[df_test.target == 0])
        fout = open("./trees/"+file_trees,'wb')
        pickle.dump([pro_trees, anti_trees],fout)
        fout.close()
        fout = open("./trees/"+file_trees+"_test",'wb')
        pickle.dump([pro_test_trees, anti_test_trees],fout)
        fout.close()
    else:
        neutral = create_trees_using_df(df_train[df_train.target == 0])
        non_neutral = create_trees_using_df(df_train[df_train.target == 1])
        neutral_test = create_trees_using_df(df_test[df_test.target == 0])
        non_neutral_test = create_trees_using_df(df_test[df_test.target == 1])
        fout = open("./trees/"+file_trees,'wb')
        pickle.dump([neutral, non_neutral],fout)
        fout.close()
        fout = open("./trees/"+file_trees+"_test",'wb')
        pickle.dump([neutral_test, non_neutral_test],fout)
        fout.close()