In [34]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import neuralcoref
import nltk
import json
from collections import Counter
from allennlp.predictors.predictor import Predictor
import csv
import re
from nltk.corpus import stopwords 

stop_words = stopwords.words('english')
stop_words.extend(",")
inputList = []
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)
predictor = Predictor.from_path("/home/raj/UB_Stuff/CSE_635/Akshaya_code/srl-model-2018.05.25.tar.gz")

In [35]:
class OrderedCounter(Counter, OrderedDict):
    pass

In [36]:
class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        candidate_pos.extend("party")
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        retList = []
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            #print(key + ' - ' + str(value))
            retList.append(key)
            if i > number:
                break     
        return retList
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight
        
    def getCoref(self, text):
        doc = nlp(text)
        text = doc._.coref_resolved 
        return text

In [5]:
def getData(text): 
      
    retVal = {}
    #Text Rank KeyWords
    tr4w = TextRank4Keyword()
    text = tr4w.getCoref(text)
    tr4w.analyze(text, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
    keyWords = tr4w.get_keywords(7)
    retVal.update({'textrank':keyWords})
    #retVal['textrank']
    print('textrank done')
    #Split into sentences
    sent_text = nltk.sent_tokenize(text)
    for sentence in sent_text:
        data = {}
        data["sentence"] = sentence
        inputList.append(data)
        
    #Semantic Labeling Predector
    pred = predictor.predict_batch_json(inputList)
    print('prediction done')
    
    arg0list = []
    arg1list = []
    argloclist = []
    argtmplist = []
    #Iterate over each sentence
    for x in range(len(pred)):
        #Iterate over each verb in each sentence
        verbList = pred[x]['verbs']
        for verb in verbList:
            tags = verb['tags']
            #if ('B-ARG1' in tags or 'I-ARG1'in tags) and ('B-ARG0' in tags or 'I-ARG0' in tags):
            if ('B-ARG1' in tags or 'I-ARG1'in tags) and ('B-ARG0' in tags or 'I-ARG0' in tags):
                descr = verb['description']
                if 'ARG0' in descr:
                    s = descr.find('ARG0')+5
                    d = descr.find(']')
                    arg0list.append(descr[s:d])
                if 'ARG1' in descr:
                    s = descr.find('ARG1')+5
                    d = descr.find(']')
                    arg1list.append(descr[s:d]) 

            if 'B-ARGM-LOC' in tags or 'I-ARGM-LOC' in tags:
                descr = verb['description']
                if 'ARGM-LOC' in descr:
                    s = descr.find('ARGM-LOC')+9
                    d = descr.find(']')
                    argloclist.append(descr[s:d])


    print('semantic parsing done')
    arg0 = []
    for word in arg0list:
        w = ""
        wordlist = word.split()
        for x in wordlist:
            if x in keyWords:
                w += " "+x
        #print(word)
        arg0.extend(w.split())
        #print(arg0)
        counts = Counter(arg0)
    #print(counts)


    arg1 = []
    for word in arg1list:
        w = ""
        wordlist = word.split()
        for x in wordlist:
            if x in keyWords:
                w += " "+x
        #print(word)
        arg1.extend(w.split())
        #print(arg1)
        counts.update(arg1)
    #print(counts)

    argloc = []
    for word in argloclist:
        word = word.lower()
        w = ""
        wordlist = word.split()
        for x in wordlist:
            if x not in stop_words:
                w += " "+x
        #print(word)
        argloc.extend(w.split())
        #print(arg1)
        countsLoc = Counter(argloc)
    #print(counts)
    
#     sorted_words = sorted(counts, key: lambda x:-counterlist[x])
#     distinct_words_from_list = set(list_to_be_sorted)
#     sorted_distinct_list = sorted(distinct_words_from_list, key: lambda x:-counterlist[x])
#     sorted_distinct_list = sorted_distinct_list[:10]
    print('Trying to add to counter')
    print(counts)
    counterlist = OrderedCounter(counts)
    counterlist = counterlist.keys()
    counterlist = list(counterlist)
    counterlist = counterlist[:5]
    retVal.update({'semanticLabeling':counterlist})
    
    counterlist = OrderedCounter(countsLoc)
    counterlist = counterlist.keys()
    counterlist = list(counterlist)
    counterlist = counterlist[:5]
    retVal.update({'semanticLoc':counterlist})
    print('Added successfully to counter')
    return retVal

In [None]:

counter = 1
with open('/home/raj/UB_Stuff/CSE_635/Phase5/Semantic_word_labelling/SRL_vs_TLDR.csv','r') as csvinput:
    with open('/home/raj/UB_Stuff/CSE_635/Phase5/Semantic_word_labelling/test.csv', 'w') as csvoutput:
        writer = csv.writer(csvoutput, lineterminator='\n')
        reader = csv.reader(csvinput)

        all = []
        row = next(reader)
        row.append('Text Rank')
        row.append('Semantic Text Rank')
        row.append('Semantic Loc')
        all.append(row)

        for row in reader:
            try:
                if(counter == 6) :
                    break
                searchTerm = row[14]
                print(searchTerm)
                searchTerm = searchTerm.replace('\n',' ')
                #Get data
                data = getData2(searchTerm)        
                row.append(data['textrank'])
                row.append(data['semanticLabeling'])
                row.append(data['semanticLoc'])
                all.append(row)
                counter+=1
            except Exception as e:
                print(e)
                pass
        writer.writerows(all)


The announcement of tickets by the BJP has led to discontent among a section of its workers who burnt effigies of former MP Sangita Singh Deo and her husband and BJP Legislature Party Leader KV Singh Deo. After the party preferred a turncoat, Ashok Pujari, who recently came from the Congress to the BJP and who is uncle of BJD candidate Niranjan Pujari for the Sonepur Assembly seat, and Raghunath Jagdala, an officer of the Merchant Navy, who also came to BJP 20 days ago for the Birmaharajpur Seat, supporters of BJP leaders Baldev Bedbak and Anand Barik, who were seeking tickets for the two seats, respectively, held an emergency meeting and charged the party high command for ignoring dedicated workers and giving preference to relatives of rival candidates. More than 450 workers burnt effigies of Singh Deo couple at Sonepur bus stand on Sunday, alleging that KV had taken over Rs 2 crore from opposition parties. The auudio has been also viral in the social media in which someone is giving 

In [37]:
def getData2(text): 
      
    retVal = {}
    #Text Rank KeyWords
    tr4w = TextRank4Keyword()
    text = tr4w.getCoref(text)
    tr4w.analyze(text, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
    keyWords = tr4w.get_keywords(7)
    retVal.update({'textrank':keyWords})
    #retVal['textrank']
    #print('textrank done')
    #Split into sentences
    sent_text = nltk.sent_tokenize(text)
    for sentence in sent_text:
        data = {}
        data["sentence"] = sentence
        inputList.append(data)
        
    #Semantic Labeling Predector
    pred = predictor.predict_batch_json(inputList)
    #print('prediction done')
    
    arg0list = []
    arg1list = []
    argloclist = []
    argtmplist = []
    #Iterate over each sentence
    for x in range(len(pred)):
        #Iterate over each verb in each sentence
        verbList = pred[x]['verbs']
        for verb in verbList:
            tags = verb['tags']
            #if ('B-ARG1' in tags or 'I-ARG1'in tags) and ('B-ARG0' in tags or 'I-ARG0' in tags):
            if ('B-ARG1' in tags or 'I-ARG1'in tags) and ('B-ARG0' in tags or 'I-ARG0' in tags):
                descr = verb['description']
                if 'ARG0' in descr:
                    s = descr.find('ARG0')+5
                    d = descr.find(']')
                    arg0list.append(descr[s:d])
                if 'ARG1' in descr:
                    s = descr.find('ARG1')+5
                    d = descr.find(']')
                    arg1list.append(descr[s:d]) 

            if 'B-ARGM-LOC' in tags or 'I-ARGM-LOC' in tags:
                descr = verb['description']
                if 'ARGM-LOC' in descr:
                    s = descr.find('ARGM-LOC')+9
                    d = descr.find(']')
                    argloclist.append(descr[s:d])


    print('semantic parsing done')

    arg0 = []
    for word in arg0list:
        w = ""
        wordlist = word.split()
        for x in wordlist:
                w += " "+x
        #print(word)
        arg0.extend(w.split())
        #print(arg0)
        counts = Counter(arg0)
    #print(counts)


    arg1 = []
    for word in arg1list:
        w = ""
        wordlist = word.split()
        for x in wordlist:
                w += " "+x
        #print(word)
        arg1.extend(w.split())
        #print(arg1)
        counts.update(arg1)
    print(counts)

    argloc = []
    for word in argloclist:
        word = word.lower()
        w = ""
        wordlist = word.split()
        for x in wordlist:
            if x not in stop_words:
                w += " "+x
        #print(word)
        argloc.extend(w.split())
        #print(arg1)
        countsLoc = Counter(argloc)
    #print(counts)
    
#     sorted_words = sorted(counts, key: lambda x:-counterlist[x])
#     distinct_words_from_list = set(list_to_be_sorted)
#     sorted_distinct_list = sorted(distinct_words_from_list, key: lambda x:-counterlist[x])
#     sorted_distinct_list = sorted_distinct_list[:10]
#     print('Trying to add to counter')
#     print(counts)
    counterlist = OrderedCounter(counts)
    counterlist = counterlist.keys()
    counterlist = list(counterlist)
    counterlist = counterlist[:7]
    retVal.update({'semanticLabeling':counterlist})
    
    counterlist = OrderedCounter(countsLoc)
    counterlist = counterlist.keys()
    counterlist = list(counterlist)
    counterlist = counterlist[:5]
    retVal.update({'semanticLoc':counterlist})
#     print('Added successfully to counter')
    return retVal

In [38]:

text = r'As the elections nearing, the political scenario in Begunia block of Khordha disdtrict seems to be getting warmer by the day. On Monday, two groups of the BJD organised political gatherings at two different places under the block on the plea of a protest meeting and a Mahashanti Yajna , respectively. This has been a subject of discussion among local people now. While supporters of the local MLA held a protest meeting in the Begunia Mini Stadium in protest against the ransacking of the police station by some rivals of the MLA a week back, former Minister and Rajya Sabha Member Prashant Nanda s son Rushabh along with thousands of his supporters from about 40 gram panchayats attended the Mahashanti Yajna at Brahmeswar Peeth in Lakhanpur village. MLA supporters had come in a rally from Sarua to reach the mini stadium. Among others, BYJD Begunia president Bibhuti Mohanty, BYJD Bolgarh president Rabindra Natha Subuddhi, Bolgarh block president Gopal Behera attended. Those who attended Rushabh s meeting included Prabhat Kumar Maharaj, former Sarpanch Chaitanya Jaysingh, Krushna Chandra Ranabijuli, Hemant Sundray, Surendra Behera, Sangramkeshari Mishra, Dillip Mahabhoi and Jalandhar Mohanty.'
res = getData2(text)


semantic parsing done
Counter({'supporters': 2, 'MLA': 2, 'of': 1, 'the': 1, 'local': 1, 'Those': 1})


In [14]:
print(type(Counter))

<class 'type'>
