In [1]:
import collections
import spacy
import operator
from nltk.corpus import wordnet as wn
nlp = spacy.load('en')

In [2]:
_NNJJ_ = {'JJ','JJR','JJS','NN','NNP','NNS','ADJ','NOUN'} #Nouns and Adjectives
_VBRB_ = {'RB','RBR','RBS','RP','VB','VBD','VBG','VBN','VBP','VBZ','ADV','VERB'} #Verbs and Adverbs
_PUNC_ = {'.',',','?','!',';',':','(',')','[',']','{','}','"','\''} #Punctuation
_EXCL_ = {'SP','-RRB-','HYPH'} #Noise to skip

In [12]:
def parse(filename):
    txt = open('../../' + filename,"r").read()
    doc = nlp(txt)
    return doc

In [4]:
def lemmatize(token):
    lemma = token.lower_
    if token.tag_ in _NNJJ_:
        lemma = token.lemma_
    elif token.tag_ in _VBRB_:
        lemma = token.lemma_
    return lemma

In [5]:
def adj_to_noun(lem):
    for i in wn.synsets(lem, wn.ADJ):
        for j in i.lemmas():
            drf = j.derivationally_related_forms()
            drf.sort(key = lambda x: len(x.name()))
            for k in drf:
                if k.name()[:2] == lem[:2]:
                    #Found it!
                    return k.name()

def noun_to_adj(lem):
    for i in wn.synsets(lem, wn.NOUN):
        for j in i.lemmas():
            for k in j.derivationally_related_forms():
                if k.name()[:2] == lem[:2]:
                    #Found it!
                    return k.name()

In [6]:
def annotate(doc):
    groups = []
    for sent in doc.sents:
        terms = []
        for tok in sent:
            if tok.tag_ not in _EXCL_:
                lemma = lemmatize(tok)
                drf = None
                #Derive related form:
                if (tok.tag_ == 'JJ'):
                    drf = adj_to_noun(lemma)

                terms.append([tok.norm_,lemma,tok.tag_,drf])
                
        groups.append([sent,terms])
    return groups

In [7]:
def normalize(stack,origs):
    frm = []
    stack1 = []
    origs1 = []
    
    for tok in stack:
        val = tok[0]
        pos = tok[1]
        if len(tok)>2 and tok[2]:
            val = tok[2]
        if pos in _NNJJ_ or pos in _VBRB_:
            frm.append(val)
        if val not in _PUNC_:
            stack1.append(val)

    i = 0
    j = 0
    k = 0
    f = False
    for tok in origs:
        val = tok[0]
        pos = tok[1]
        j += 1
        if len(tok)>2 and tok[2]:
            val = tok[2]
        if val not in _PUNC_:
            origs1.append(val)
        if pos in _NNJJ_ or pos in _VBRB_:
            i = j
            if not f:
                k = j-1
            f = True
        
            
    origs1 = origs1[k:i]
        
    key = '_'.join(sorted(frm)).lower()
    txt = '_'.join(stack1).lower()
    org = ' '.join(origs1).lower()
    count = len(stack1)
    return key,txt,org,count

In [8]:
def skipchunk(sentences,maxslop=4,maxlength=4,concepts=dict(),predicates=dict(),triples=[]):
    
    stack = []
    origs = []
    
    def add(obj):
        nonlocal stack
        nonlocal origs
        key,txt,org,count = normalize(stack,origs)
        if count>1:
            if key not in obj:
                obj[key] = []
            obj[key].append([key,txt,org])
        stack=[]
        origs=[]
        return key
    
    def addtriple(sbj,prd,obj):
        triples.append({'subject':sbj,'predicate':prd,'object':obj})
    
    for sentence in sentences:
        tokens = sentence[1]

        stack = []
        origs = []
        
        isprd = False
        iscon = False

        currcon = None
        currprd = None
        lastcon = None
        lastprd = None
        
        last = 0
        slop = 0
                
        for tok in tokens:
            wrd = tok[0]
            lem = tok[1]
            pos = tok[2]
            drf = tok[3]

            if pos in _NNJJ_:
                if isprd:
                    currprd = add(predicates)
                    
                last  = 0
                isprd = False
                iscon = True
                stack.append([lem,pos,drf])
                origs.append([wrd,pos,drf])

                if len(stack)>=maxlength:
                    currcon = add(concepts)

            elif pos in _VBRB_:
                if iscon:
                    lastcon = currcon
                    currcon = add(concepts)
                    if lastcon and currcon and currprd:
                        addtriple(lastcon,currprd,currcon)
                        currcon = None
                        currprd = None
                        lastcon = None
                        lastprd = None
                    
                last  = 0
                isprd = True
                iscon = False
                stack.append([lem,pos])
                origs.append([wrd,pos])

            elif iscon:
                slop += 1
                last += 1

                origs.append([wrd,pos])

                if wrd in _PUNC_ or slop>maxslop:
                    lastcon = currcon
                    currcon = add(concepts)
                    if lastcon and currcon and currprd:
                        addtriple(lastcon,currprd,currcon)
                        currcon = None
                        currprd = None
                        lastcon = None
                        lastprd = None
                    
                    iscon = False

            elif isprd:
                slop += 1
                last += 1

                origs.append([wrd,pos])

                if wrd in _PUNC_ or slop>maxslop:
                    currprd = add(predicates)
                    isprd = False

    return concepts,predicates,triples

In [9]:
def printgroup(data):
    for item in data:
        if len(data[item])>1:
            print('--------------------------------------------')
            print(item)
            group = map(lambda x:x[2],data[item])
            coll = collections.Counter(group)
            for c in coll.most_common():
                print('\t',c[1],c[0])

In [10]:
def sortgroup(data):
    items = []
    for item in data:
        if len(data[item])>1:
            group = map(lambda x:x[2],data[item])
            coll = collections.Counter(group)
            total = 0
            pref = None
            for c in coll.most_common():
                if not pref:
                    pref = c[0]
                total += c[1]
            items.append([total,item,pref])
    return sorted(items, key=lambda x:x[0])

In [15]:
compscifiles = [
    'content/keyword-extraction-datasets/wiki20/documents/10894.txt',
    'content/keyword-extraction-datasets/wiki20/documents/12049.txt',
    'content/keyword-extraction-datasets/wiki20/documents/13259.txt',
    'content/keyword-extraction-datasets/wiki20/documents/16393.txt',
    'content/keyword-extraction-datasets/wiki20/documents/18209.txt',
    'content/keyword-extraction-datasets/wiki20/documents/19970.txt',
    'content/keyword-extraction-datasets/wiki20/documents/20782.txt',
    'content/keyword-extraction-datasets/wiki20/documents/23267.txt',
    'content/keyword-extraction-datasets/wiki20/documents/23507.txt',
    'content/keyword-extraction-datasets/wiki20/documents/23596.txt',
    'content/keyword-extraction-datasets/wiki20/documents/25473.txt',
    'content/keyword-extraction-datasets/wiki20/documents/287.txt',
    'content/keyword-extraction-datasets/wiki20/documents/37632.txt',
    'content/keyword-extraction-datasets/wiki20/documents/39172.txt',
    'content/keyword-extraction-datasets/wiki20/documents/39955.txt',
    'content/keyword-extraction-datasets/wiki20/documents/40879.txt',
    'content/keyword-extraction-datasets/wiki20/documents/43032.txt',
    'content/keyword-extraction-datasets/wiki20/documents/7183.txt',
    'content/keyword-extraction-datasets/wiki20/documents/7502.txt',
    'content/keyword-extraction-datasets/wiki20/documents/9307.txt'   
]

greatexpectations = [
    'content/great_expectations.txt'
]

concepts = dict()
predicates = dict()
triples = []
for file in compscifiles:
#for file in greatexpectations:
    doc = parse(file)
    sentences = annotate(doc)
    concepts,predicates,triples = skipchunk(sentences,concepts=concepts,predicates=predicates,triples=triples)

In [47]:
printgroup(concepts)

--------------------------------------------
regression_selection_technique_test
	 6 regression test selection techniques
	 2 regression test selection technique
--------------------------------------------
knowledge_priority
	 3 priority knowledge
--------------------------------------------
newness_regression_technique_test
	 2 newness technique for regression test
--------------------------------------------
select_test
	 2 select tests
--------------------------------------------
certain_condition
	 2 certain conditions
--------------------------------------------
set_test
	 5 test sets
	 3 set of tests
	 1 test set
	 1 test from the set
--------------------------------------------
algorithm_select
	 2 algorithms select
--------------------------------------------
original_suite_test
	 3 original test suite
--------------------------------------------
algorithm_selection
	 2 selection algorithms
--------------------------------------------
many_otherness_regression_test
	 2 many ot

	 2 parse specification
--------------------------------------------
correctness_parser
	 1 correctness parsers
	 1 parser correctness
--------------------------------------------
parser_precc
	 2 precc parser
--------------------------------------------
concreteness_specification
	 2 concreteness specification
--------------------------------------------
sameness_way
	 3 sameness way
--------------------------------------------
default_lexer
	 2 default lexer
--------------------------------------------
precc_script
	 1 precc script
	 1 precc scripts
--------------------------------------------
more_time
	 2 more times
	 1 more time
--------------------------------------------
declare_semantic
	 2 declare semantics
--------------------------------------------
effect_side
	 4 side effects
	 2 side effect
--------------------------------------------
production_rule
	 2 production rules
--------------------------------------------
emptiness_string
	 2 emptiness string
-------------------

	 1 value input
--------------------------------------------
balance_load
	 6 load balance
--------------------------------------------
prematureness_study
	 1 prematureness studies
	 1 prematureness study
--------------------------------------------
completion_sorting
	 2 completion of sorting
--------------------------------------------
ffl_step
	 8 ffl step
--------------------------------------------
pi_processor
	 11 processor pi
--------------------------------------------
number_point
	 4 point numbers
	 1 point number
--------------------------------------------
pj_processor
	 4 processor pj
--------------------------------------------
pp_processor
	 4 processor pp
--------------------------------------------
binary_search
	 2 binary search
--------------------------------------------
less_value
	 2 value less
--------------------------------------------
element_th
	 3 th element
--------------------------------------------
input_pi_processor
	 2 input at processor pi
---------

--------------------------------------------
richness_structure
	 2 richness structure
--------------------------------------------
document_fullness_ijcai-95_text
	 3 fullness text documents ijcai-95
--------------------------------------------
web_wide_world
	 2 world wide web
--------------------------------------------
avail_information
	 1 information avail
	 1 avail information
--------------------------------------------
factor_more
	 3 more factors
--------------------------------------------
observe_variable
	 3 observe variables
--------------------------------------------
cluster_document
	 2 cluster of documents
	 1 documents in one cluster
--------------------------------------------
value_vary
	 2 vary values
--------------------------------------------
keyword_otherness
	 3 otherness keywords
	 1 keyword and the otherness
--------------------------------------------
relevance_task
	 1 relevance to a task
	 1 relevance to their task
---------------------------------------

	 3 explanation patterns
	 1 explanation pattern
--------------------------------------------
method_particularity_reasoning
	 1 particularity reasoning methods
	 1 particularity reasoning method
--------------------------------------------
introspect_meta_xps
	 2 introspect meta xps
--------------------------------------------
assignment_blame
	 2 blame assignment
--------------------------------------------
error_reasoning
	 2 reasoning error
--------------------------------------------
cause_explanation
	 2 cause explanations
--------------------------------------------
animateness_object
	 2 animateness objects
--------------------------------------------
process_understanding
	 2 understanding process
--------------------------------------------
composite_meta_xp
	 2 composite meta xp
--------------------------------------------
novelsituation_xp
	 3 xp novelsituation
--------------------------------------------
indexedstructure_mis_xp
	 2 xp mis indexedstructure
-----------------

In [33]:
printgroup(predicates)

--------------------------------------------
be_deem
	 3 is deemed
	 2 are deemed
	 1 were deemed
--------------------------------------------
be_have_modify
	 2 had been modified
	 1 has been modified
	 1 is modified to have
--------------------------------------------
have_implement
	 4 have implemented
--------------------------------------------
reduce_significantly
	 2 significantly reduce
--------------------------------------------
account_as
	 2 account for as
--------------------------------------------
be_have_propose
	 2 have been proposed
	 1 has been proposed
--------------------------------------------
be_provide
	 4 is to provide
	 3 be provided
	 2 are provided
	 1 was provided
--------------------------------------------
be_particularly
	 3 is particularly
	 2 be particularly
--------------------------------------------
be_concern
	 4 is concerned
	 1 be concerned
	 1 was concerned
--------------------------------------------
be_require
	 6 is required
	 2 be required


	 8 so on
--------------------------------------------
be_hold
	 2 is held
	 1 be held
--------------------------------------------
be_treat
	 3 be treated
	 1 is treated
--------------------------------------------
assign_be
	 1 was assigned
	 1 are assigned
	 1 were assigned
--------------------------------------------
be_remarkably
	 1 is remarkably
	 1 be remarkably
--------------------------------------------
have_mean
	 1 has meant
	 1 means that we will have
--------------------------------------------
be_do
	 5 be done
	 2 was done
	 2 is done
	 1 were done
	 1 be doing
--------------------------------------------
be_develop
	 1 were developed
	 1 was developed
	 1 is that it develops
	 1 are developing
	 1 was to develop
--------------------------------------------
as_well
	 30 as well
--------------------------------------------
be_publish
	 1 be published
	 1 were published
--------------------------------------------
be_implement
	 3 was implemented
	 3 are implemented
	 3 

	 1 have been incorporated
--------------------------------------------
apply_be_then
	 1 then be applied
	 1 then it is applied
--------------------------------------------
as_much
	 4 as much
--------------------------------------------
infrequently_update
	 2 updated infrequently
--------------------------------------------
automatically_be_generate
	 2 be generated automatically
--------------------------------------------
apply_consistently
	 2 applied consistently
--------------------------------------------
acquire_know
	 2 know or acquire
--------------------------------------------
apply_have
	 1 have to apply
	 1 have applied
--------------------------------------------
be_determine_use
	 1 is determined using
	 1 was determined using
--------------------------------------------
be_learn
	 1 be learned
	 1 learn is
--------------------------------------------
be_so
	 1 are so
	 1 so there is
--------------------------------------------
be_contain
	 3 is contained
	 1 is a con

In [56]:
for t in sorted(triples, key=lambda t:t['subject']):
    sub=t['subject']
    prd=t['predicate']
    obj=t['object']
    if sub in concepts and prd in predicates and obj in concepts:
        sbjlabel = concepts[sub][0][2]
        prdlabel = predicates[prd][0][2]
        objlabel = concepts[obj][0][2]
        print(sbjlabel,'->',prdlabel,'->',objlabel)

absolute equality -> were passing on -> greater judgment
absurdity way -> has been -> such person
excuse into an acceptance -> so changed -> few words
least account of “ the one -> not to make -> delicate face
mercy on account -> was recommended -> goodness character
accurate knowledge -> had obtained -> magwitch ’s affairs
badness heart ache -> got out -> worse heart ache
additional relish -> was eating -> independence circumstances
advance another two hundred yards -> had not -> inexpressible terror
adversary ’s head -> have flung -> entertainer ’s
submit to a word of advice -> has been -> constancy terror
world affairs -> began to wear -> gloom appearance
afternoon ’s bustle -> opening more -> red eyes
summer afternoon -> toned down -> summer evening
sunday afternoon -> washed up -> tea things
tacit understanding that the aged -> n’t be -> presentable state
air of authority -> not to be disputed -> manner express
littleness alick in a frock -> having already made -> arrangements for

sameness inflame process -> always have put -> similarity circumstances
part of our intercourse -> then thought -> longness time
interestingness relics -> had taken -> few days
light iron stairs -> went out -> gallery highness overhead
mr. jaggers -> coming up -> due time
mr. jaggers -> sent up -> miss havisham
mr. jaggers -> showed that she had struggled -> greatness lot
mr. jaggers -> was altogether too -> many for the jury
mr. jaggers -> perhaps i know more -> estella ’s history
sureness of this unconsciousness on mr. jaggers -> n’t be -> ’s part
mrs. joe -> has been -> dozen times
mrs. joe -> was very -> cleanness housekeeper
mrs. joe -> was soon landed -> uncle pumblechook
joe and orlick -> sweeping up -> otherness traces of discomposure
tenderness of joe -> was so beautifully -> proportioned to my need
thanklessness to joe -> had never been struck at so keenly -> brass impostor pumblechook
thing in joe -> soon arrived -> sorrowful comprehension
tiresomeness journey -> had had -> 

In [16]:
cons = sortgroup(concepts)
for c in cons:
    print(c)

[2, 'newness_regression_technique_test', 'newness technique for regression test']
[2, 'select_test', 'select tests']
[2, 'certain_condition', 'certain conditions']
[2, 'algorithm_select', 'algorithms select']
[2, 'algorithm_selection', 'selection algorithms']
[2, 'many_otherness_regression_test', 'many otherness regression test']
[2, 'modification_program_type', 'types of program modifications']
[2, 'development_testing', 'development testing']
[2, 'modified_program', 'modified program']
[2, 'clemson_university', 'clemson university']
[2, 'component_program', 'program components']
[2, 'coverage_criterion', 'coverage criteria']
[2, 'environment_programming', 'programming environment']
[2, 'currency_practice', 'currency practice']
[2, 'additional_test', 'additional tests']
[2, 'many_technique', 'many techniques']
[2, 'algorithms_otherness_safeness', 'otherness safeness algorithms']
[2, 'few_test', 'fewer tests']
[2, 'process_regression_testing', 'regression testing process']
[2, 'interpr