In [2]:
# Import libraries required
import re
import nltk
from nltk.tag.perceptron import PerceptronTagger

In [3]:
# Noun phrase chunking
# Grammar for nltk.RegexpParser with a single chunk label, NP, and two rules:
#   1. an optional <DT> tag, any number of <JJ.*> tags, then one or more <NN.*> tags
#   2. a bare run of one or more <NN.*> tags
# NOTE(review): the tag names look like Penn Treebank tags (determiner /
# adjective / noun) — confirm against the tagset produced by the POS tagger.
chunk_patterns=r'''NP:{<DT>?<JJ.*>*<NN.*>+}
                      {<NN.*>+}
                '''
# Chunk parser built from the grammar above; its parse() is applied to POS-tagged sentences.
nounphrase_chunker=nltk.RegexpParser(chunk_patterns)
# Hearst Patterns
# Each entry is a (regex, hypernym_position) pair. The regex is searched in an
# NP-tagged sentence (noun phrases written as NP_<words> tokens); the second
# element says whether the hypernym is the 'first' or the 'last' NP token of
# the matched span, the remaining NP tokens being its hyponyms.
# The literal dots in "e.g." / "i.e." are escaped so that they only match the
# abbreviations themselves and not arbitrary four-letter sequences ("edge").
# NOTE(review): several patterns use base forms ('be', 'include', 'compare')
# and the placeholder '-PRON-'; the preprocessing visible in this notebook
# does not lemmatize, so confirm these patterns can match its output.
hearst_patterns=[
    (
        '(NP_\\w+ (, )?such as (NP_\\w+ ?(, )?(and |or )?)+)',
        'first'
    ),
    (
        '(such NP_\\w+ (, )?as (NP_\\w+ ?(, )?(and |or )?)+)',
        'first'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?other NP_\\w+)',
        'last'
    ),
    (
        '(NP_\\w+ (, )?include (NP_\\w+ ?(, )?(and |or )?)+)',
        'first'
    ),
    (
        '(NP_\\w+ (, )?especially (NP_\\w+ ?(, )?(and |or )?)+)',
        'first'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?any other NP_\\w+)',
        'last'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?some other NP_\\w+)',
        'last'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?be a NP_\\w+)',
        'last'
    ),
    (
        '(NP_\\w+ (, )?like (NP_\\w+ ? (, )?(and |or )?)+)',
        'first'
    ),
    (
        'such (NP_\\w+ (, )?as (NP_\\w+ ? (, )?(and |or )?)+)',
        'first'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?like other NP_\\w+)',
        'last'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?one of the NP_\\w+)',
        'last'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?one of these NP_\\w+)',
        'last'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?one of those NP_\\w+)',
        'last'
    ),
    (
        'example of (NP_\\w+ (, )?be (NP_\\w+ ? '
        '(, )?(and |or )?)+)',
        'first'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?be example of NP_\\w+)',
        'last'
    ),
    (
        '(NP_\\w+ (, )?for example (, )?'
        '(NP_\\w+ ?(, )?(and |or )?)+)',
        'first'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?which be call NP_\\w+)',
        'last'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?which be name NP_\\w+)',
        'last'
    ),
    (
        '(NP_\\w+ (, )?mainly (NP_\\w+ ? (, )?(and |or )?)+)',
        'first'
    ),
    (
        '(NP_\\w+ (, )?mostly (NP_\\w+ ? (, )?(and |or )?)+)',
        'first'
    ),
    (
        '(NP_\\w+ (, )?notably (NP_\\w+ ? (, )?(and |or )?)+)',
        'first'
    ),
    (
        '(NP_\\w+ (, )?particularly (NP_\\w+ ? '
        '(, )?(and |or )?)+)',
        'first'
    ),
    (
        '(NP_\\w+ (, )?principally (NP_\\w+ ? (, )?(and |or )?)+)',
        'first'
    ),
    (
        '(NP_\\w+ (, )?in particular (NP_\\w+ ? '
        '(, )?(and |or )?)+)',
        'first'
    ),
    (
        '(NP_\\w+ (, )?except (NP_\\w+ ? (, )?(and |or )?)+)',
        'first'
    ),
    (
        '(NP_\\w+ (, )?other than (NP_\\w+ ? (, )?(and |or )?)+)',
        'first'
    ),
    (
        '(NP_\\w+ (, )?e\\.g\\. (, )?(NP_\\w+ ? (, )?(and |or )?)+)',
        'first'
    ),
    (
        '(NP_\\w+ \\( (e\\.g\\.|i\\.e\\.) (, )?(NP_\\w+ ? (, )?(and |or )?)+'
        '(\\. )?\\))',
        'first'
    ),
    (
        '(NP_\\w+ (, )?i\\.e\\. (, )?(NP_\\w+ ? (, )?(and |or )?)+)',
        'first'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and|or)? a kind of NP_\\w+)',
        'last'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and|or)? kind of NP_\\w+)',
        'last'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and|or)? form of NP_\\w+)',
        'last'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?which look like NP_\\w+)',
        'last'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?which sound like NP_\\w+)',
        'last'
    ),
    (
        '(NP_\\w+ (, )?which be similar to (NP_\\w+ ? '
        '(, )?(and |or )?)+)',
        'first'
    ),
    (
        '(NP_\\w+ (, )?example of this be (NP_\\w+ ? '
        '(, )?(and |or )?)+)',
        'first'
    ),
    (
        '(NP_\\w+ (, )?type (NP_\\w+ ? (, )?(and |or )?)+)',
        'first'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )? NP_\\w+ type)',
        'last'
    ),
    (
        '(NP_\\w+ (, )?whether (NP_\\w+ ? (, )?(and |or )?)+)',
        'first'
    ),
    (
        '(compare (NP_\\w+ ?(, )?)+(and |or )?with NP_\\w+)',
        'last'
    ),
    (
        '(NP_\\w+ (, )?compare to (NP_\\w+ ? (, )?(and |or )?)+)',
        'first'
    ),
    (
        '(NP_\\w+ (, )?among -PRON- (NP_\\w+ ? '
        '(, )?(and |or )?)+)',
        'first'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and |or )?as NP_\\w+)',
        'last'
    ),
    (
        '(NP_\\w+ (, )? (NP_\\w+ ? (, )?(and |or )?)+ '
        'for instance)',
        'first'
    ),
    (
        '((NP_\\w+ ?(, )?)+(and|or)? sort of NP_\\w+)',
        'last'
    ),
    (
        '(NP_\\w+ (, )?which may include (NP_\\w+ '
        '?(, )?(and |or )?)+)',
        'first'
    ),
]

In [4]:
# Part of Speech tagging
# One shared tagger instance, created once at module level; prepare() calls its tag() on each tokenized sentence.
pos_tagger=PerceptronTagger()

In [5]:
# Text Preprocessing
def prepare(raw_text):
    """Sentence-split, tokenize and POS-tag ``raw_text``.

    The text is stripped of surrounding whitespace, split into sentences with
    ``nltk.sent_tokenize``, and each sentence is tokenized with
    ``nltk.word_tokenize`` and passed to ``pos_tagger.tag``.

    Returns a list with one tagger result per sentence, in sentence order.
    """
    tagged_sentences = []
    for sentence_text in nltk.sent_tokenize(raw_text.strip()):
        tokens = nltk.word_tokenize(sentence_text)
        tagged_sentences.append(pos_tagger.tag(tokens))
    return tagged_sentences

In [16]:
def chunk(rawtext):
    """Turn raw text into NP-tagged sentence strings.

    Each sentence of ``rawtext`` is POS-tagged by ``prepare``, chunked with
    ``nounphrase_chunker`` and flattened by ``prepare_chunks`` into a
    space-separated string in which every noun phrase is a single
    ``NP_<word>_<word>...`` token.

    Runs of consecutive NP tokens are then merged into one NP token, e.g.
    ``"NP_a NP_b NP_c"`` becomes ``"NP_a_b_c"``.

    Returns a list of strings, one per sentence.
    """
    sentences=prepare(rawtext.strip())
    all_chunks=[]
    for sentence in sentences:
        chunks=nounphrase_chunker.parse(sentence)
        all_chunks.append(prepare_chunks(chunks))
    # If we have more than 1 consecutive noun phrase, we merge them into one
    # single noun phrase. The pattern matches the WHOLE run (first NP followed
    # by one or more " NP_..." tokens) and the replacement works on the full
    # match, so runs of any length are merged, not just pairs.
    all_sentences=[]
    for raw_sentence in all_chunks:
        sentence = re.sub(r"NP_\w+(?: NP_\w+)+",
                          lambda m: m.group(0).replace(" NP_", "_"),
                          raw_sentence)
        all_sentences.append(sentence)

    return all_sentences

In [17]:
# Flatten a chunked sentence into a string: noun-phrase subtrees become a
# single NP_<token>_<token>... term, every other token is kept as is.
def prepare_chunks(chunks):
    """Return the chunked sentence as a space-separated string.

    ``chunks`` is iterated item by item. An item that exposes a ``label()``
    method (a chunk subtree) is written as ``'NP_'`` followed by the first
    element of each of its children joined with ``'_'``. Any other item is
    treated as a ``(token, tag)`` pair and contributes its first element.

    Only the absence of ``label`` (``AttributeError``) marks an item as a
    plain token; any other exception raised while reading the label
    propagates to the caller instead of being silently swallowed.
    """
    terms=[]
    for node in chunks:
        try:
            label=node.label()
        except AttributeError:
            # Plain (token, tag) pairs have no label() method.
            label=None
        if label is None:
            terms.append(node[0])
        else:
            terms.append('NP_'+'_'.join([child[0] for child in node]))
    return ' '.join(terms)

In [8]:
# Test preprocessing
ex_corpus='I like music like rock and metal. I think countries like Iceland and Norway are beautiful. It is a good day to die.'
# prepare() performs the same sentence split / tokenize / POS-tag steps.
sentences=prepare(ex_corpus)
# Deliberately NOT named `chunk`: that name belongs to the chunk() function
# defined above, and rebinding it to a parse tree here would break
# find_hyponyms() when the notebook is run top to bottom.
example_tree=nounphrase_chunker.parse(sentences[0])
np_tagged_sentence=prepare_chunks(example_tree)

In [9]:
# for chunko in chunk:
#     print(chunko[1])
# for (hearst_pattern,parser) in hearst_patterns:
#     print(hearst_pattern)
# Entry 8 of hearst_patterns is the "NP like NP ..." pattern; element 0 of the pair is its regex.
matches=re.search(hearst_patterns[8][0],np_tagged_sentence)

In [10]:
# Show only the noun-phrase tokens contained in the matched span.
match_str=matches.group(0)
[token for token in match_str.split() if token.startswith('NP_')]

['NP_music', 'NP_rock', 'NP_metal']

In [18]:
def remove_np_term(term):
    """Convert an ``NP_word1_word2`` token back into plain text.

    Only the leading ``NP_`` marker is removed; the remaining underscores are
    turned into spaces. Stripping just the prefix keeps words that happen to
    end in "NP" intact (``'NP_GNP_growth'`` -> ``'GNP growth'``).

    A term without the ``NP_`` prefix only has its underscores replaced.
    """
    prefix='NP_'
    if term.startswith(prefix):
        term=term[len(prefix):]
    return term.replace('_',' ')

In [19]:
def find_hyponyms(rawtext):
    """Extract ``(hyponym, hypernym)`` pairs from ``rawtext``.

    The text is converted to NP-tagged sentences with ``chunk``. Every entry
    of ``hearst_patterns`` is searched once per sentence (first match only).
    The ``NP_`` tokens of the matched span are collected; depending on the
    entry's position flag the hypernym is the first or the last of them and
    all the others are its hyponyms.

    Returns a list of ``(hyponym, hypernym)`` tuples of plain-text terms.
    """
    pairs = []
    for tagged_sentence in chunk(rawtext):
        for regex, hypernym_position in hearst_patterns:
            found = re.search(regex, tagged_sentence)
            if found is None:
                continue
            np_terms = [tok for tok in found.group(0).split() if tok.startswith('NP_')]

            if hypernym_position == 'first':
                hypernym, hyponyms = np_terms[0], np_terms[1:]
            else:
                hypernym, hyponyms = np_terms[-1], np_terms[:-1]

            general_term = remove_np_term(hypernym)
            pairs.extend((remove_np_term(specific), general_term) for specific in hyponyms)
    return pairs

In [20]:
# NOTE(review): neither `data` nor this module-level `all_chunks` is referenced
# by any later cell visible here (chunk() builds its own local `all_chunks`);
# this looks like leftover scratch state — confirm before removing.
data=prepare(ex_corpus)
all_chunks=[]

In [22]:
# Demo: (hyponym, hypernym) pairs extracted from the example corpus.
find_hyponyms(ex_corpus)

[('rock', 'music'),
 ('metal', 'music'),
 ('Iceland', 'countries'),
 ('Norway', 'countries')]

In [23]:
# Apply Hearst Patterns to Wikipedia Files
def extractHearstWikipedia(input):
    """Run Hearst-pattern extraction over a tab-separated sentence file.

    ``input`` is the path of a text file read line by line (the parameter
    name shadows the builtin but is kept so existing callers keep working).
    Blank lines are skipped. For every other line the text before the first
    tab is treated as the sentence and passed to ``find_hyponyms``; any
    further columns are ignored, so a line without a second column no longer
    raises ``IndexError``.

    A progress message is printed whenever the running line count (blank
    lines included) reaches a multiple of 1000 on a non-blank line.

    Returns a list with one entry per non-blank line: the list of
    ``(hyponym, hypernym)`` pairs found in that line's sentence.
    """
    extractions=[]
    lines_processed=0

    with open(input,'r') as f:
        for line in f:
            lines_processed+=1
            line=line.strip()
            if not line:
                continue
            # Only the first column is needed; maxsplit=1 avoids splitting the rest.
            sentence=line.split('\t',1)[0].strip()
            extractions.append(find_hyponyms(sentence))

            if lines_processed%1000==0:
                print('Lines Processed: {}'.format(lines_processed))

    return extractions


In [24]:
# Run the extraction over the Wikipedia sentence file; the result holds one list of pairs per non-blank input line.
# NOTE(review): the path is absolute and machine-specific — the notebook only runs where this file exists.
hypo_hyper_pairs=extractHearstWikipedia('/home/shaurya/Documents/lexicalinference/wikipedia_sentences.txt')

Lines Processed: 1000
Lines Processed: 2000
Lines Processed: 3000
Lines Processed: 4000
Lines Processed: 5000
Lines Processed: 6000
Lines Processed: 7000
Lines Processed: 8000
Lines Processed: 9000
Lines Processed: 10000
Lines Processed: 11000
Lines Processed: 12000
Lines Processed: 13000
Lines Processed: 14000
Lines Processed: 15000
Lines Processed: 16000
Lines Processed: 17000
Lines Processed: 18000
Lines Processed: 19000
Lines Processed: 20000
Lines Processed: 21000
Lines Processed: 22000
Lines Processed: 23000
Lines Processed: 24000
Lines Processed: 25000
Lines Processed: 26000
Lines Processed: 27000
Lines Processed: 28000
Lines Processed: 29000
Lines Processed: 30000
Lines Processed: 31000
Lines Processed: 32000
Lines Processed: 33000
Lines Processed: 34000
Lines Processed: 35000
Lines Processed: 36000
Lines Processed: 37000
Lines Processed: 38000
Lines Processed: 39000
Lines Processed: 40000
Lines Processed: 41000
Lines Processed: 42000
Lines Processed: 43000
Lines Processed: 440

In [38]:
# Write Hypernyms to a file, one "hyponym<TAB>hypernym" pair per line.
# The per-line results in `hypo_hyper_pairs` are iterated directly so that this
# cell does not depend on `hypo_hypernyms`, which is only built in a cell
# further down the notebook; the pairs written and their order are the same.
with open('/home/shaurya/Documents/lexicalinference/wikipedia_hypernyms.txt','w') as f:
    for pairs in hypo_hyper_pairs:
        for (hypo,hyper) in pairs:
            try:
                f.write(hypo+'\t'+hyper+'\n')
            except ValueError:
                # Best-effort write kept from the original cell: a pair whose
                # write raises ValueError (UnicodeEncodeError is a subclass)
                # is skipped rather than aborting the export.
                pass


In [31]:
# Preview the results for the first ten processed lines (an empty list means no pattern matched).
hypo_hyper_pairs[:10]

[[],
 [],
 [('the guitar serve', 'companions')],
 [],
 [('troops', 'a section')],
 [],
 [],
 [],
 [],
 []]

In [32]:
# Keep only the per-line results that contain at least one extracted pair.
hypo_hypernyms=[line_pairs for line_pairs in hypo_hyper_pairs if line_pairs]


In [36]:
import itertools
# Flatten the per-line results into one list of (hyponym, hypernym) tuples.
# The input is `hypo_hyper_pairs` rather than `hypo_hypernyms` itself, so the
# cell can be re-run safely: flattening an already-flat list of tuples would
# break the pairs apart into single strings. Empty per-line lists contribute
# nothing, so no separate filtering step is needed.
hypo_hypernyms=list(itertools.chain.from_iterable(hypo_hyper_pairs))

In [11]:
# Post-process the extracted hyponym / hypernym terms.
# The terms can be multi-word noun phrases, so each one is reduced to the head
# of the NP (taken here to be its last token), which is then lemmatized.
from nltk.stem.wordnet import WordNetLemmatizer
lemma=WordNetLemmatizer()
def post_process(text):
    """Return the lemmatized last whitespace-separated token of ``text``."""
    head=text.split()[-1]
    return lemma.lemmatize(head)


In [6]:
#text='the manual musicians'
#text.split()[len(text.split())-1]

'musicians'

In [12]:
# Convert the Extractions from Wikipedia into a dictionary
# Each line of the file is "hyponym<TAB>hypernym". Both terms are lower-cased
# and reduced to a single lemmatized head word by post_process().
# The file is streamed line by line and blank lines are skipped, so an empty
# file yields an empty dictionary instead of failing on tuple unpacking.
# NOTE(review): the mapping is hypernym -> hyponym and a later line with the
# same hypernym overwrites the earlier one, so only the last hyponym seen per
# hypernym is kept — confirm this is intended before relying on it.
hyperhypo={}
with open('/home/shaurya/Documents/lexicalinference/wikipedia_hypernyms.txt','r') as f:
    for line in f:
        line=line.rstrip('\n')
        if not line.strip():
            continue
        hyponym,hypernym=line.split('\t')
        hyponym=post_process(hyponym.lower())
        hypernym=post_process(hypernym.lower())
        hyperhypo[hypernym]=hyponym


In [13]:
# NOTE(review): this displays the entire dictionary; consider previewing a small slice of its items instead.
hyperhypo

,
 'dog': 'door',
 'gas': 'bottle',
 'product': 'blech',
 'dj': 'industry',
 'coach': 'arthurson',
 'guitar': 'career',
 'ferals': 'ferals',
 'pradesh': 'etc',
 'label': 'magrudergrind',
 'attire': 'robe',
 'bulb': 'onion',
 'presenter': 'career',
 'uniform': 'uniform',
 'attraction': 'mill',
 'alice': 'shop',
 'bird': 'bird',
 'team': 'success',
 'handling': 'case',
 'shepherd': 'summer',
 'piano': 'composer',
 'hillman': 'version',
 'organization': 'schalkwyk',
 'science': 'subject',
 'pilot': 'aviation',
 'approach': 'barbara',
 'revision': 'remote',
 'type': 'hemlock',
 'proscenium': 'act',
 'punishment': 'room',
 'sensor': 'act',
 'anchor': 'joining',
 'competition': 'competition',
 'ryan': 'indoor',
 'grenade': 'm9',
 'leader': 'role',
 'rifle': 'tool',
 'litter': 'kennel',
 'reward': 'respect',
 'merchant': 'business',
 'museum': 'danmark',
 'kat': 'puppeteer',
 'defence': 'correggio',
 'commander': 'dowding',
 'amenity': 'lab',
 'whip': 'position',
 'field': 'information',
 'ga