# Extract Subject-Verb-Object (S, V, O) Triplets

In [1]:
import ast
import json
import os

import numpy as np
import pandas as pd
# Location of the Java executable; edit this to match your machine.
# The value is exported through the JAVAHOME environment variable of this process.
java_path = "usr/bin/java/java.exe"
os.environ.update(JAVAHOME=java_path)

## Load python wrapper for Stanford CoreNLP

- StanfordNLP homepage: https://stanfordnlp.github.io/CoreNLP/

- The available OpenIE options are documented here: https://stanfordnlp.github.io/CoreNLP/openie.html#description  

In [2]:
from stanfordcorenlp import StanfordCoreNLP

# Create the CoreNLP client from the local CoreNLP distribution directory.
nlp = StanfordCoreNLP(
    '/home/junhyuki/DLproject/ExternalLib/stanford-corenlp-full-2018-02-27',
    memory='8g',
)

# Properties passed along with every annotate() request.
# General pipeline settings first ...
props = {
    'annotators': 'tokenize, ssplit, pos, lemma, depparse, natlog, openie',
    'pipelineLanguage': 'en',
    # outputFormat is one of {json, xml, text}
    'outputFormat': 'json',
}
# ... then the OpenIE (Open Information Extraction) specific options.
props.update({
    # openie.format is one of {reverb, ollie, default, qa_srl}
    'openie.format': 'default',
    'openie.triple.strict': 'true',
    'openie.affinity_probability_cap': '1',
    # default = 1000
    'openie.max_entailments_per_clause': '1000',
})

In [3]:
# Root folder of the project data, and the 2012 cleaned-news CSV inside it.
dat_path = '/home/junhyuki/DLproject/DAT'
# Only the first 20 rows are read: this frame is a small sample for trying out options.
test = pd.read_csv(
    dat_path + '/1-DailyNews/cleaned_news_2012.csv',
    nrows=20,
)

## option test: openie.triple.strict = True
If true, extract triples only if they consume the entire fragment. This is useful for ensuring that only logically warranted triples are extracted, but puts more burden on the entailment system to find minimal phrases (see -max_entailments_per_clause).

In [4]:
# Run OpenIE on each sampled headline and print its (subject, relation, object) triples.
for text in test['clean_title']:
    print('------------------------------------------------------------------------')
    print('Target Sentence: ' + text)
    # The response is a JSON string, so parse it as JSON. ast.literal_eval only
    # accepts Python literals and raises on JSON tokens such as true/false/null.
    openie = json.loads(nlp.annotate(text, properties=props))
    # The pipeline includes `ssplit`, so one headline can yield several sentences
    # (or none); collect triples from every sentence instead of only the first.
    for sentence in openie["sentences"]:
        for svo in sentence.get("openie", []):
            print((svo['subject'], svo['relation'], svo['object']))

------------------------------------------------------------------------
Target Sentence: Oakland Raiders fan falls from bleachers after game
('Oakland Raiders fan', 'falls after', 'game')
('Oakland Raiders fan', 'falls from', 'bleachers')
------------------------------------------------------------------------
Target Sentence: Blues' Cole suspended for three games
("Blues ' Cole", 'suspended for', 'three games')
------------------------------------------------------------------------
Target Sentence: Kostitsyn's hat-trick helps Predators douse Flames
('Kostitsyn', 'has', 'hat-trick')
('Predators', 'douse', 'Flames')
------------------------------------------------------------------------
Target Sentence: Broncos clinch AFC West, snap playoff drought
('Broncos', 'clinch', 'AFC West')
------------------------------------------------------------------------
Target Sentence: Pope Benedict XVI to visit Cuba March 26-28
-----------------------------------------------------------------------

## option test: openie.splitter.disable = True

Don’t split clauses at all, and only extract relations centered around the root verb.

> Cons: Because of these strong constraints, triplets often cannot be extracted from the target sentence.

In [5]:
# Reuse the `nlp` client created earlier in the notebook. Constructing
# StanfordCoreNLP a second time without closing the first would leave another
# Java backend (memory='8g') running for the same CoreNLP distribution.
# Only the request properties change for this experiment.

# We will use OpenIE (Open Information Extraction)
props = {
    'annotators': 'tokenize, ssplit, pos, lemma, depparse, natlog, openie',
    'pipelineLanguage': 'en',
    'outputFormat': 'json',                        # one of {json, xml, text}
    'openie.format': 'default',    # One of {reverb, ollie, default, qa_srl}
    'openie.triple.strict': 'true',
    'openie.affinity_probability_cap': '1',
    'openie.max_entailments_per_clause': '1000',   # default = 1000
    'openie.splitter.disable': 'true',             # the option under test in this section
}

In [6]:
# Repeat the extraction over the sampled headlines with the current `props`
# and print each (subject, relation, object) triple.
for text in test['clean_title']:
    print('------------------------------------------------------------------------')
    print('Target Sentence: ' + text)
    # The response is a JSON string, so parse it as JSON. ast.literal_eval only
    # accepts Python literals and raises on JSON tokens such as true/false/null.
    openie = json.loads(nlp.annotate(text, properties=props))
    # The pipeline includes `ssplit`, so one headline can yield several sentences
    # (or none); collect triples from every sentence instead of only the first.
    for sentence in openie["sentences"]:
        for svo in sentence.get("openie", []):
            print((svo['subject'], svo['relation'], svo['object']))

------------------------------------------------------------------------
Target Sentence: Oakland Raiders fan falls from bleachers after game
------------------------------------------------------------------------
Target Sentence: Blues' Cole suspended for three games
------------------------------------------------------------------------
Target Sentence: Kostitsyn's hat-trick helps Predators douse Flames
('Kostitsyn', 'has', 'hat-trick')
------------------------------------------------------------------------
Target Sentence: Broncos clinch AFC West, snap playoff drought
------------------------------------------------------------------------
Target Sentence: Pope Benedict XVI to visit Cuba March 26-28
------------------------------------------------------------------------
Target Sentence: China dissident-lawyer Gao jailed in far west
------------------------------------------------------------------------
Target Sentence: Govt appoints 3-month interim head for India's NMDC
('India

## Extract SVO by using 'multiprocessing'

I decided to use the options below:
```python
props={'annotators': 'tokenize, ssplit, pos, lemma, depparse, natlog, openie',
       'pipelineLanguage':'en',
       'outputFormat':'json',                        # one of {json, xml, text}
       'openie.format': 'default',    # One of {reverb, ollie, default, qa_srl}
       'openie.triple.strict': 'true',
       'openie.affinity_probability_cap': '1',
       'openie.max_entailments_per_clause': '1000',   # default = 1000
       }
```

To speed up the extraction, I use Python's `multiprocessing` library.

> The detailed code can be found in **`ExtractSVO_multi.py`**!

In [None]:
# Execute the multiprocessing extraction script inside this notebook's namespace.
svo_script_path = "/home/junhyuki/DLproject/PSC/2-ExtractSVO/ExtractSVO_multi.py"
# Read the source inside a `with` block so the file handle is closed right away
# (the bare open(...).read() left it open until garbage collection).
with open(svo_script_path) as svo_script_file:
    svo_script_source = svo_script_file.read()
# Compiling with the real filename makes tracebacks point at the script
# instead of "<string>".
exec(compile(svo_script_source, svo_script_path, "exec"))

# of Logical Processors : 20
Processing "2015" news data ...
original dataframe rows: 794284 -> after delete: 794284
loading data & position index >>> putting them into the queue
10000 th (S,V,O) extracted / 18.828808693091073 Minutes
20000 th (S,V,O) extracted / 36.84289424022039 Minutes
30000 th (S,V,O) extracted / 54.63262990315755 Minutes
40000 th (S,V,O) extracted / 72.84746944506963 Minutes
50000 th (S,V,O) extracted / 91.55406383275985 Minutes
60000 th (S,V,O) extracted / 111.03648264010748 Minutes
70000 th (S,V,O) extracted / 130.90460683902106 Minutes
80000 th (S,V,O) extracted / 151.10771512190502 Minutes
90000 th (S,V,O) extracted / 171.176607020696 Minutes
100000 th (S,V,O) extracted / 191.80335882902145 Minutes
110000 th (S,V,O) extracted / 213.33673591216404 Minutes
120000 th (S,V,O) extracted / 235.255088977019 Minutes
130000 th (S,V,O) extracted / 257.0998281637828 Minutes
140000 th (S,V,O) extracted / 279.48598434527713 Minutes
150000 th (S,V,O) extracted / 302.2639316