In [1]:
# Import stanza
import stanza
stanza.install_corenlp(dir=corenlp_dir)

# Setting StanfordCoreNLP
import os
import stanfordnlp
import pandas as pd
from tqdm import tqdm
corenlp_dir = './corenlp'
os.environ["CORENLP_HOME"] = corenlp_dir
from stanfordnlp.server import CoreNLPClient

# Setting AllenNLP
from allennlp.predictors.predictor import Predictor
from allennlp_models.coref.predictors.coref import CorefPredictor
from allennlp_models.coref.dataset_readers.conll import ConllCorefReader
print('\nInitialising AllenNLP Library models')
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz")
print('\nDone initialising AllenNLP Library models!\n')

# Initialising CoreNLPClient
print('\nInitialising CoreNLPClient')
nlp = CoreNLPClient(annotators=['ner'], memory='4G', endpoint='http://localhost:9001', output_format='json')
print('\nDone initialising CoreNLPClient!\n')

# Input text
text = "The 1Malaysia Development Berhad scandal (1MDB scandal) is an ongoing political scandal occurring in Malaysia. In 2015, Malaysia's then Prime Minister Najib Razak  was accused of channelling over RM 2.67 billion (approximately US $700 million) from 1Malaysia Development Berhad (1MDB), a government run strategic development company (masterminded by Low Taek Jho, commonly referred to as Jho Low), to his personal bank accounts. The event triggered widespread criticism among Malaysians, with many calling for Najib Razak's resignation including Mahathir Mohamad, one of Najib Razak's predecessors as Prime Minister, who later defeated Najib Razak in the 2018 general election and returned to power. Anwar Ibrahim, a political leader in opposition to Najib Razak, openly questioned 1MDB's credentials. He told Parliament that, according to records held by the Companies Commission, the company has no business address and no appointed auditor. According to its publicly filed accounts, 1MDB had nearly RM 42 billion (US $11.73 billion) in debt. Some of this debt resulted from a $3 billion state guaranteed 2013 bond issue led by the investment bank Goldman Sachs, who have been reported as receiving fees of up to $300 million for the deal, although the bank disputes this figure. The Malaysian Conference of Rulers called for prompt investigation of the scandal, saying that it was causing a crisis of confidence in Malaysia. After the 2018 election, the newly elected Prime Minister, Mahathir Mohamad, reopened the investigation into the 1MDB scandal. Malaysian authorities barred Najib Razak from leaving the country, then seized cash and valuable items from premises linked to him. Najib Razak was charged with criminal breach of trust, money laundering and abuse of power, while Jho Low was charged with money laundering. The US Department of Justice pursued its own investigation into 1MDB, alleging that more than US $4.5 billion was diverted from 1MDB by Jho Low and other conspirators including officials from Malaysia, Saudi Arabia and the United Arab Emirates. Najib Razak was subsequently found guilty of seven charges connected to SRC International, a dummy corporation associated with 1MDB, and was sentenced to twelve years imprisonment."

# Resolve text (Replace pronouns into references)
print('\nResolving Text')
allennlp_resolved = predictor.coref_resolved(text)
print('\nText is resolved!\n')

# Annotate texts with Named Entity Recognitions
print('\nAnnotating Resolved Text')
annotated = nlp.annotate(allennlp_resolved)
print('\nResolved Text is annotated!\n')

# Extract the annotated texts
print('\nExtracting Annotated Text')
annotated_sentences = annotated['sentences']
rows = []
for s in tqdm(range(len(annotated_sentences))):
    for r in range(s):
        try:
            entity = annotated_sentences[s]['entitymentions'][r]['text']
            start_index = annotated_sentences[s]['entitymentions'][r]['characterOffsetBegin']
            end_index = annotated_sentences[s]['entitymentions'][r]['characterOffsetEnd']
            subjectSpan = list([start_index, end_index])
            ner = annotated_sentences[s]['entitymentions'][r]['ner']
            nerconfidence = annotated_sentences[s]['entitymentions'][r]['nerConfidences'].values()
            #regexner = annotated_sentences[s]['tokens'][r]['ner']
            row = list([entity, subjectSpan, ner, nerconfidence])
            rows.append(row)
        except:
            pass
print('\nDone Extraction!\n')    

# View the output in a dataframe
entity_df = pd.DataFrame(rows, columns = ['Entity', 'SubjectSpan', 'NER', 'Confidence'])
entity_df


Initialising AllenNLP Library models


Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Did not use initialization regex that was passed: _context_layer._module.weight_ih.*
Did not use initialization regex that was passed: _context_layer._module.weight_hh.*



Done initialising AllenNLP Library models!

Initialising CoreNLPClient

Done initialising CoreNLPClient!


Resolving Text

Text is resolved!


Annotating Resolved Text
Starting server with command: java -Xmx4G -cp ./corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-fd09bd5d71ec4f37.props -preload ner


100%|██████████| 13/13 [00:00<00:00, 18272.77it/s]


Resolved Text is annotated!


Extracting Annotated Text

Done Extraction!






Unnamed: 0,Entity,SubjectSpan,NER,Confidence
0,2015,"[138, 142]",DATE,(0.95941944460213)
1,Malaysians,"[424, 434]",MISC,(0.99317849656546)
2,Malaysia,"[458, 466]",COUNTRY,(0.99651137635639)
3,Anwar Ibrahim,"[741, 754]",PERSON,(0.99939204674679)
4,Malaysia,"[792, 800]",COUNTRY,(0.9994433999054)
5,Anwar Ibrahim,"[898, 911]",PERSON,(0.99939204674986)
6,Najib Razak,"[949, 960]",PERSON,(0.99589418715533)
7,Parliament,"[966, 976]",ORGANIZATION,(0.96904096276024)
8,Development Berhad,"[1140, 1158]",ORGANIZATION,(0.81835820581105)
9,Development Berhad,"[1196, 1214]",ORGANIZATION,(0.74267153223559)
