In [1]:
import numpy as np 
import pandas as pd 

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 51.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.7 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 41.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [3]:
import transformers

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline


In [5]:
# Load the dslim/bert-base-NER tokenizer and token-classification model and
# wrap them in a "ner" pipeline; `nlp` is what the example cells below call.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

In [6]:
# Raw pipeline output for one headline: in the recorded output each company
# name comes back as several word pieces ('A', '##gi', '##lent', ...).
example = "Agilent to acquire BioTek for $1.165B"

ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-ORG', 'score': 0.99779296, 'index': 1, 'word': 'A', 'start': 0, 'end': 1}, {'entity': 'I-ORG', 'score': 0.9767228, 'index': 2, 'word': '##gi', 'start': 1, 'end': 3}, {'entity': 'I-ORG', 'score': 0.9949825, 'index': 3, 'word': '##lent', 'start': 3, 'end': 7}, {'entity': 'B-ORG', 'score': 0.9995247, 'index': 6, 'word': 'B', 'start': 19, 'end': 20}, {'entity': 'B-ORG', 'score': 0.93922216, 'index': 7, 'word': '##io', 'start': 20, 'end': 22}, {'entity': 'I-ORG', 'score': 0.99886537, 'index': 8, 'word': '##T', 'start': 22, 'end': 23}, {'entity': 'I-ORG', 'score': 0.9971501, 'index': 9, 'word': '##ek', 'start': 23, 'end': 25}]


In [7]:
# Title-cased headline: in the recorded output the capitalized words
# "Invests", "Expand" and "Operations" are (partly) tagged as ORG pieces.
example = "Amazon Invests $2B to Expand Operations in India"

ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-ORG', 'score': 0.999134, 'index': 1, 'word': 'Amazon', 'start': 0, 'end': 6}, {'entity': 'I-ORG', 'score': 0.99855894, 'index': 2, 'word': 'In', 'start': 7, 'end': 9}, {'entity': 'I-ORG', 'score': 0.9901177, 'index': 3, 'word': '##ves', 'start': 9, 'end': 12}, {'entity': 'B-ORG', 'score': 0.90328836, 'index': 9, 'word': 'Ex', 'start': 22, 'end': 24}, {'entity': 'I-ORG', 'score': 0.5646667, 'index': 10, 'word': '##pan', 'start': 24, 'end': 27}, {'entity': 'I-ORG', 'score': 0.5163879, 'index': 12, 'word': 'Operations', 'start': 29, 'end': 39}, {'entity': 'B-LOC', 'score': 0.9994265, 'index': 14, 'word': 'India', 'start': 43, 'end': 48}]


In [8]:
# Switch to the dslim/bert-large-NER checkpoint.
# NOTE: this rebinds tokenizer, model and nlp, so every later call to nlp(...)
# uses the large model (including the earlier example cells if re-run).
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

Downloading:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

In [9]:
def get_entities_bert(ner_results, confidence_score=0.7):
  """Merge word-piece NER predictions into whole-token entities.

  Args:
    ner_results: iterable of dicts, each providing the keys 'word', 'entity',
      'score' and 'index' (the shape printed by the "ner" pipeline cells in
      this notebook).
    confidence_score: predictions whose 'score' is not strictly greater than
      this value are discarded before merging. Defaults to 0.7.

  Returns:
    A list of {'token': str, 'entity': str} dicts in input order. 'entity' is
    the label of the last piece merged into the token (so a merged name
    usually reports its final piece's tag rather than the leading B- tag).
    Tokens of one character or less are dropped.
  """
  entityList = []
  current_token = ''
  last_index = 0
  last_token_apostrophe = False

  # Filter out entities with low confidence.
  confident_results = [r for r in ner_results if r['score'] > confidence_score]

  for entity in confident_results:
    word = entity['word']

    # Decide how this piece relates to the entity currently being built.
    if word.startswith('##'):
      # Word-piece continuation: glue on without the '##' marker.
      text, joiner, extends_previous = word[2:], '', True
    elif word == "'" or last_token_apostrophe:
      # An apostrophe, or the piece right after one ("Aaron" + "'" + "s").
      text, joiner, extends_previous = word, '', True
    elif (entity['index'] - last_index) <= 1 and (last_index != 0):
      # Next word directly follows the previous kept piece: same entity.
      text, joiner, extends_previous = word, ' ', True
    else:
      text, joiner, extends_previous = word, '', False

    # '##' pieces leave the apostrophe flag untouched; any other piece sets it
    # according to whether that piece itself is an apostrophe.
    if not word.startswith('##'):
      last_token_apostrophe = (word == "'")

    if extends_previous and entityList:
      current_token += joiner + text
      entityList[-1] = { 'token' : current_token, 'entity' : entity['entity']}
    else:
      # Either a genuinely new entity, or a continuation piece with nothing to
      # attach to (its leading piece was filtered out above). The latter used
      # to raise IndexError on entityList[-1]; start a new entity instead.
      current_token = text
      entityList.append({ 'token' : current_token, 'entity' : entity['entity']})

    last_index = entity['index']

  # NOTE(review): '##' pieces and apostrophes are glued to the previous entity
  # without checking 'index', so a gap left by the confidence filter is not
  # detected - confirm whether that is intended.

  # Single-character tokens are dropped from the result.
  return [item for item in entityList if len(item['token']) > 1]

In [10]:
# Same Agilent headline through the large model, then merged into whole-word
# entities with get_entities_bert.
example = "Agilent to acquire BioTek for $1.165B"
#example = example.lower()

ner_results = nlp(example)
print(ner_results)

entityList = get_entities_bert(ner_results)
print(entityList)

[{'entity': 'B-ORG', 'score': 0.9995727, 'index': 1, 'word': 'A', 'start': 0, 'end': 1}, {'entity': 'I-ORG', 'score': 0.9944661, 'index': 2, 'word': '##gi', 'start': 1, 'end': 3}, {'entity': 'I-ORG', 'score': 0.9788536, 'index': 3, 'word': '##lent', 'start': 3, 'end': 7}, {'entity': 'B-ORG', 'score': 0.99958223, 'index': 6, 'word': 'B', 'start': 19, 'end': 20}, {'entity': 'I-ORG', 'score': 0.99762696, 'index': 7, 'word': '##io', 'start': 20, 'end': 22}, {'entity': 'I-ORG', 'score': 0.99851424, 'index': 8, 'word': '##T', 'start': 22, 'end': 23}, {'entity': 'I-ORG', 'score': 0.99901855, 'index': 9, 'word': '##ek', 'start': 23, 'end': 25}]
[{'token': 'Agilent', 'entity': 'I-ORG'}, {'token': 'BioTek', 'entity': 'I-ORG'}]


In [11]:
# Amazon headline (note the lower-case "invests" here); the recorded output
# contains only Amazon and India.
example = "Amazon invests $2B to Expand Operations in India"

ner_results = nlp(example)
print(ner_results)


entityList = get_entities_bert(ner_results)
print(entityList)

[{'entity': 'B-ORG', 'score': 0.99749714, 'index': 1, 'word': 'Amazon', 'start': 0, 'end': 6}, {'entity': 'B-LOC', 'score': 0.999537, 'index': 13, 'word': 'India', 'start': 43, 'end': 48}]
[{'token': 'Amazon', 'entity': 'B-ORG'}, {'token': 'India', 'entity': 'B-LOC'}]


In [12]:
# Headline mixing an organization, its ticker in parentheses and a person name.
example = "Broadcom Corporation (BRCM) President & CEO Scott A Mcgregor sells 20,000 Shares"

ner_results = nlp(example)
print(ner_results)


entityList = get_entities_bert(ner_results)
print(entityList)

[{'entity': 'B-ORG', 'score': 0.9990883, 'index': 1, 'word': 'Broad', 'start': 0, 'end': 5}, {'entity': 'I-ORG', 'score': 0.99768806, 'index': 2, 'word': '##com', 'start': 5, 'end': 8}, {'entity': 'I-ORG', 'score': 0.99908555, 'index': 3, 'word': 'Corporation', 'start': 9, 'end': 20}, {'entity': 'B-ORG', 'score': 0.9970906, 'index': 5, 'word': 'BR', 'start': 22, 'end': 24}, {'entity': 'I-ORG', 'score': 0.96153444, 'index': 6, 'word': '##CM', 'start': 24, 'end': 26}, {'entity': 'B-PER', 'score': 0.9992545, 'index': 11, 'word': 'Scott', 'start': 44, 'end': 49}, {'entity': 'I-PER', 'score': 0.9931559, 'index': 12, 'word': 'A', 'start': 50, 'end': 51}, {'entity': 'I-PER', 'score': 0.9991246, 'index': 13, 'word': 'M', 'start': 52, 'end': 53}, {'entity': 'I-PER', 'score': 0.95359385, 'index': 14, 'word': '##c', 'start': 53, 'end': 54}, {'entity': 'I-PER', 'score': 0.8601585, 'index': 15, 'word': '##g', 'start': 54, 'end': 55}, {'entity': 'I-PER', 'score': 0.6098421, 'index': 16, 'word': '##r

In [13]:
# Title-cased headline without named entities: the recorded run prints an empty
# list for both the raw pipeline output and the merged result.
example = "Alternative Investment Allocations Should Increase In 2017 Across The Board Due To Market Uncertainty"

ner_results = nlp(example)
print(ner_results)


entityList = get_entities_bert(ner_results)
print(entityList)

[]
[]


In [14]:
# Comma-separated fund names including the abbreviation "L.P.", whose letters
# come back as separate single-letter pieces in the recorded output.
example = "Saba Capital Management, L.P. Buys Allergan PLC, Alpine Total Dynamic Dividend Fund,"

ner_results = nlp(example)
print(ner_results)


entityList = get_entities_bert(ner_results)
print(entityList)

[{'entity': 'B-ORG', 'score': 0.9982614, 'index': 1, 'word': 'Sa', 'start': 0, 'end': 2}, {'entity': 'I-ORG', 'score': 0.99101406, 'index': 2, 'word': '##ba', 'start': 2, 'end': 4}, {'entity': 'I-ORG', 'score': 0.997468, 'index': 3, 'word': 'Capital', 'start': 5, 'end': 12}, {'entity': 'I-ORG', 'score': 0.998353, 'index': 4, 'word': 'Management', 'start': 13, 'end': 23}, {'entity': 'B-ORG', 'score': 0.86203694, 'index': 6, 'word': 'L', 'start': 25, 'end': 26}, {'entity': 'I-ORG', 'score': 0.8916233, 'index': 8, 'word': 'P', 'start': 27, 'end': 28}, {'entity': 'B-ORG', 'score': 0.99959606, 'index': 12, 'word': 'All', 'start': 35, 'end': 38}, {'entity': 'I-ORG', 'score': 0.9947069, 'index': 13, 'word': '##er', 'start': 38, 'end': 40}, {'entity': 'I-ORG', 'score': 0.9938067, 'index': 14, 'word': '##gan', 'start': 40, 'end': 43}, {'entity': 'I-ORG', 'score': 0.9990758, 'index': 15, 'word': 'P', 'start': 44, 'end': 45}, {'entity': 'I-ORG', 'score': 0.9984855, 'index': 16, 'word': '##LC', 's

In [15]:
# Several organizations in one headline; in the recorded output the comma after
# "Investors" is itself tagged I-ORG.
example = "Bulldog Investors, LLC Buys Korea Equity Fund, Stellar Acquisition III Inc, New York REIT Inc, ..."

ner_results = nlp(example)
print(ner_results)


entityList = get_entities_bert(ner_results)
print(entityList)

[{'entity': 'B-ORG', 'score': 0.99873954, 'index': 1, 'word': 'Bull', 'start': 0, 'end': 4}, {'entity': 'I-ORG', 'score': 0.9968284, 'index': 2, 'word': '##dog', 'start': 4, 'end': 7}, {'entity': 'I-ORG', 'score': 0.9978801, 'index': 3, 'word': 'In', 'start': 8, 'end': 10}, {'entity': 'I-ORG', 'score': 0.9935121, 'index': 4, 'word': '##ves', 'start': 10, 'end': 13}, {'entity': 'I-ORG', 'score': 0.9955329, 'index': 5, 'word': '##tors', 'start': 13, 'end': 17}, {'entity': 'I-ORG', 'score': 0.9842046, 'index': 6, 'word': ',', 'start': 17, 'end': 18}, {'entity': 'I-ORG', 'score': 0.98660684, 'index': 7, 'word': 'LLC', 'start': 19, 'end': 22}, {'entity': 'B-ORG', 'score': 0.9990299, 'index': 10, 'word': 'Korea', 'start': 28, 'end': 33}, {'entity': 'I-ORG', 'score': 0.9975528, 'index': 11, 'word': 'Equity', 'start': 34, 'end': 40}, {'entity': 'I-ORG', 'score': 0.9977875, 'index': 12, 'word': 'Fund', 'start': 41, 'end': 45}, {'entity': 'B-ORG', 'score': 0.9991146, 'index': 14, 'word': 'Stella

In [16]:
# Law-firm name containing "&" and the suffix "P.C."; in the recorded output the
# letters of "P.C." are tagged LOC with scores below the 0.7 cut-off.
example = "Bragar Eagel & Squire, P.C. Announces That a Class Action Lawsuit Has Been Filed Against ..."

ner_results = nlp(example)
print(ner_results)


entityList = get_entities_bert(ner_results)
print(entityList)

[{'entity': 'B-ORG', 'score': 0.98956376, 'index': 1, 'word': 'B', 'start': 0, 'end': 1}, {'entity': 'I-ORG', 'score': 0.913737, 'index': 2, 'word': '##raga', 'start': 1, 'end': 5}, {'entity': 'I-ORG', 'score': 0.95903605, 'index': 3, 'word': '##r', 'start': 5, 'end': 6}, {'entity': 'I-ORG', 'score': 0.99362236, 'index': 4, 'word': 'E', 'start': 7, 'end': 8}, {'entity': 'I-ORG', 'score': 0.8085601, 'index': 5, 'word': '##age', 'start': 8, 'end': 11}, {'entity': 'I-ORG', 'score': 0.9897614, 'index': 6, 'word': '##l', 'start': 11, 'end': 12}, {'entity': 'I-ORG', 'score': 0.99244225, 'index': 7, 'word': '&', 'start': 13, 'end': 14}, {'entity': 'I-ORG', 'score': 0.99576217, 'index': 8, 'word': 'S', 'start': 15, 'end': 16}, {'entity': 'I-ORG', 'score': 0.99170554, 'index': 9, 'word': '##quire', 'start': 16, 'end': 21}, {'entity': 'B-LOC', 'score': 0.6613107, 'index': 11, 'word': 'P', 'start': 23, 'end': 24}, {'entity': 'I-LOC', 'score': 0.6518834, 'index': 13, 'word': 'C', 'start': 25, 'end

In [17]:
# Investment-firm headline listing several companies, one of them written with
# a slash (IAC/InterActiveCorp).
example = "Marcato Capital Management Llc Buys Univar Inc, Astec Industries Inc, IAC/InterActiveCorp, ..."

ner_results = nlp(example)
print(ner_results)


entityList = get_entities_bert(ner_results)
print(entityList)

[{'entity': 'B-ORG', 'score': 0.99945956, 'index': 1, 'word': 'Marc', 'start': 0, 'end': 4}, {'entity': 'I-ORG', 'score': 0.998026, 'index': 2, 'word': '##ato', 'start': 4, 'end': 7}, {'entity': 'I-ORG', 'score': 0.998359, 'index': 3, 'word': 'Capital', 'start': 8, 'end': 15}, {'entity': 'I-ORG', 'score': 0.9987785, 'index': 4, 'word': 'Management', 'start': 16, 'end': 26}, {'entity': 'I-ORG', 'score': 0.9962297, 'index': 5, 'word': 'L', 'start': 27, 'end': 28}, {'entity': 'I-ORG', 'score': 0.9835028, 'index': 6, 'word': '##l', 'start': 28, 'end': 29}, {'entity': 'I-ORG', 'score': 0.9666137, 'index': 7, 'word': '##c', 'start': 29, 'end': 30}, {'entity': 'B-ORG', 'score': 0.9995315, 'index': 10, 'word': 'Un', 'start': 36, 'end': 38}, {'entity': 'I-ORG', 'score': 0.9983436, 'index': 11, 'word': '##iva', 'start': 38, 'end': 41}, {'entity': 'I-ORG', 'score': 0.9971168, 'index': 12, 'word': '##r', 'start': 41, 'end': 42}, {'entity': 'I-ORG', 'score': 0.9989749, 'index': 13, 'word': 'Inc', '

In [18]:
# Ticker-only headline: in the recorded output most pieces score below the 0.7
# cut-off, so only the fragments 'UN' and 'IT' survive the merge.
example = "New Purchases: UNVR, ASTE, THRM, Added Positions: IAC, RYAM, ITRI, FG, BLDR, VRTS, HZN, Reduced Positions: DXC, AIR, Sold Out: DECK, BWLD, BID, RCII"

ner_results = nlp(example)
print(ner_results)


entityList = get_entities_bert(ner_results)
print(entityList)

[{'entity': 'B-ORG', 'score': 0.7807537, 'index': 7, 'word': 'UN', 'start': 15, 'end': 17}, {'entity': 'B-ORG', 'score': 0.5281742, 'index': 10, 'word': 'AS', 'start': 21, 'end': 23}, {'entity': 'B-ORG', 'score': 0.9116241, 'index': 24, 'word': 'I', 'start': 50, 'end': 51}, {'entity': 'B-ORG', 'score': 0.76980984, 'index': 27, 'word': 'R', 'start': 55, 'end': 56}, {'entity': 'I-ORG', 'score': 0.38277635, 'index': 28, 'word': '##Y', 'start': 56, 'end': 57}, {'entity': 'I-ORG', 'score': 0.508054, 'index': 29, 'word': '##AM', 'start': 57, 'end': 59}, {'entity': 'B-ORG', 'score': 0.8095285, 'index': 31, 'word': 'IT', 'start': 61, 'end': 63}, {'entity': 'I-ORG', 'score': 0.68610436, 'index': 32, 'word': '##RI', 'start': 63, 'end': 65}, {'entity': 'B-ORG', 'score': 0.52713543, 'index': 34, 'word': 'F', 'start': 67, 'end': 68}, {'entity': 'B-ORG', 'score': 0.4987074, 'index': 37, 'word': 'B', 'start': 71, 'end': 72}]
[{'token': 'UN', 'entity': 'B-ORG'}, {'token': 'IT', 'entity': 'B-ORG'}]


In [19]:
# Two-word company name; in the recorded output the MISC pieces of
# "chikungunya" fall below the cut-off and only "Sun Pharma" is returned.
example = "Sun Pharma to develop chikungunya, zika drugs"

ner_results = nlp(example)
print(ner_results)


entityList = get_entities_bert(ner_results)
print(entityList)

[{'entity': 'B-ORG', 'score': 0.99957067, 'index': 1, 'word': 'Sun', 'start': 0, 'end': 3}, {'entity': 'I-ORG', 'score': 0.99915195, 'index': 2, 'word': 'Ph', 'start': 4, 'end': 6}, {'entity': 'I-ORG', 'score': 0.9989931, 'index': 3, 'word': '##arma', 'start': 6, 'end': 10}, {'entity': 'I-MISC', 'score': 0.44486672, 'index': 9, 'word': '##un', 'start': 29, 'end': 31}, {'entity': 'I-MISC', 'score': 0.5967901, 'index': 10, 'word': '##ya', 'start': 31, 'end': 33}]
[{'token': 'Sun Pharma', 'entity': 'I-ORG'}]


In [20]:
# Headline with an organization, a location and two names that the recorded
# output labels MISC (Seiyu, Nikkei).
example = "Walmart looks to exit Japan with Seiyu supermarket sale Nikkei"

ner_results = nlp(example)
print(ner_results)


entityList = get_entities_bert(ner_results)
print(entityList)

[{'entity': 'B-ORG', 'score': 0.99753547, 'index': 1, 'word': 'W', 'start': 0, 'end': 1}, {'entity': 'I-ORG', 'score': 0.99241114, 'index': 2, 'word': '##al', 'start': 1, 'end': 3}, {'entity': 'I-ORG', 'score': 0.9910721, 'index': 3, 'word': '##mart', 'start': 3, 'end': 7}, {'entity': 'B-LOC', 'score': 0.9995981, 'index': 7, 'word': 'Japan', 'start': 22, 'end': 27}, {'entity': 'B-MISC', 'score': 0.83560604, 'index': 9, 'word': 'Se', 'start': 33, 'end': 35}, {'entity': 'I-MISC', 'score': 0.76857173, 'index': 10, 'word': '##i', 'start': 35, 'end': 36}, {'entity': 'I-MISC', 'score': 0.84839094, 'index': 11, 'word': '##yu', 'start': 36, 'end': 38}, {'entity': 'B-MISC', 'score': 0.9566216, 'index': 14, 'word': 'Nik', 'start': 56, 'end': 59}, {'entity': 'I-MISC', 'score': 0.9408264, 'index': 15, 'word': '##ke', 'start': 59, 'end': 61}, {'entity': 'I-MISC', 'score': 0.9859191, 'index': 16, 'word': '##i', 'start': 61, 'end': 62}]
[{'token': 'Walmart', 'entity': 'I-ORG'}, {'token': 'Japan', 'en

In [21]:
# "Drive.ai" is reduced to "Drive" in the recorded output: the trailing '##i'
# piece scores below the 0.7 cut-off and is filtered out before merging.
example = "Apple buys self driving car startup Drive.ai"

ner_results = nlp(example)
print(ner_results)


entityList = get_entities_bert(ner_results)
print(entityList)

[{'entity': 'B-ORG', 'score': 0.9945722, 'index': 1, 'word': 'Apple', 'start': 0, 'end': 5}, {'entity': 'B-ORG', 'score': 0.9505366, 'index': 8, 'word': 'Drive', 'start': 36, 'end': 41}, {'entity': 'I-ORG', 'score': 0.5202128, 'index': 11, 'word': '##i', 'start': 43, 'end': 44}]
[{'token': 'Apple', 'entity': 'B-ORG'}, {'token': 'Drive', 'entity': 'B-ORG'}]


In [22]:
# Possessive company name: exercises the apostrophe handling in
# get_entities_bert ("Aaron" + "'" + "s" -> "Aaron's").
example = "Aaron's Acquires Crusader - Analyst Blog"

ner_results = nlp(example)
print(ner_results)


entityList = get_entities_bert(ner_results)
print(entityList)

[{'entity': 'B-PER', 'score': 0.71067655, 'index': 1, 'word': 'Aaron', 'start': 0, 'end': 5}, {'entity': 'I-ORG', 'score': 0.9187889, 'index': 2, 'word': "'", 'start': 5, 'end': 6}, {'entity': 'I-ORG', 'score': 0.91359866, 'index': 3, 'word': 's', 'start': 6, 'end': 7}, {'entity': 'B-MISC', 'score': 0.8616119, 'index': 8, 'word': 'Crusade', 'start': 17, 'end': 24}, {'entity': 'I-MISC', 'score': 0.94030327, 'index': 9, 'word': '##r', 'start': 24, 'end': 25}]
[{'token': "Aaron's", 'entity': 'I-ORG'}, {'token': 'Crusader', 'entity': 'I-MISC'}]


In [23]:
import spacy

In [24]:
# Load spaCy's small English pipeline; its entities are printed below for the
# same headlines that were run through the BERT pipeline.
nlp_pipeline = spacy.load("en_core_web_sm")

In [25]:
# Print every spaCy entity with its token span, character span and label.
doc3 = nlp_pipeline("Sun Pharma to develop chikungunya, zika drugs")
for ent in doc3.ents: 
   print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

Sun Pharma 0 2 0 10 ORG


In [26]:
# Ticker list: in the recorded output spaCy labels most tickers ORG, but also
# tags the phrase "Sold Out" as GPE.
doc3 = nlp_pipeline("New Purchases: UNVR, ASTE, THRM, Added Positions: IAC, RYAM, ITRI, FG, BLDR, VRTS, HZN, Reduced Positions: DXC, AIR, Sold Out: DECK, BWLD, BID, RCII")
for ent in doc3.ents: 
   print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

ASTE 5 6 21 25 ORG
IAC 12 13 50 53 ORG
RYAM 14 15 55 59 ORG
ITRI 16 17 61 65 ORG
FG 18 19 67 69 ORG
BLDR 20 21 71 75 ORG
VRTS 22 23 77 81 ORG
HZN 24 25 83 86 ORG
AIR 31 32 112 115 ORG
Sold Out 33 35 117 125 GPE
BWLD 38 39 133 137 ORG
BID 40 41 139 142 ORG


In [27]:
# In the recorded output spaCy absorbs the verb: "Marcato Capital Management
# Llc Buys Univar Inc" comes back as a single ORG span.
doc3 = nlp_pipeline("Marcato Capital Management Llc Buys Univar Inc, Astec Industries Inc, IAC/InterActiveCorp, ...")
for ent in doc3.ents: 
   print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

Marcato Capital Management Llc Buys Univar Inc 0 7 0 46 ORG
Astec Industries Inc 8 11 48 68 ORG
IAC/InterActiveCorp 12 15 70 89 ORG


In [28]:
# Law-firm headline: the recorded output finds the firm name as ORG but labels
# "P.C. Announces" as GPE.
doc3 = nlp_pipeline("Bragar Eagel & Squire, P.C. Announces That a Class Action Lawsuit Has Been Filed Against ...")
for ent in doc3.ents: 
   print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

Bragar Eagel & Squire 0 4 0 21 ORG
P.C. Announces 5 7 23 37 GPE


In [29]:
# Bulldog headline with a lower-case "buys" (the BERT cell used "Buys"); the
# recorded output lists each organization as its own ORG span.
doc3 = nlp_pipeline("Bulldog Investors, LLC buys Korea Equity Fund, Stellar Acquisition III Inc, New York REIT Inc, ...")
for ent in doc3.ents: 
   print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

Bulldog Investors 0 2 0 17 ORG
LLC 3 4 19 22 ORG
Korea Equity Fund 5 8 28 45 ORG
Stellar Acquisition III Inc 9 13 47 74 ORG
New York REIT Inc 14 18 76 93 ORG


In [30]:
# With the capitalized "Buys", the recorded output merges "L.P. Buys Allergan
# PLC" into one ORG span.
doc3 = nlp_pipeline("Saba Capital Management, L.P. Buys Allergan PLC, Alpine Total Dynamic Dividend Fund,")
for ent in doc3.ents: 
   print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

Saba Capital Management 0 3 0 23 ORG
L.P. Buys Allergan PLC 4 8 25 47 ORG
Alpine Total Dynamic Dividend Fund 9 14 49 83 ORG


In [31]:
# Plan for combining the two models: compare each BERT entity with each spaCy entity.
# - BERT handles verbs better (e.g. it leaves "Buys" out of entity spans, unlike spaCy in the cells above).
# - If a BERT ORG entity is missing from the spaCy list, add it.
# - If a spaCy ORG entity is missing from the BERT list, add it.
# - If a spaCy ORG entity starts with a BERT entity but is longer, keep the spaCy entity (spaCy does a better job in that case).

In [32]:
# All-lower-case verbs: the recorded output has Amazon (ORG), 2B (MONEY) and
# India (GPE).
doc3 = nlp_pipeline("Amazon invests $2B to expand operations in India")
for ent in doc3.ents: 
   print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

Amazon 0 1 0 6 ORG
2B 3 4 16 18 MONEY
India 8 9 43 48 GPE


In [33]:
# Organization plus a two-word location ("New York").
doc3 = nlp_pipeline("Amazon to open checkout free store in New York")
for ent in doc3.ents: 
   print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

Amazon 0 1 0 6 ORG
New York 7 9 38 46 GPE


In [34]:
# Short acquisition headline: the recorded output only finds "Coors", missing
# both "Molson" and "StarBev".
doc3 = nlp_pipeline("Molson Coors Buys StarBev")
for ent in doc3.ents: 
   print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

Coors 1 2 7 12 ORG


In [35]:
# Repeats the Sun Pharma headline already run through spaCy earlier in the notebook.
doc3 = nlp_pipeline("Sun Pharma to develop chikungunya, zika drugs")
for ent in doc3.ents: 
   print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

Sun Pharma 0 2 0 10 ORG


In [36]:
# In the recorded output spaCy returns the full "Drive.ai" as one ORG span.
doc3 = nlp_pipeline("Apple buys self driving car startup Drive.ai")
for ent in doc3.ents: 
   print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

Apple 0 1 0 5 ORG
Drive.ai 6 7 36 44 ORG


In [37]:
# Possessive name: the recorded output labels "Aaron" PERSON and groups
# "Acquires Crusader" into one ORG span.
doc3 = nlp_pipeline("Aaron's Acquires Crusader - Analyst Blog")
for ent in doc3.ents: 
   print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

Aaron 0 1 0 5 PERSON
Acquires Crusader 2 4 8 25 ORG
