In [1]:
from az_common_funcs import *

2024-03-11 19:08:04,435 :: [INFO] :: Logger initialized WITHOUT file handler


In [2]:
import stanza

# Download the English language model
stanza.download('en')

sentence = 'Deemed universities charge huge fees'

# Build a neural pipeline that runs every processor up to dependency parsing
nlp = stanza.Pipeline('en', processors="tokenize,mwt,pos,lemma,depparse")

# Pass the sentence through the pipeline and keep a handle on its first sentence
doc = nlp(sentence)
first_sentence = doc.sentences[0]

# Print the raw dependencies of the first sentence
# Format - (Token, Index of head, Nature of dependency)
# Index starts from 1, 0 is reserved for ROOT
first_sentence.print_dependencies()

# One shared template keeps the header and the data rows aligned
ROW_FORMAT = "{:<15} | {:<10} | {:<15} "
print(ROW_FORMAT.format('Token', 'Relation', 'Head'))
print("-" * 50)

# Iterate over the syntactic words instead of `to_dict()` entries: with the
# `mwt` processor active, `to_dict()` also emits an entry for each multi-word
# token (range id, no 'head'/'deprel'), which breaks both the key lookup and
# the `head - 1` positional indexing. `words` holds exactly one item per
# dependency node, so a 1-based head index maps directly onto the list.
words = first_sentence.words
for word in words:
    head_text = words[word.head - 1].text if word.head > 0 else 'ROOT'
    print(ROW_FORMAT.format(word.text, word.deprel, head_text))


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-03-11 19:08:10 INFO: Downloading default packages for language: en (English) ...
2024-03-11 19:08:10 INFO: File exists: /home/azikre/stanza_resources/en/default.zip
2024-03-11 19:08:14 INFO: Finished downloading models and saved to /home/azikre/stanza_resources.
2024-03-11 19:08:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-03-11 19:08:14 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2024-03-11 19:08:15 INFO: Using device: cuda
2024-03-11 19:08:15 INFO: Loading: tokenize
2024-03-11 19:08:15 INFO: Loading: mwt
2024-03-11 19:08:15 INFO: Loading: pos
2024-03-11 19:08:16 INFO: Loading: lemma
2024-03-11 19:08:16 INFO: Loading: depparse
2024-03-11 19:08:16 INFO: Done loading processors!


('Deemed', 2, 'amod')
('universities', 3, 'nsubj')
('charge', 0, 'root')
('huge', 5, 'amod')
('fees', 3, 'obj')
Token           | Relation   | Head            
--------------------------------------------------
Deemed          | amod       | universities    
universities    | nsubj      | charge          
charge          | root       | ROOT            
huge            | amod       | fees            
fees            | obj        | charge          


In [3]:
import stanza

# English pipeline limited to what named-entity recognition needs.
# Note: this rebinds `nlp`, replacing any pipeline created in an earlier cell.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,ner',
)

2024-03-11 19:10:46 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-03-11 19:10:46 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

2024-03-11 19:10:46 INFO: Using device: cuda
2024-03-11 19:10:46 INFO: Loading: tokenize
2024-03-11 19:10:46 INFO: Loading: mwt
2024-03-11 19:10:46 INFO: Loading: ner
2024-03-11 19:10:47 INFO: Done loading processors!


In [4]:
# Three short sentences used as input for the entity-recognition demo
ner_text = "Chris Manning teaches at Stanford University. He lives in the Bay Area. Tomorrow He shall come."
doc = nlp(ner_text)

In [5]:
# Two-column table of every entity found in `doc`, sentence by sentence
entity_header = f"{'Entity Name':<30} | {'Entity Type':<30}"
entity_rows = [
    f'{ent.text:<30} | {ent.type:<30}'
    for sent in doc.sentences
    for ent in sent.ents
]

print(entity_header)
print("=" * 60)
# Joining first (rather than looping) still emits a single blank line when
# no entities were found.
print("\n".join(entity_rows))

Entity Name                    | Entity Type                   
Chris Manning                  | PERSON                        
Stanford University            | ORG                           
the Bay Area                   | LOC                           
Tomorrow                       | DATE                          


[('Chris Manning', 'PERSON'),
 ('Stanford University', 'ORG'),
 ('the Bay Area', 'LOC'),
 ('Tomorrow', 'DATE')]

In [8]:
import spacy

# Load the small English spaCy model and analyse one sentence.
# Note: this rebinds `nlp` and `doc` used by earlier cells.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

# Noun-chunk table: chunk text, its root token, the root's dependency label
# and the text of the root's head token.
chunk_header = ('TEXT', 'ROOT.TEXT', 'ROOT._DEP', 'ROOT.HEAD.TEXT')
print(" | ".join(f"{title:<30}" for title in chunk_header))
print("=" * 120)
for noun_chunk in doc.noun_chunks:
    chunk_root = noun_chunk.root
    chunk_cells = (noun_chunk.text, chunk_root.text, chunk_root.dep_, chunk_root.head.text)
    print(" | ".join(f"{cell:<30}" for cell in chunk_cells))

print()

# Named-entity table: entity text, character span and label.
entity_header = ('TEXT', 'START_CHAR', 'END_CHAR', 'LABEL_')
print(" | ".join(f"{title:<30}" for title in entity_header))
print("=" * 120)
for entity in doc.ents:
    entity_cells = (entity.text, entity.start_char, entity.end_char, entity.label_)
    print(" | ".join(f"{cell:<30}" for cell in entity_cells))

TEXT                           | ROOT.TEXT                      | ROOT._DEP                      | ROOT.HEAD.TEXT                
Autonomous cars                | cars                           | nsubj                          | shift                         
insurance liability            | liability                      | dobj                           | shift                         
manufacturers                  | manufacturers                  | pobj                           | toward                        

TEXT                           | START_CHAR                     | END_CHAR                       | LABEL_                        


In [9]:
# Re-run the analysis on a sentence containing named entities.
# Relies on `nlp` being the spaCy model loaded in a previous cell.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Every table row is four left-aligned, 30-character-wide columns
row_template = "{:<30} | {:<30} | {:<30} | {:<30}"
rule = "=" * 120

# Noun chunks with their root token, its dependency label and its head
print(row_template.format('TEXT', 'ROOT.TEXT', 'ROOT._DEP', 'ROOT.HEAD.TEXT'))
print(rule)
for noun_chunk in doc.noun_chunks:
    chunk_root = noun_chunk.root
    print(row_template.format(noun_chunk.text, chunk_root.text, chunk_root.dep_, chunk_root.head.text))

print()

# Named entities with their character offsets and labels
print(row_template.format('TEXT', 'START_CHAR', 'END_CHAR', 'LABEL_'))
print(rule)
for entity in doc.ents:
    print(row_template.format(entity.text, entity.start_char, entity.end_char, entity.label_))

TEXT                           | ROOT.TEXT                      | ROOT._DEP                      | ROOT.HEAD.TEXT                
Apple                          | Apple                          | nsubj                          | looking                       
U.K.                           | U.K.                           | dobj                           | buying                        

TEXT                           | START_CHAR                     | END_CHAR                       | LABEL_                        
Apple                          | 0                              | 5                              | ORG                           
U.K.                           | 27                             | 31                             | GPE                           
$1 billion                     | 44                             | 54                             | MONEY                         
