In [1]:
import spacy

NER Annotator, a browser-based tool for labelling NER training data: https://tecoholic.github.io/ner-annotator/

In [6]:
# Load the pretrained en_core_web_sm pipeline; its built-in NER is tried on the sample text below.
nlp = spacy.load("en_core_web_sm")

In [9]:
text = """Seeking legal resources for a land dispute case in Westchester. 
Specifically interested in real estate and property law expertise, with a focus on zoning regulations and property rights. 
Require support from a Spanish-speaking legal team and prefer an office located within the Westchester area. 
Appreciate any relevant case studies, legal documentation, or scholarly articles."""


In [2]:
# Vocabulary handed to GPT when generating synthetic training prompts.
# Each list holds lower-case values for one entity category.

# Legal practice areas.
practice_areas = [
    'labour', 'corporate', 'intellectual property', 'criminal', 'family', 'real estate', 'property',
    'administrative', 'commercial', 'bankruptcy', 'immigration', 'tax', 'civil', 'health', 'insurance',
    'construction', 'dispute resolution', 'environmental', 'lawsuit', 'business', 'competition',
    'constitutional', 'education',
]

# Office locations (US cities). 'nashville' was previously misspelled as
# 'nashbille', which would have propagated the typo into generated data.
locations = [
    'chicago', 'new york', 'washington', 'pittsburgh', 'los angeles', 'boston', 'miami', 'atlanta',
    'richmond', 'milwaukee', 'seattle', 'san francisco', 'palo alto', 'cleveland', 'san diego',
    'houston', 'kansas city', 'nashville', 'philadelphia', 'detroit', 'dallas',
]

# Languages spoken by the legal team.
languages = [
    'english', 'french', 'spanish', 'german', 'chinese', 'japanese', 'korean', 'arabic', 'portuguese',
    'russian', 'hindi', 'malay', 'thai',
]

In [10]:
# Run the current `nlp` pipeline over the sample request text.
doc = nlp(text)

In [12]:
# Printing a Doc shows its original text, confirming what was processed.
print (doc)

Seeking legal resources for a land dispute case in Westchester. 
Specifically interested in real estate and property law expertise, with a focus on zoning regulations and property rights. 
Require support from a Spanish-speaking legal team and prefer an office located within the Westchester area. 
Appreciate any relevant case studies, legal documentation, or scholarly articles.


In [15]:
# Show every entity the pretrained pipeline found, with its label.
for ent in doc.ents:
    print(ent.text, ":", ent.label_)

# Keep only the entity texts whose label is LAW or LOC.
legal_entities = [ent.text for ent in doc.ents if ent.label_ in ("LAW", "LOC")]

print("Legal entities found:")
for entity in legal_entities:
    print(entity)



Westchester : ORG
Specifically : ORG
Spanish : LANGUAGE
Westchester : LOC
Legal entities found:
Westchester


In [16]:
# List every entity label that the `ner` component of the current pipeline can assign.
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [38]:
# Setting up a custom entity:
# train a blank English pipeline on a custom PRACTICE_AREA label, save it to
# disk, reload it, and try it on one test sentence.

from spacy.training import Example

nlp = spacy.blank("en")

label = 'PRACTICE_AREA'

# spaCy entity annotations are (start_char, end_char, label) CHARACTER offsets.
# The previous values (2, 4) and (5, 7) were token positions; read as character
# offsets they point inside the word "interested", so no usable entity was
# given to the model. Both spans also belong to the same sentence, so they are
# annotated together in ONE example; splitting them into two examples would
# label each span as a non-entity in the other example.
train_data = [
    (
        "interested in real estate and property law expertise, with a focus on zoning regulations and property rights",
        {"entities": [
            (14, 25, label),  # "real estate"
            (30, 42, label),  # "property law"
        ]},
    ),
]

# Add the custom entity to the model's pipeline
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Add your custom label to the model's entity recognizer
ner.add_label(label)

# Build the Example objects once; they are reused for initialization and for
# every training iteration.
examples = [
    Example.from_dict(nlp.make_doc(sample_text), sample_annotations)
    for sample_text, sample_annotations in train_data
]

# Begin the training process.
# nlp.initialize() replaces the deprecated nlp.begin_training() in spaCy v3 and
# returns the optimizer that the update calls below should share.
optimizer = nlp.initialize(lambda: examples)

# Train the model with your annotated data.
# A single pass over the data is not enough for the weights to move
# meaningfully, so the examples are repeated for a fixed number of iterations.
n_iter = 30
for _ in range(n_iter):
    losses = {}
    nlp.update(examples, sgd=optimizer, losses=losses)

# Save the trained model
nlp.to_disk("custom_ner_model")

# Load the trained custom NER model
nlp = spacy.load("custom_ner_model")  # Replace with the path to your saved model


# Sample text for testing
test_text = "We need legal advice for a potential real estate transaction."

# Apply the NER model to the test text
doc = nlp(test_text)

# Extract entities from the test text
# NOTE(review): with a single training sentence the model may still find
# nothing here; more annotated examples are needed for useful predictions.
for ent in doc.ents:
    print(ent.text, ent.label_)

print('is this working')

is this working


In [32]:
# Sentence used to probe the pipeline; it contains the phrase "real estate".
test_text = "We need legal advice for a potential real estate transaction."

In [33]:
# Run whichever pipeline `nlp` currently refers to over the test sentence.
doc = nlp(test_text)

In [36]:
# Print each recognised entity span, then its text and label.
# The recorded run of this cell shows no output, i.e. doc.ents was empty.
for ent in doc.ents:
    print (ent)
    print(ent.text, ":", ent.label_)

In [35]:
# Print the processed Doc to confirm the text was passed through the pipeline.
print(doc)

We need legal advice for a potential real estate transaction.


Building a custom NER model with spaCy

In [39]:
# Show the installed spaCy version, platform, Python version and installed pipelines.
!python -m spacy info


[1m

spaCy version    3.5.3                         
Location         C:\Users\gray.kim\AppData\Local\anaconda3\Lib\site-packages\spacy
Platform         Windows-10-10.0.19045-SP0     
Python version   3.11.3                        
Pipelines        en_core_web_sm (3.5.0)        



In [2]:

import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # create a blank English pipeline (nothing pretrained is loaded)
db = DocBin() # create a DocBin object to collect the annotated docs for saving

In [3]:

import json

# Read the annotated training data exported as JSON.
# The context manager closes the file handle once it has been parsed (the
# previous bare open() left it open), and the explicit encoding avoids
# depending on the platform default when decoding the JSON text.
with open('custom_ner_model/sample_data/training_data.json', encoding='utf-8') as f:
    train_data = json.load(f)

In [56]:
# Peek at the first annotated example: [text, {'entities': [[start, end, label], ...]}].
train_data['annotations'][0]

['Our legal team specializing in labour law is currently managing a case in Chicago. We require comprehensive legal resources and expertise in employment contracts and labor disputes. Fluency in Spanish within the legal team is essential for effective communication with our diverse workforce. Please provide any relevant case studies or legal documentation related to labour law.\r',
 {'entities': [[31, 41, 'PRACTICE_AREA'],
   [74, 82, 'LOCATION'],
   [141, 151, 'PRACTICE_AREA'],
   [152, 161, 'PRACTICE_AREA'],
   [166, 181, 'PRACTICE_AREA'],
   [193, 200, 'LANG'],
   [367, 377, 'PRACTICE_AREA']]}]

In [57]:
# Convert the JSON annotations into spaCy's binary training format.
# The DocBin is rebuilt from scratch here so that re-running this cell does not
# append the same documents to an already populated `db` a second time.
db = DocBin()

for text, annot in tqdm(train_data['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" shrinks a span to the tokens fully inside [start, end);
        # char_span returns None when no whole token fits.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report which annotation was dropped so it can be fixed in the source data.
            print(f"Skipping entity {label!r} at [{start}, {end}): {text[start:end]!r}")
        else:
            ents.append(span)
    # doc.ents rejects overlapping spans; keep the longest of any overlapping
    # group instead of letting one bad annotation abort the conversion.
    kept = spacy.util.filter_spans(ents)
    if len(kept) != len(ents):
        print(f"Dropped {len(ents) - len(kept)} overlapping entity span(s)")
    doc.ents = kept
    db.add(doc)

db.to_disk("./training_data.spacy") # save the docbin object

100%|██████████| 6/6 [00:00<00:00, 1032.74it/s]


In [58]:

# Generate config.cfg: an English pipeline containing only `ner`, optimized for efficiency.
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[!] To generate a more effective transformer-based config (GPU-only), install
the spacy-transformers package and re-run this command. The config generated now
does not use transformers.
[i] Generated config template specific for your use case
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [59]:
# Train with the generated config and write the resulting pipeline under ./
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R scores are measured on the training data itself and
# say nothing about generalisation. Evaluate on a held-out dev set instead.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy


[i] Saving to output directory: .
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     31.61    0.00    0.00    0.00    0.00
 33     200        314.95   1157.86  100.00  100.00  100.00    1.00
 82     400          0.00      0.00  100.00  100.00  100.00    1.00
146     600          0.00      0.00  100.00  100.00  100.00    1.00
225     800          0.00      0.00  100.00  100.00  100.00    1.00
325    1000          0.00      0.00  100.00  100.00  100.00    1.00
425    1200          0.00      0.00  100.00  100.00  100.00    1.00
584    1400          0.00      0.00  100.00  100.00  100.00    1.00
784    1600          0.00      0.00  100.00  100.00  100.00    1.00
984    1800          0.00      0.00  100.00  100.00  100.00    1.00
[+] Saved pipeline to output directory

[2023-10-14 13:33:39,537] [INFO] Set up nlp object from config
[2023-10-14 13:33:39,552] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-10-14 13:33:39,557] [INFO] Created vocabulary
[2023-10-14 13:33:39,557] [INFO] Finished initializing nlp object
[2023-10-14 13:33:39,730] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


## Model testing

In [1]:
#lib import
import spacy

#func
#categorize into keyword buckets
def buckets(prompt):
    """Group the entity texts of a processed document by entity label.

    Args:
        prompt: An object exposing ``.ents``, an iterable of entities that
            each have ``.label_`` and ``.text`` attributes.

    Returns:
        A 4-tuple of lists ``(practice_areas, expertise, languages,
        locations)`` holding the texts of entities labelled
        ``PRACTICE_AREA``, ``EXPERTISE``, ``LANG`` and ``LOCATION``
        respectively, in document order. Entities with any other label are
        ignored.
    """
    # One list per label of interest; insertion order matches the return order.
    grouped = {
        'PRACTICE_AREA': [],
        'EXPERTISE': [],
        'LANG': [],
        'LOCATION': [],
    }

    for entity in prompt.ents:
        target = grouped.get(entity.label_)
        if target is not None:
            target.append(entity.text)

    return grouped['PRACTICE_AREA'], grouped['EXPERTISE'], grouped['LANG'], grouped['LOCATION']

In [2]:
# Load the trained pipeline from the ./model-best directory
# (presumably written by the `spacy train --output ./` run; confirm the path).
nlp_ner = spacy.load("model-best") 

In [7]:
long_prompt = nlp_ner("""Our legal team, specializing in corporate law, is currently managing a complex business acquisition case in Washington. We require comprehensive legal resources and expertise in mergers and acquisitions, with a specific focus on due diligence and contract negotiations. Fluency in Spanish within the legal team is crucial for effective communication with our diverse international stakeholders. Our legal office, strategically located in the heart of Washington, allows us to stay updated with the latest legislative changes and business regulations.

In addition to our involvement in corporate law, we have recently taken on a significant construction dispute case in Boston. We are seeking expert guidance in construction law and arbitration procedures to resolve the ongoing disputes over contractual obligations and project timelines. Proficiency in Mandarin within the legal team is essential as we are working closely with several Chinese construction firms involved in the dispute. Our extensive experience in handling complex construction cases has enabled us to provide effective legal strategies and negotiation tactics to ensure swift resolutions.

Furthermore, our firm has been dedicated to providing comprehensive legal services in intellectual property law in San Francisco for over two decades. We specialize in patent registrations, trademark infringements, and copyright litigations, catering to a diverse clientele ranging from tech startups to established multinational corporations. With our multilingual legal team proficient in German, Japanese, and French, we have successfully represented our clients in numerous high-profile IP cases both domestically and internationally.

As part of our commitment to serving our clients' needs, we have expanded our practice to include healthcare law and regulatory compliance in New York. Our legal experts are well-versed in healthcare regulations, compliance standards, and litigation procedures, providing strategic counsel to healthcare providers, institutions, and pharmaceutical companies. Fluency in Arabic within our legal team has enabled us to effectively communicate with our Middle Eastern clients and navigate the complexities of healthcare regulations in the international market.

Our firm's diverse expertise extends to environmental law, with a particular focus on sustainability and green initiatives in Los Angeles. We have successfully represented clients in cases involving environmental compliance, renewable energy projects, and land development regulations. Our proficiency in Korean and Chinese languages has facilitated our engagement with international stakeholders seeking our legal guidance on various environmental initiatives and projects.

With our dedication to providing top-tier legal services across diverse practice areas and our multilingual capabilities, we remain committed to serving our clients with the highest standards of professionalism and expertise.""")


In [8]:
# Bucket the entities found in the first long prompt and print one line per bucket.
p, e, la, lo = buckets(long_prompt)

for heading, found in (
    ('Practice Areas: ', p),
    ('Expertise: ', e),
    ('Language: ', la),
    ('Location: ', lo),
):
    print(heading, found)


Practice Areas:  ['corporate law', 'mergers and', 'corporate law', 'construction law', 'intellectual property', 'patent registrations', 'environmental law', 'Korean', 'languages has']
Expertise:  ['business acquisition', 'acquisitions, with', 'diligence and contract', 'business regulations.', 'project timelines.', 'the dispute.', 'numerous high', 'internationally.', 'well-versed', 'healthcare regulations', 'litigation procedures,', 'cases involving', 'compliance,', 'development regulations.', 'projects.']
Language:  ['Washington.', 'Spanish', 'Mandarin', 'German', 'Japanese', 'French', 'Arabic', 'Middle Eastern clients', 'Chinese']
Location:  ['Boston.', 'San Francisco', 'New York', 'Los Angeles']


In [None]:
# Render the prompt with its recognised entities highlighted inline in the notebook.
spacy.displacy.render(long_prompt, style="ent", jupyter=True)

#### Test 2

In [3]:
long_prompt_2 = nlp_ner("""Our legal team, specializing in civil law, is currently handling a high-profile lawsuit in Houston. We require comprehensive legal resources and expertise in civil litigation, with a specific focus on dispute resolution and trial preparation. Fluency in French within the legal team is crucial for effective communication with our French-speaking clients. Our office, strategically located in the bustling city of Houston, allows us to provide timely and effective legal representation to our clients.

In addition to our involvement in civil law, we have recently taken on a complex administrative case in San Diego. We are seeking expert guidance in administrative law and regulatory compliance to navigate the intricate regulatory landscape and ensure our clients' interests are protected. Proficiency in Russian within the legal team is essential as we are representing several international clients facing administrative challenges. Our extensive experience in handling administrative cases has equipped us with the necessary skills to advocate for our clients effectively.

Furthermore, our firm has a strong emphasis on competition law and antitrust regulations in Miami. We specialize in providing legal counsel to businesses and corporations, ensuring compliance with competition laws and regulations, and representing clients in cases involving antitrust violations and unfair business practices. With our multilingual legal team proficient in Chinese, Spanish, and Portuguese, we have successfully represented clients in complex antitrust cases both domestically and internationally.

As part of our commitment to serving our clients' diverse needs, we have expanded our practice to include education law and policy advocacy in Philadelphia. Our legal experts are well-versed in education regulations, policy frameworks, and student rights, providing strategic counsel to educational institutions, students, and parents. Fluency in Arabic within our legal team has enabled us to effectively communicate with our Arabic-speaking clients and advocate for their educational rights in the region.

Our firm's diverse expertise also extends to the field of health insurance law, with a particular focus on ensuring fair and just insurance coverage for our clients in Dallas. We have successfully represented numerous clients in cases involving insurance disputes, coverage denials, and bad faith insurance practices. Our proficiency in Japanese and Korean languages has facilitated our communication with international clients seeking our legal guidance on various health insurance matters.

With our commitment to excellence and our multilingual capabilities, we are dedicated to providing top-tier legal services across diverse practice areas and ensuring the protection of our clients' rights and interests.""")


In [4]:
# Categorize the entities of the second prompt for export, then print each bucket.
p, e, la, lo = buckets(long_prompt_2)

headings = ('Practice Areas: ', 'Expertise: ', 'Language: ', 'Location: ')
for heading, found in zip(headings, (p, e, la, lo)):
    print(heading, found)


Practice Areas:  ['civil law', 'civil law', 'interests are', 'laws and']
Expertise:  ['civil litigation', 'ensure our clients', 'handling administrative', 'corporations, ensuring', 'regulations,', 'cases involving', 'business practices.', 'complex antitrust', 'internationally.', 'well-versed', 'education regulations', 'the region.', 'cases involving', 'disputes, coverage', 'rights and interests']
Language:  ['French', 'French-speaking', 'the bustling', 'Houston', 'Russian', 'Chinese', 'Spanish', 'Portuguese', 'Arabic', 'Arabic-speaking', 'Japanese', 'Korean languages']
Location:  ['Houston.', 'San Diego', 'Miami.', 'Philadelphia.', 'Dallas.']


In [14]:
# Render the second prompt with its recognised entities highlighted inline in the notebook.
spacy.displacy.render(long_prompt_2, style="ent", jupyter=True)

#### Test 3


In [15]:
lp3 = nlp_ner("""Our legal team, specializing in dispute resolution, is currently handling a complex case in Milwaukee. We require comprehensive legal resources and expertise in alternative dispute resolution methods and negotiation tactics. Fluency in Korean within the legal team is crucial for effective communication with our Korean-speaking clients. Our office, strategically located in the heart of Milwaukee, allows us to provide timely and effective legal representation to our clients.

In addition to our involvement in dispute resolution, we have recently taken on a sensitive commercial case in Palo Alto. We are seeking expert guidance in commercial law and contract disputes to ensure fair and just business transactions and agreements. Proficiency in Malay within the legal team is essential as we are representing several international companies facing commercial challenges. Our extensive experience in handling commercial cases has equipped us with the necessary skills to negotiate on behalf of our clients effectively.

Furthermore, our firm has a strong emphasis on environmental law and sustainability initiatives in Dallas. We specialize in providing legal counsel to businesses and organizations seeking to comply with environmental regulations and promote green practices. With our multilingual legal team proficient in Arabic, Spanish, and Portuguese, we have successfully guided clients through various environmental initiatives and provided effective solutions to promote sustainable practices.

As part of our commitment to serving our clients' diverse needs, we have expanded our practice to include insurance law and policy advocacy in Cleveland. Our legal experts are well-versed in insurance regulations, policy frameworks, and claim settlements, providing strategic counsel to policyholders and insurance companies. Fluency in Hindi within our legal team has enabled us to effectively communicate with our Hindi-speaking clients and advocate for their insurance rights in the region.

Our firm's diverse expertise also extends to the field of criminal law, with a particular focus on defending clients in Los Angeles. We have successfully represented numerous clients in cases involving criminal defense, plea negotiations, and trial representation. Our proficiency in Russian and Chinese languages has facilitated our communication with international clients seeking our legal guidance on various criminal law matters.

With our commitment to excellence and our multilingual capabilities, we are dedicated to providing top-tier legal services across diverse practice areas and ensuring the protection of our clients' rights and interests.""")

In [16]:
# Categorize the entities of the third prompt for export, then print each bucket.
p, e, la, lo = buckets(lp3)

for heading, found in zip(
    ('Practice Areas: ', 'Expertise: ', 'Language: ', 'Location: '),
    (p, e, la, lo),
):
    print(heading, found)

Practice Areas:  ['commercial law', 'environmental law', 'regulations and', 'languages has']
Expertise:  ['dispute resolution', 'alternative dispute', 'negotiation tactics.', 'dispute resolution', 'business transactions', 'handling commercial', 'well-versed', 'insurance regulations', 'insurance companies.', 'the region.', 'cases involving', 'rights and interests']
Language:  ['Korean', 'Korean-speaking', 'Malay', 'Arabic', 'Spanish', 'Portuguese', 'Hindi', 'Hindi-speaking', 'Russian', 'Chinese']
Location:  ['Milwaukee.', 'Palo Alto', 'Dallas.', 'Cleveland.', 'Los Angeles']


In [17]:
# Render the third prompt with its recognised entities highlighted inline in the notebook.
spacy.displacy.render(lp3, style="ent", jupyter=True)

### Test with a web-scraped bio

In [3]:
long_prompt = nlp_ner("""Mike Abcarian is managing partner of the firm's Dallas office. For over 30 years he has represented Fortune 500 corporations, units of local government, and local business interests in labor and employment matters. He has handled hundreds of lawsuits in federal and state courts with an exceptional success record, including lead counsel defense of complex litigation and nationwide class actions. Many of Mike's successful cases resulted in defense verdicts for employer clients following trial by jury. Mike also handles complex workplace safety matters, including fatality investigations, and has represented employers in high-visibility proceedings before the Occupational Safety and Health Administration (OSHA). He has handled significant compensation compliance matters--some involving thousands of employees--in proceedings before the Wage & Hour Division of the U. S. Department of Labor (USDOL). Mike also appears frequently before the Equal Employment Opportunity Commission (EEOC) defending employers in discrimination matters.  He also represents employers before the National Labor Relations Board (NLRB) in union representation proceedings and unfair labor practice proceedings, and in arbitration of labor disputes and labor contract negotiations. Throughout his career, Mike has been a sought-after speaker and a prolific author on labor and employment law issues. He is "AV" Peer Review Rated by Martindale-Hubbell for preeminent skill and ethics, and he has been listed in Texas Super Lawyers every year since 2004. Mike has also been listed in Best Lawyers in America since 2012 and was listed in Chambers USA since 2016. In 2018, Mike was inducted as a Fellow into The College of Labor and Employment Lawyers. Election as a Fellow is the highest recognition by an attorney's colleagues of sustained outstanding performance in the profession, exemplifying integrity, dedication and excellence. """)


In [4]:
# Bucket the entities found in the scraped bio and print one line per bucket.
# Note: `long_prompt` here is the bio document; the name is also used for the
# first synthetic test prompt earlier in the notebook, so cell order matters.
p, e, la, lo = buckets(long_prompt)

bucket_rows = [
    ('Practice Areas: ', p),
    ('Expertise: ', e),
    ('Language: ', la),
    ('Location: ', lo),
]
for heading, found in bucket_rows:
    print(heading, found)


Practice Areas:  ['labor and', 'employment matters.', 'federal and', 'litigation and', 'employees--', 'Labor (', 'labor disputes and', 'labor contract negotiations', 'labor and', 'employment law', 'Best Lawyers', 'Employment']
Expertise:  ['corporations,', 'defense verdicts', 'high-', 'proceedings before', 'discrimination matters.', 'union representation', 'arbitration of', 'ethics,', 'The College of Labor', 'the profession', 'excellence.']
Language:  ['Mike Abcarian is', 'Dallas office', 'Safety', 'Health Administration (', 'Commission', 'National Labor Relations Board (', 'Review Rated by', 'Martindale-Hubbell for', 'America', '2018']
Location:  ['Employment Opportunity', 'Texas Super', 'Chambers USA', 'Lawyers.']


In [5]:
# Conclusion for the bio test, recorded as cell output: entity recognition on the
# scraped bio was poor, so the training set needs bio-style examples.
print('bad recognition.  Train data with more bio data')

bad recognition.  Train data with more bio data
