In [None]:
!pip install -r requirements.txt

In [None]:
!python -m spacy download en_core_web_trf

In [None]:
# Entity types in label-id order. The order matters: the position of each tag
# in label_names is the class id used by id2label / label2id and the model.
ENTITY_TYPES = [
    'Age', 'Sex', 'Clinical Event', 'Symptom', 'Detailed Description',
    'History', 'Disease', 'Biological Structure', 'Diagnostic Procedure', 'Coreference',
    'Texture', 'Lab Value', 'Severity', 'Biological Attribute', 'Activity',
    'Duration', 'Shape', 'Therapeutic Procedure', 'Color', 'Date',
    'Outcome', 'Location', 'Distance', 'Area', 'Other Event',
    'Subject', 'Personal Background', 'Medication', 'Dosage', 'Occupation',
    'Administration', 'Frequency', 'Quantitative Concept', 'Family History', 'Qualitative Concept',
    'Volume', 'Time', 'Height', 'Weight', 'Other Entity',
    'Mass', 'Phone Number', 'Email',
    'Asset', 'Debt', 'Currency', 'Business', 'Expense',
    'Financial Institution', 'Financial Profession', 'Financial Services', 'Financial Product',
]

# Index 0 is the "outside" tag (written as the digit '0' in this notebook, and
# compared against '0' in the post-processing cells); every entity type then
# contributes a consecutive B-/I- pair.
label_names = ['0'] + [
    f'{prefix}-{entity_type}'
    for entity_type in ENTITY_TYPES
    for prefix in ('B', 'I')
]

In [None]:
# Two-way mapping between integer class ids and tag strings; the id of a tag
# is simply its position in label_names.
id2label = dict(enumerate(label_names))
label2id = {tag: class_id for class_id, tag in id2label.items()}
print(id2label)
print(label2id)
print(label_names)

In [None]:
from datasets import load_dataset

# JSON file backing each dataset split.
split_files = {
    'train': 'data/fin_med_traindata_new.json',
    'validation': 'data/fin_med_valdata_new.json',
    'test': 'data/fin_med_testdata_new.json',
}
raw_datasets = load_dataset("json", data_files=split_files)
raw_datasets

In [None]:
# Alias of label_names; compute_metrics indexes it to turn class ids back into
# tag strings, and the model cell uses its length as the number of labels.
label_list = label_names

In [None]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# The label-alignment step calls word_ids() on the tokenizer output, which is
# only provided by fast tokenizers. The class is imported by name because the
# bare `transformers` module is never imported in this notebook.
assert isinstance(tokenizer, PreTrainedTokenizerFast)

# Tokenize the first training example (already split into words) as a quick check.
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True, padding = True)

In [None]:
# When True, every sub-token of a word receives the word's tag; when False only
# the first sub-token is labelled and the remaining ones get -100.
label_all_tokens = True
# Prefix of the dataset column holding the tags (read as f"{task}_tags").
task = "ner"

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    `examples` is a batch with a "tokens" column (lists of words) and a
    f"{task}_tags" column (one tag id per word). The returned encoding gains a
    "labels" entry with one id per sub-token: -100 for positions that map to no
    word, the word's tag for the first sub-token of a word, and for the
    remaining sub-tokens either the word's tag (label_all_tokens=True) or -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding = True)

    aligned_labels = []
    for row, word_tags in enumerate(examples[f"{task}_tags"]):
        row_labels = []
        last_word = None
        for word_id in encoded.word_ids(batch_index=row):
            if word_id is None:
                # Position not backed by any input word.
                tag = -100
            elif word_id != last_word or label_all_tokens:
                # First sub-token of a word, or a later one when all are labelled.
                tag = word_tags[word_id]
            else:
                tag = -100
            row_labels.append(tag)
            last_word = word_id
        aligned_labels.append(row_labels)

    encoded["labels"] = aligned_labels
    return encoded

# Apply the tokenize-and-align step to every split, one batch of examples at a time.
tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator passed to the Trainer; it is built on the same tokenizer used for preprocessing.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
import evaluate

# seqeval metric; compute_metrics reads its overall precision/recall/F1/accuracy.
seqeval = evaluate.load("seqeval")

In [None]:
import numpy as np
def compute_metrics(p):
    """Compute overall seqeval precision, recall, F1 and accuracy.

    `p` unpacks into (predictions, labels). The predicted class is the argmax
    over axis 2 of the predictions. Positions whose gold label is -100 are
    dropped from both sequences before ids are mapped to tag strings through
    label_list.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels, zero_division = 1)
    return {
        metric: results[f"overall_{metric}"]
        for metric in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Load the same checkpoint the tokenizer was created from (model_checkpoint)
# instead of repeating the name, so the two cannot drift apart. The
# classification head is sized and named from the notebook's label mappings.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login.
# NOTE(review): the training arguments in this notebook set push_to_hub=False,
# so this login may not be needed — confirm before keeping it in the run path.
notebook_login()

In [None]:
# Evaluate, save and log once per epoch; the best checkpoint is restored when
# training finishes (load_best_model_at_end=True).
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the version pinned in requirements.txt.
training_args = TrainingArguments(
    output_dir="trained_model",
    learning_rate=8e-5,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=4,
    num_train_epochs= 20,
    weight_decay=0.001,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy = "epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    lr_scheduler_type = "cosine"
)

# The per-epoch evaluation decides which checkpoint is kept, so it must run on
# the validation split. Using the test split here would select the model on
# test data and make any later test score optimistic. Report final numbers
# separately, e.g. trainer.evaluate(tokenized_datasets["test"]) after training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune the model with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# text = "anbdk@gmail.com"
# text = '''My name is John Adams. I am 25 years old. I am suffering from fever and chest pain. It looks like flu or covid. I am taking Tylenol now. Please give me a call at 978-677-3456. You can also email me at aband@gmail.com. I can speak English and Spanish fluently'''
# text = "Contact me on john@company.com or on 934-323-4432."
# text = "I am John Doe, and I was recently diagnosed with stage 2 pancreatic cancer. I am going to start chemotherapy from next week and will undergo surgery."
# text = "Can I deduct the full value of illiquid stock donation out of ordinary income if I put it into a private foundation? in my taxes"
# text = "John Doe bought gold in cash by paying 600 dollars and opened a savings account at the bank. He paid for it in euros"
# text = "The car deal makes money 3 ways. If you pay in one lump payment. If the payment is greater than what they paid for the car, plus their expenses, they make a profit. They loan you the money. You make payments over months or years, if the total amount you pay is greater than what they paid for the car, plus their expenses, plus their finance expenses they make money."
# text = "That is kind of the point, one of the hopes is that it incentivizes banks to stop storing money and start injecting it into the economy themselves. Compared to the European Central Bank investing directly into the economy the way the US central bank has been doing. (The Federal Reserve buying mortgage backed securities) On a country level, individual European countries have tried this before in recent times with no noticeable effect."
# text = "Pay off your debt. As you witnessed, no 'investment' % is guaranteed. But your debt payments are... so if you have cash, the best way to 'invest' it is to pay off your debt. Since your car is depreciating while your house may be appreciating (don't know but it's possible) you should pay off your car loan first. You're losing money in more than one way on that investment."
# text = "Equity options, at least those traded in the American stock exchanges, actually expire the Saturday after the 3rd Friday of the month. However, the choice to trade or exercise the options must be specified by the 3rd Friday. This is outlined by the CBOE, who oversees the exchange of equity options. Their FAQ regarding option expiration can be found at http://www.cboe.com/LearnCenter/Concepts/Beyond/expiration.aspx."
# text = "For eToro, just like any other brokerage firm, you can lose your entire capital. I suggest that you invest in one or more exchange-traded funds that track major indexes. If not, just put your money in fixed deposit accounts; gain a bit of interest and establish an emergency fund first before investing money that you feel you are able to lose."
# text = "Buy a car. Unless you definitely know you are living in the area for a good long time, avoid buying a house and get a car instead."
# text = "This sound like a very bad idea. If you invest exclusively in silver, your investment is not diversified in any way. This is what I would call risky. Have a look at index funds and ETFs and build a diversified portfolio. It does not take much time, and you don't need to let it do by someone else. They are risky too, but I see 'silver only' as much riskier. You reduce the risk by holding on to the funds for a long time."
# text = "In India, where I live, you can: In addition, housing loans are given priority status as well - bank capital requirements on housing loans is lower than for, say, a corporate loan or a loan against other kinds of collateral. That makes housing loans cheaper as well - you get a home loan at around 10% in India versus 15% against most other assets, and since you can deduct it against tax, the effective interest rate is even lower. Housing in India is unaffordable too, if you're wondering. In a suburb 40 Km away from Delhi, a 2000 sq. foot apartment, about 1500 sq. ft. of carpet area, with no appliances costs about US Dollars 250,000."
# text = "Another form of 'shareholder' activism. You might be able to buy a single share, which it seems would cost around $35, attend the AGM, and ask questions and/or shout or sing and delay proceedings. There would certainly be security guards or police ready to remove protesters at an AGM."
# text = "If I held stock in these companies yesterday, would I have profited by these gains? No. For DZSI, your 5 shares at $1.10 would now be 1 share at $5.50, so you would have the same total amount. For SGY, they closed at $6.95, and opened at $32.80, so your five shares at $6.95 would now be one share at $32.80, so you would have actually lost money (not purely because of the split, but because the 'new' shares are trading lower then the expected 1:5 split price). A split in general does not affect market cap (how much your total shares are worth) but there may be residual effects that cause the market value to fluctuate after a split that affect the price."
# text = "It is a question of how volatile the stock is perceived to be, its beta correlation to the S&P500 or other index. Margin requirements are derived from the Federal Reserve, Self Regulatory Organizations, the exchange itself, the broker you use, and which margining system you are using. So that makes this a loaded question. There are at least three margin systems, before you have your own risk officer in a glass room that doesn't care how leveraged up you get. Brokers primarily don't want to lose money."
# text = "gold up $3.10 at $898.30 an ounce Q: Does the news headline talk about a general event (apart from prices) in the future? No gold is the only place for new money in june Q: Does the news headline talk about a general event (apart from prices) in the past? Yes lbma : industry insiders bullish on gold, forecast of $1585 (radio) Q: Does the news headline compare gold with any other asset? No dec. gold down 20 cents at $1,384.80/oz on globex Q: Does the news headline talk about price in the future? No Gold futures climb to Rs 25,050 per 10 gm Q: Does the news headline talk about price going down? No gold prices to reach $2,000 by march 2012: scotia Q: Does the news headline compare gold with any other asset?"
# text = "jpmorgan chase annual report 31 the following section provides a comparative discussion of jpmorgan chase 2019s consolidated results of operations on a reported basis for the three-year period ended december 31 2007. factors that relate primarily to a single business segment are discussed in more detail within that business segment than they are in this consolidated sec- tion. for a discussion of the critical accounting estimates used by the firm that affect the consolidated results of operations, see pages 96 201398 of this annual report. revenue."
# text = "entergy new orleans, inc. management's financial discussion and analysis net revenue 2008 compared to 2007 net revenue consists of operating revenues net of: 1) fuel, fuel-related expenses, and gas purchased for resale, 2) purchased power expenses, and 3) other regulatory charges. following is an analysis of the change in net revenue comparing 2008 to 2007. amount (in millions). - | amount (in millions) 2007 net revenue | $231.0 volume/weather | 15.5 net gas revenue | 6.6 rider revenue | 3.9 base revenue | -11.3 (11.3) other | 7.0 2008 net revenue | $252.7 the volume/weather variance is due to an increase in electricity usage in the service territory in 2008 compared to the same period in 2007."
# text = "A 28-year-old previously healthy man presented with a 6-week history of palpitations. The symptoms occurred during rest, 2-3 times per week, lasted up to 30 minutes at a time and were associated with dyspnea. Except for a grade 2/6 holosystolic tricuspid regurgitation murmur (best heard at the left sternal border with inspiratory accentuation), physical examination yielded unremarkable findings. An electrocardiogram (ECG) revealed normal sinus rhythm and a Wolff- Parkinson- White pre-excitation pattern (Fig.1: Top), produced by a right-sided accessory pathway."
# Active input for the inference and post-processing cells. Every sample above
# is commented out, so without this assignment `text` is undefined on a fresh
# kernel; swap in any of the commented samples to try a different input.
text = "The COVID pandemic has accelerated the adoption of remote work and digital."

from transformers import pipeline

# NOTE(review): this checkpoint path differs from the output_dir
# ("trained_model") used by the training cell — confirm it points at the run
# that should be served.
classifier = pipeline(task = "token-classification", model= "./bert_fin_med_model_02_29_test_02/checkpoint-440")
# One dict per token; the post-processing cells read its
# entity / score / index / word / start / end fields.
final_lst = classifier(text)

for element in final_lst:
    print(element)

In [None]:
##### #######
# remove punctuations for non email label
# Adjust the '##' Entities
# Adjust the '-' entities
# Adjust the 'I-' entities

# remove entities with low score and '0' label


# adjust emails
# adjust lab values
# adjust back to back words with same entities.

# Financial entity types, written without the B-/I- prefix. A later cell uses
# this list when merging adjacent tokens: a financial label may take over the
# preceding entity if that entity is not itself financial.
fin_label_list = ['Asset','Business','Debt', 'Expense','Currency', 'Financial Institution', 'Financial Services', 'Financial Profession', 'Financial Product']


# Tokens scoring below this are dropped unless they were merged into an
# entity that is already kept.
MIN_TOKEN_SCORE = 0.20
# A token joined by '-' is only glued to the previously kept entity when it
# starts fewer than this many characters after that entity ends.
MAX_DASH_GAP = 10


def _absorb_token(target, token, appended_text):
    """Grow the kept entity `target` with `appended_text` taken from `token`.

    The end offset always moves to the token's end. If the token is scored
    higher than the entity, the entity adopts that score and, unless the token
    is unlabelled ('0'), the token's entity type (keeping its own B-/I- prefix).
    """
    target["word"] = target["word"] + appended_text
    target["end"] = token["end"]
    if target['score'] < token['score']:
        target['score'] = token['score']
        if token['entity'] != '0':
            target['entity'] = target['entity'][0:2] + token['entity'][2:]


# Work on copies so the raw pipeline output (final_lst) is left untouched and
# re-running this cell produces the same result instead of merging twice.
tokens = [dict(token) for token in final_lst]

final_lst1 = []
for position, new_element in enumerate(tokens):
    # Neighbours are looked up explicitly: the first token has no predecessor
    # (index -1 would silently wrap around to the last token).
    previous = tokens[position - 1] if position > 0 else None
    following = tokens[position + 1] if position + 1 < len(tokens) else None
    word = new_element['word']

    # Stray punctuation is dropped unless it is part of an e-mail address.
    if word in ['.', ';', "'"] and new_element["entity"] != 'B-Email':
        continue

    # An unlabelled word piece followed by a '##' continuation: prepend it to
    # the continuation so the next token carries the whole word. This applies
    # to every token that has a successor, including the last pair.
    if following is not None and new_element['entity'] == '0' and following['word'][0:2] == '##':
        following['word'] = word + following['word'][2:]

    # The merge rules below all extend the most recently kept entity, so they
    # can only apply once at least one entity has been kept.
    last_kept = final_lst1[-1] if final_lst1 else None
    previous_is_entity = previous is not None and previous["entity"] != '0'

    if last_kept is not None:
        # '##' word piece directly after a labelled token: glue without a space.
        if word[0:2] == "##" and previous_is_entity:
            _absorb_token(last_kept, new_element, word[2:])
            continue

        # Hyphenated spans ("6-week"): the '-' itself, or the token right after it.
        touches_dash = word[:1] == '-' or (previous is not None and previous["word"][:1] == '-')
        if touches_dash and (new_element['start'] - last_kept["end"]) < MAX_DASH_GAP:
            _absorb_token(last_kept, new_element, word)
            continue

        # I- continuation of a labelled token: join with a space.
        if new_element["entity"][:2] == 'I-' and previous_is_entity:
            _absorb_token(last_kept, new_element, " " + word)
            continue

    if new_element['entity'] == '0' or new_element["score"] < MIN_TOKEN_SCORE:
        continue

    if new_element['entity'] == 'I-Business':
        new_element['entity'] = 'B-Business'
    if word == 'covid':
        # NOTE(review): label_names only defines 'B-Disease'; confirm that
        # 'Disease Disorder' is the intended output label.
        new_element['entity'] = 'B-Disease Disorder'
    final_lst1.append(new_element)

for element in final_lst1:
    print(element)

In [None]:
import string
# Spellings accepted for the lab-value tag. label_names defines 'B-Lab Value';
# the underscore form is kept in case a served checkpoint still emits it.
LAB_VALUE_LABELS = ('B-Lab Value', 'B-Lab_value')


def _extend_entity(target, token, appended_text):
    """Append `appended_text` to `target`, move its end to the token's end and
    keep the higher of the two scores."""
    target["word"] = target["word"] + appended_text
    target["end"] = token["end"]
    if target['score'] < token['score']:
        target['score'] = token['score']


# Work on copies: the B-/I- prefix is stripped from kept entities below, and
# doing that on final_lst1's own dicts would strip it again on every re-run.
tokens = [dict(token) for token in final_lst1]

new_lst = []
for position, element in enumerate(tokens):
    # The first token has no predecessor (index -1 would wrap to the last one).
    previous_word = tokens[position - 1]["word"] if position > 0 else None
    # Every merge rule extends the most recently kept entity, so each rule
    # requires that one exists instead of indexing into an empty list.
    last = new_lst[-1] if new_lst else None
    word = element["word"]
    entity_type = element["entity"][2:]
    same_type = last is not None and entity_type == last["entity"]

    # Left-over '##' word piece of the same type: glue without a space.
    if word[0:2] == "##" and same_type:
        _extend_entity(last, element, word[2:])
        continue

    # Hyphenated span of the same type: the '-' itself or the token after it.
    touches_dash = word[:1] == '-' or (previous_word is not None and previous_word[:1] == '-')
    if touches_dash and same_type:
        _extend_entity(last, element, word)
        continue

    # I- tokens only ever extend a nearby entity of the same type; an I- token
    # that cannot be attached is dropped.
    if element["entity"][0:2] == 'I-':
        if same_type and element['start'] - last["end"] < 5:
            _extend_entity(last, element, " " + word)
        continue

    # E-mail addresses: re-attach the pieces around '@' and '.'.
    if element["entity"] == "B-Email" and last is not None:
        if word in ('@', '.') or previous_word in ('@', '.'):
            _extend_entity(last, element, word)
            continue

    # Lab values: re-attach the pieces around punctuation (e.g. "2/6").
    if element['entity'] in LAB_VALUE_LABELS and last is not None:
        if word in string.punctuation or (previous_word is not None and previous_word in string.punctuation):
            _extend_entity(last, element, word)
            continue

    # Same type, separated from the previous entity by exactly one space.
    if same_type and element['start'] == last["end"] + 1 and text[last["end"]] == ' ':
        _extend_entity(last, element, ' ' + word)
        continue

    # A financial token directly following a non-financial entity takes that
    # entity over (its score is left unchanged).
    if last is not None and element["index"] == last["index"] + 1:
        if entity_type in fin_label_list and last["entity"] not in fin_label_list:
            last["entity"] = entity_type
            last["word"] = last["word"] + " " + word
            last["end"] = element["end"]
            continue

    if word in string.punctuation:
        continue

    # Start a new entity, stored without its B-/I- prefix.
    element["entity"] = entity_type
    new_lst.append(element)

for element in new_lst:
    print(element)

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
# Set copy of the stopword list for constant-time membership tests below.
stopword_lookup = set(english_stopwords)

# Entities reach this cell with the B-/I- prefix already stripped, so the
# phone label is 'Phone Number' (from label_names). The underscore spelling is
# kept in case a served checkpoint still emits it.
PHONE_NUMBER_LABELS = ('Phone Number', 'Phone_Number')

list_of_final_words = []

for element in new_lst:
    # A phone-number span longer than 14 characters is treated as a false positive.
    if element['entity'] in PHONE_NUMBER_LABELS and (element['end']-element['start'] >14):
        continue
    # These types need a higher confidence to be kept.
    if element['entity'] in ['Activity', 'Duration', 'Occupation'] and element['score'] < 0.5:
        continue
    # Trim stopwords from both ends of the entity text; inner ones are kept.
    tokens = element['word'].split()
    while tokens and tokens[0].lower() in stopword_lookup:
        tokens.pop(0)
    while tokens and tokens[-1].lower() in stopword_lookup:
        tokens.pop()
    element['word'] = ' '.join(tokens)
    list_of_final_words.append(element)
    print(element)

In [None]:
# Collect each distinct entity text once (first occurrence wins) together with
# the character offsets of that occurrence; the three lists stay parallel.
words, start_keys, end_keys = [], [], []
for record in list_of_final_words:
    if record["entity"] == '0' or record["word"] in words:
        continue
    words.append(record["word"])
    start_keys.append(record["start"])
    end_keys.append(record["end"])

print(words)

In [None]:
import spacy
nlp = spacy.load('en_core_web_trf')
doc = nlp(text)


def _spacy_entity_record(span):
    """Build an entity dict (entity / word / start / end) from a spaCy span."""
    return {
        "entity": span.label_.title(),
        "word": span.text,
        "start": span.start_char,
        "end": span.end_char,
    }


# Reconcile spaCy's entities with the model's:
#   * same text as a model entity -> spaCy wins for Money, and for Person/Org
#     when the model said Business/History; otherwise the model's entity stays;
#   * shares a start or end offset with a model entity -> skipped;
#   * anything else is added, except CARDINAL / ORDINAL.
for span in doc.ents:
    print(span)
    spacy_label = span.label_.title()
    lowered = span.text.lower()

    # Find the matching record in list_of_final_words itself. `words` cannot be
    # used as an index into it: `words` is de-duplicated and both lists are
    # appended to / deleted from independently, so their positions diverge.
    ind = next(
        (i for i, record in enumerate(list_of_final_words) if record["word"] == lowered),
        None,
    )

    if ind is not None:
        model_label = list_of_final_words[ind]['entity']
        spacy_wins = spacy_label == 'Money' or (
            model_label in ['Business', 'History'] and spacy_label in ['Person', 'Org']
        )
        if spacy_wins:
            del list_of_final_words[ind]
            list_of_final_words.append(_spacy_entity_record(span))
            words.append(span.text)
    elif span.start_char in start_keys or span.end_char in end_keys:
        continue
    elif span.label_ != 'CARDINAL' and span.label_ != 'ORDINAL':
        list_of_final_words.append(_spacy_entity_record(span))
        words.append(span.text)
print(list_of_final_words)