## 1. Setup Environment

In [None]:
# import required data sets

#import pandas
import pandas as pd
#import spacy and displacy from spacy
import spacy
from spacy import displacy
#set up nlp
nlp = spacy.load('en_core_web_sm')

## 2. Load the Dataset

In [42]:
#read stock set
stock_set = pd.read_csv("stocks-1.tsv",sep='\t')

#put stock set in a data frame
df_stock = pd.DataFrame(stock_set)
display(df_stock)

Unnamed: 0,Symbol,CompanyName,Industry,MarketCap
0,A,Agilent Technologies,Life Sciences Tools & Services,53.65B
1,AA,Alcoa,Metals & Mining,9.25B
2,AAC,Ares Acquisition,Shell Companies,1.22B
3,AACG,ATA Creativity Global,Diversified Consumer Services,90.35M
4,AADI,Aadi Bioscience,Pharmaceuticals,104.85M
...,...,...,...,...
5874,ZWRK,Z-Work Acquisition,Shell Companies,278.88M
5875,ZY,Zymergen,Chemicals,1.31B
5876,ZYME,Zymeworks,Biotechnology,1.50B
5877,ZYNE,Zynerba Pharmaceuticals,Pharmaceuticals,184.39M


## 3. Extract Data for Patterns

In [49]:
# create empty lists
symbol = []
company_name = []

#goes through the dataframe and relabel it to label and pattern for symbol
for x in df_stock["Symbol"]:
    symbol.append({"label":"symbol", "pattern":x})

#goes through the dataframe and relabel it to label and pattern for company name
for x in df_stock["CompanyName"]:
    company_name.append({"label":"companyname", "pattern":x})

#print out to see
print(symbol)
print(company_name)

[{'label': 'symbol', 'pattern': 'A'}, {'label': 'symbol', 'pattern': 'AA'}, {'label': 'symbol', 'pattern': 'AAC'}, {'label': 'symbol', 'pattern': 'AACG'}, {'label': 'symbol', 'pattern': 'AADI'}, {'label': 'symbol', 'pattern': 'AAIC'}, {'label': 'symbol', 'pattern': 'AAL'}, {'label': 'symbol', 'pattern': 'AAMC'}, {'label': 'symbol', 'pattern': 'AAME'}, {'label': 'symbol', 'pattern': 'AAN'}, {'label': 'symbol', 'pattern': 'AAOI'}, {'label': 'symbol', 'pattern': 'AAON'}, {'label': 'symbol', 'pattern': 'AAP'}, {'label': 'symbol', 'pattern': 'AAPL'}, {'label': 'symbol', 'pattern': 'AAQC'}, {'label': 'symbol', 'pattern': 'AAT'}, {'label': 'symbol', 'pattern': 'AATC'}, {'label': 'symbol', 'pattern': 'AAU'}, {'label': 'symbol', 'pattern': 'AAWW'}, {'label': 'symbol', 'pattern': 'AB'}, {'label': 'symbol', 'pattern': 'ABB'}, {'label': 'symbol', 'pattern': 'ABBV'}, {'label': 'symbol', 'pattern': 'ABC'}, {'label': 'symbol', 'pattern': 'ABCB'}, {'label': 'symbol', 'pattern': 'ABCL'}, {'label': 'sym

## 4. Create an EntityRuler

In [50]:
#make sure there isn't an eror in putting in ruler
if "ner" in nlp.pipe_names:
    # If entity_ruler already exists, simply add patterns to it.
    try:
        ruler = nlp.get_pipe("entity_ruler")
    except Exception:
        ruler = nlp.add_pipe("entity_ruler", before="ner")
    ruler.add_patterns(symbol)
else:
    # If the NER component does not exist, add both the EntityRuler and the NER component.
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(symbol)
    ner = nlp.add_pipe("ner")


In [51]:
# check pipe names
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'entity_ruler',
 'ner']

## 5. Test the EntityRuler

In [52]:
#tests correct implementation with doc1
doc1 = nlp("Helmerich & Payne (HP) saw its stock rise by 1.5%, fueled by optimistic forecasts in the Energy Equipment & Services sector. In contrast, Check-Cap (CHEK) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions. Meanwhile, Vallon Pharmaceuticals (VLON) gained 0.8% after strong quarterly earnings, outperforming its peers in the Biotechnology space. Sequans Communications (SQNS) also recorded a modest increase of 0.5%, reflecting investors' confidence in its ability to navigate challenges in the Semiconductors & Semiconductor Equipment industry.")
displacy.render(doc1, style="ent", jupyter=True)

In [53]:
#tests correct implementation with doc2
doc2 = nlp("Aemetis (AMTX) saw its stock rise by 1.5%, fueled by optimistic forecasts in the Oil, Gas & Consumable Fuels sector. In contrast, Ferro Corporation (FOE) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions. Meanwhile, RingCentral (RNG) gained 0.8% after strong quarterly earnings, outperforming its peers in the Software space. ACI Worldwide (ACIW) also recorded a modest increase of 0.5%, reflecting investors' confidence in its ability to navigate challenges in the Software industry.")
displacy.render(doc2, style="ent", jupyter=True)

In [54]:
#tests correct implementation with doc3
doc3= nlp("On a mixed trading day, Par Pacific Holdings (PARR) saw its stock rise by 1.5%, fueled by optimistic forecasts in the Oil, Gas & Consumable Fuels sector. In contrast, Nano Dimension (NNDM) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions.Meanwhile, Beyond Meat (BYND) gained 0.8% after strong quarterly earnings, outperforming its peers in the Food Products space. Apollo Investment (AINV) also recorded a modest increase of 0.5%, reflecting investors' confidence in its ability to navigate challenges in the Capital Markets industry.")
displacy.render(doc3, style="ent", jupyter=True)