## 1: Setup Environment

In [29]:
# Importing the required libraries for working with spaCy and pandas:
import spacy 
import pandas as pd

## 2: Load the Dataset

In [None]:
# Read the tab-separated stock listing from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='stocks-1.tsv', delimiter='\t')
# Preview the first five rows to see which columns hold the company names and ticker symbols.
df.head(n=5)

Unnamed: 0,Symbol,CompanyName,Industry,MarketCap
0,A,Agilent Technologies,Life Sciences Tools & Services,53.65B
1,AA,Alcoa,Metals & Mining,9.25B
2,AAC,Ares Acquisition,Shell Companies,1.22B
3,AACG,ATA Creativity Global,Diversified Consumer Services,90.35M
4,AADI,Aadi Bioscience,Pharmaceuticals,104.85M


In [12]:
# Show the column labels so the exact names used when extracting values can be confirmed.
df.columns

Index(['Symbol', 'CompanyName', 'Industry', 'MarketCap'], dtype='object')

## 3: Extract Data for Patterns

In [None]:
# Pull the distinct values out of the company-name and ticker-symbol columns;
# both columns are handled the same way, so they are unpacked from one generator.
company_names, stock_symbols = (
    df[column].unique() for column in ('CompanyName', 'Symbol')
)

In [None]:
# Build one EntityRuler pattern dict per value (list comprehensions): 'pattern' is the
# text to look for and 'label' is the entity type given to each match.
# NOTE(review): some tickers are very short (e.g. "A"), so they may also match ordinary
# words in running text and be tagged 'Stock Symbol' — confirm whether that is acceptable.
company_patterns = [
    {'label': 'Company', 'pattern': company}
    for company in company_names
]
stock_symbol_patterns = [
    {'label': 'Stock Symbol', 'pattern': ticker}
    for ticker in stock_symbols
]
# Merge both pattern lists (companies first) so they can be registered in one call.
combined_patterns = [*company_patterns, *stock_symbol_patterns]

## 4: Create an EntityRuler

In [None]:
# Load spaCy's small English pipeline and register an EntityRuler component on it.
# NOTE(review): the EntityRuler name imported here is not used in the visible cells;
# add_pipe below receives the factory name as a string instead.
from spacy.pipeline import EntityRuler
nlp = spacy.load('en_core_web_sm')
# before='ner' inserts the ruler ahead of the statistical 'ner' component rather than
# appending it to the end of the pipeline, so the rule-based matches are set first.
# NOTE(review): presumably a ruler placed after 'ner' would still run, but would not
# replace entities 'ner' had already assigned unless configured to overwrite them —
# confirm against the EntityRuler documentation.
entity_ruler = nlp.add_pipe('entity_ruler', before='ner')

In [None]:
# Register every company and stock-symbol pattern with the ruler in a single call.
# NOTE(review): re-running this cell without re-creating `nlp` presumably registers
# the same patterns a second time — confirm.
entity_ruler.add_patterns(combined_patterns)

## 5: Test the EntityRuler

In [22]:
from spacy import displacy

In [30]:
# Run the pipeline (including the custom entity ruler) on a sample market report in
# which each company name is followed by its ticker in parentheses.

paragraph_1 = nlp(
    """Helmerich & Payne (HP) saw its stock rise by 1.5%, fueled by optimistic forecasts in the Energy Equipment & Services sector. In contrast, Check-Cap (CHEK) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions.

Meanwhile, Vallon Pharmaceuticals (VLON) gained 0.8% after strong quarterly earnings, outperforming its peers in the Biotechnology space. Sequans Communications (SQNS) also recorded a modest increase of 0.5%, reflecting investors' confidence in its ability to navigate challenges in the Semiconductors & Semiconductor Equipment industry.""")
# Render the recognised entities inline so the labels can be inspected by eye.
displacy.render(paragraph_1, style="ent", jupyter=True)

# Author's observation: the rendered output for this paragraph looked fully correct.
# NOTE(review): this is a visual check only; nothing in the cell asserts the expected labels.

In [31]:
# Second sample report, with a different set of companies and tickers.
paragraph_2 = nlp(
    """Aemetis (AMTX) saw its stock rise by 1.5%, fueled by optimistic forecasts in the Oil, Gas & Consumable Fuels sector. In contrast, Ferro Corporation (FOE) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions.

Meanwhile, RingCentral (RNG) gained 0.8% after strong quarterly earnings, outperforming its peers in the Software space. ACI Worldwide (ACIW) also recorded a modest increase of 0.5%, reflecting investors' confidence in its ability to navigate challenges in the Software industry.""")
# Render the recognised entities inline so the labels can be inspected by eye.
displacy.render(paragraph_2, style="ent", jupyter=True)

# Author's observation: the rendered output for this paragraph showed no mistakes.
# NOTE(review): this is a visual check only; nothing in the cell asserts the expected labels.

In [32]:
# Third sample report; here the first company name appears mid-sentence.
paragraph_3 = nlp(
    """On a mixed trading day, Par Pacific Holdings (PARR) saw its stock rise by 1.5%, fueled by optimistic forecasts in the Oil, Gas & Consumable Fuels sector. In contrast, Nano Dimension (NNDM) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions.

Meanwhile, Beyond Meat (BYND) gained 0.8% after strong quarterly earnings, outperforming its peers in the Food Products space. Apollo Investment (AINV) also recorded a modest increase of 0.5%, reflecting investors' confidence in its ability to navigate challenges in the Capital Markets industry.""")
# Render the recognised entities inline so the labels can be inspected by eye.
displacy.render(paragraph_3, style="ent", jupyter=True)

# Author's observation: the rendered output for this paragraph showed no mistakes either.
# NOTE(review): this is a visual check only; nothing in the cell asserts the expected labels.