## Code Implementation of Rule-Based Approaches for NER

In [1]:
import re

# A courtesy title followed by a single space and one capitalised word; only
# the word after the title is captured as the entity text.
_PERSON_PATTERN = re.compile(r'\b(?:Mr\.|Mrs\.|Ms\.|Dr\.) ([A-Z][a-z]+)\b')

# English month names and their common abbreviations. The written-date rule
# is anchored on these so that an arbitrary word followed by "<number>, <year>"
# (e.g. "Room 12, 2023") is not mistaken for a date.
_MONTH = (
    r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|'
    r'Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|'
    r'Nov(?:ember)?|Dec(?:ember)?)'
)

# Two date shapes: numeric "M/D/YYYY" and written "Month D, YYYY".
# IGNORECASE keeps lower- and upper-case month spellings matching; every group
# is non-capturing so that findall() yields the full matched text.
_DATE_PATTERN = re.compile(
    r'\b(?:\d{1,2}/\d{1,2}/\d{4}|' + _MONTH + r'\.?\s\d{1,2},\s\d{4})\b',
    re.IGNORECASE,
)


def rule_based_ner(text):
    """Extract PERSON and DATE entities from ``text`` using regular expressions.

    Rules:
      * PERSON - the capitalised word that directly follows one of the titles
        ``Mr.``, ``Mrs.``, ``Ms.`` or ``Dr.`` (the title itself is not part of
        the returned entity).
      * DATE - either a numeric date such as ``01/15/2023`` or a written date
        such as ``January 15, 2023`` / ``Jan. 15, 2023``. The word before the
        day must be an English month name or abbreviation.

    Args:
        text: The string to scan. An empty string or ``None`` yields no entities.

    Returns:
        A list of ``(entity_text, entity_type)`` tuples. All PERSON entities
        come first, in order of appearance, followed by all DATE entities in
        order of appearance.
    """
    # Nothing to scan: return early instead of handing None to the regex engine.
    if not text:
        return []

    entities = [(person, 'PERSON') for person in _PERSON_PATTERN.findall(text)]
    entities.extend((date, 'DATE') for date in _DATE_PATTERN.findall(text))
    return entities

# Sample sentence containing two titled names and one numeric date
example_text = "Mr. Ahmed and Mr. Mohamed visited Cairo on 01/15/2023."

# Run the rule-based tagger over the sample sentence
ner_results = rule_based_ner(example_text)

# Print each (entity, type) pair on its own line
for pair in ner_results:
    entity, entity_type = pair
    print(f"Entity: {entity}, Type: {entity_type}")

Entity: Ahmed, Type: PERSON
Entity: Mohamed, Type: PERSON
Entity: 01/15/2023, Type: DATE


## Code Implementation of NER With Decision Trees

In [2]:
import spacy
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import conll2002

# Fetch the CoNLL-2002 corpus through NLTK (needed before the corpus reader can be used)
nltk.download('conll2002')

# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced again anywhere in this cell.
nlp = spacy.load("en_core_web_sm")

# Read the 'esp.train' split as sentences of (token, POS tag, IOB label) triples
data = conll2002.iob_sents('esp.train')

# Flatten the sentences into one (word, label) pair per token; the POS tag is
# dropped and every token becomes an independent training example.
corpus = [
    (token, label)
    for sentence in data
    for token, pos, label in sentence
]

# Separate the words (inputs) from their entity labels (targets)
X, y = zip(*corpus)

# Hold out 20% of the tokens for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bag-of-words features feeding a decision tree.
# NOTE(review): with its default settings CountVectorizer lowercases its input
# and only keeps tokens of two or more alphanumeric characters, so
# capitalisation is discarded and single-character or punctuation tokens end
# up with an all-zero feature vector.
vectorizer = CountVectorizer()
classifier = DecisionTreeClassifier(random_state=42)
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', classifier),
])

# Fit the vectorizer and the tree on the training tokens
pipeline.fit(X_train, y_train)

# Predict a label for every held-out token
y_pred = pipeline.predict(X_test)

# Per-label precision / recall / F1 on the held-out tokens
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Tag a new sentence: it is split on whitespace and each word is classified
# on its own, without any surrounding context.
new_text = "Microsoft is launching a new product"
new_tokens = new_text.split()
predicted_entities = pipeline.predict(new_tokens)
print(f"Predicted Entities for '{new_text}': {predicted_entities}")

[nltk_data] Downloading package conll2002 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2002.zip.


Classification Report:
              precision    recall  f1-score   support

       B-LOC       0.68      0.74      0.71       983
      B-MISC       0.60      0.30      0.40       475
       B-ORG       0.81      0.74      0.77      1492
       B-PER       0.69      0.63      0.66       828
       I-LOC       0.63      0.24      0.34       385
      I-MISC       0.42      0.13      0.20       645
       I-ORG       0.55      0.17      0.26      1009
       I-PER       0.65      0.38      0.48       790
           O       0.95      0.99      0.97     46336

    accuracy                           0.93     52943
   macro avg       0.66      0.48      0.53     52943
weighted avg       0.91      0.93      0.91     52943

Predicted Entities for 'Microsoft is launching a new product': ['B-ORG' 'O' 'O' 'O' 'B-MISC' 'O']
