# **Install Dependencies**

In [19]:
# Install dependencies
!pip install -q spacy pandas
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and install

# **Google Drive (Mount Drive)**

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Load and Parse Dataset**

In [None]:
import re
import pandas as pd

In [None]:
def parse_conll(file_path, num_sentences=3):
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as f:
        sentence = []
        count = 0
        for line in f:
            line = line.strip()
            if line.startswith('-DOCSTART-') or not line:
                if sentence:
                    sentences.append(' '.join(sentence))
                    count += 1
                    sentence = []
                if count >= num_sentences:
                    break
                continue
            parts = line.split()
            if len(parts) >= 4:  # Word, POS, Chunk, NER
                word = parts[0]
                sentence.append(word)
        if sentence:
            sentences.append(' '.join(sentence))
    return sentences

In [None]:
# Change this path to your Drive folder
train_path = '/content/drive/MyDrive/conll_dataset/train.txt'

In [None]:
# Load samples
samples = parse_conll(train_path, num_sentences=3)

In [None]:
# Display samples for testing
for idx, text in enumerate(samples):
    print(f"\n=== Sample {idx+1} ===\n{text}\n")


=== Sample 1 ===
EU rejects German call to boycott British lamb .


=== Sample 2 ===
Peter Blackburn


=== Sample 3 ===
BRUSSELS 1996-08-22



# **NER (Define NER Functions)**

In [None]:
import spacy
from spacy import displacy

# Rule-based NER (simple patterns)
def rule_based_ner(text):
    entities = []
    # Capitalized sequences for PER/ORG
    cap_pattern = r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b'
    for match in re.finditer(cap_pattern, text):
        entity = match.group(0)
        if len(entity.split()) > 1 or entity in ['EU', 'German', 'British']:
            entities.append((entity, 'ORG' if 'Corp' in entity else 'PER'))

    # Locations keywords
    loc_keywords = ['London', 'Germany', 'Britain', 'Europe']
    for loc in loc_keywords:
        if loc in text:
            entities.append((loc, 'LOC'))

    entities = list(set(entities))  # Remove duplicates
    return entities

# SpaCy NER
def spacy_ner(text, model_name):
    nlp = spacy.load(model_name)
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities, doc

# Highlight entities in text (with colors)
def highlight_entities(text, entities):
    for entity, label in sorted(entities, key=lambda x: len(x[0]), reverse=True):
        color = '\033[92m' if label in ['PER', 'PERSON'] else '\033[94m' if label in ['ORG'] else '\033[93m'
        text = re.sub(re.escape(entity), f"{color}{entity} ({label})\033[0m", text)
    return text

# **Process Samples and Compare**

In [None]:
# Process each sample
for idx, text in enumerate(samples):
    print(f"\n=== NER for Sample {idx+1} ===\n")

    # Rule-based
    rule_entities = rule_based_ner(text)
    print("Rule-based Entities:", rule_entities)
    print("Highlighted (Rule-based):\n", highlight_entities(text, rule_entities), "\n")

    # en_core_web_sm
    sm_entities, sm_doc = spacy_ner(text, 'en_core_web_sm')
    print("en_core_web_sm Entities:", sm_entities)
    print("Highlighted (sm):\n", highlight_entities(text, sm_entities), "\n")
    displacy.render(sm_doc, style='ent', jupyter=True)  # Visualize

    # en_core_web_lg
    lg_entities, lg_doc = spacy_ner(text, 'en_core_web_lg')
    print("en_core_web_lg Entities:", lg_entities)
    print("Highlighted (lg):\n", highlight_entities(text, lg_entities), "\n")
    displacy.render(lg_doc, style='ent', jupyter=True)  # Visualize

    # Comparison
    print("Comparison:")
    only_sm = set(sm_entities) - set(lg_entities)
    only_lg = set(lg_entities) - set(sm_entities)
    common = set(sm_entities) & set(lg_entities)

    # Convert sets to lists of strings
    only_sm_list = [str(x) for x in only_sm]
    only_lg_list = [str(x) for x in only_lg]
    common_list = [str(x) for x in common]

    # Find the maximum length
    max_len = max(len(only_sm_list), len(only_lg_list), len(common_list))

    # Pad shorter lists with None
    only_sm_padded = only_sm_list + [None] * (max_len - len(only_sm_list))
    only_lg_padded = only_lg_list + [None] * (max_len - len(only_lg_list))
    common_padded = common_list + [None] * (max_len - len(common_list))

    # Create DataFrame with padded lists
    df_comparison = pd.DataFrame({
        'Only in sm': only_sm_padded,
        'Only in lg': only_lg_padded,
        'Common': common_padded
    })
    display(df_comparison)  # Show table in Colab


=== NER for Sample 1 ===

Rule-based Entities: [('German', 'PER'), ('British', 'PER')]
Highlighted (Rule-based):
 EU rejects [92mGerman (PER)[0m call to boycott [92mBritish (PER)[0m lamb . 

en_core_web_sm Entities: [('EU', 'ORG'), ('German', 'NORP'), ('British', 'NORP')]
Highlighted (sm):
 [94mEU (ORG)[0m rejects [93mGerman (NORP)[0m call to boycott [93mBritish (NORP)[0m lamb . 



en_core_web_lg Entities: [('EU', 'ORG'), ('German', 'NORP'), ('British', 'NORP')]
Highlighted (lg):
 [94mEU (ORG)[0m rejects [93mGerman (NORP)[0m call to boycott [93mBritish (NORP)[0m lamb . 



Comparison:


Unnamed: 0,Only in sm,Only in lg,Common
0,,,"('EU', 'ORG')"
1,,,"('German', 'NORP')"
2,,,"('British', 'NORP')"



=== NER for Sample 2 ===

Rule-based Entities: [('Peter Blackburn', 'PER')]
Highlighted (Rule-based):
 [92mPeter Blackburn (PER)[0m 

en_core_web_sm Entities: [('Peter Blackburn', 'PERSON')]
Highlighted (sm):
 [92mPeter Blackburn (PERSON)[0m 



en_core_web_lg Entities: [('Peter Blackburn', 'PERSON')]
Highlighted (lg):
 [92mPeter Blackburn (PERSON)[0m 



Comparison:


Unnamed: 0,Only in sm,Only in lg,Common
0,,,"('Peter Blackburn', 'PERSON')"



=== NER for Sample 3 ===

Rule-based Entities: []
Highlighted (Rule-based):
 BRUSSELS 1996-08-22 

en_core_web_sm Entities: [('BRUSSELS', 'GPE'), ('1996-08-22', 'DATE')]
Highlighted (sm):
 [93mBRUSSELS (GPE)[0m [93m1996-08-22 (DATE)[0m 



en_core_web_lg Entities: [('BRUSSELS', 'GPE'), ('1996-08-22', 'DATE')]
Highlighted (lg):
 [93mBRUSSELS (GPE)[0m [93m1996-08-22 (DATE)[0m 



Comparison:


Unnamed: 0,Only in sm,Only in lg,Common
0,,,"('BRUSSELS', 'GPE')"
1,,,"('1996-08-22', 'DATE')"
