# Task3: Named Entity Detection

Preprocess the following text dataset for a Named Entity Recognition (NER) task by performing NLP techniques such as tokenization, sentence segmentation, and annotating each token with the appropriate BIO (Beginning, Inside, Outside) tags according to the entities present (PERSON, ORGANIZATION, LOCATION, MISC)

Dataset: https://www.kaggle.com/datasets/alaakhaled/conll003-englishversion/data?select=test.txt





In [1]:
from IPython.display import clear_output

In [None]:
!pip install sklearn-crfsuite
clear_output()

In [3]:
!pip install opendatasets
!pip install kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download alaakhaled/conll003-englishversion
!unzip /content/conll003-englishversion.zip
clear_output()

In [24]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from nltk.corpus.reader import ConllCorpusReader

In [6]:
path = '/content/'

train = ConllCorpusReader(path, 'train.txt', ['words', 'pos', 'chunk', 'ne'])
valid = ConllCorpusReader(path, 'valid.txt', ['words', 'pos', 'chunk', 'ne'])
test = ConllCorpusReader(path, 'test.txt', ['words', 'pos', 'chunk', 'ne'])

In [10]:
def read_conll_data(file_path):
    sentences = []
    sentence = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.strip():
                word, pos, chunk, ne = line.strip().split()
                sentence.append((word, pos, chunk, ne))
            else:
                sentences.append(sentence)
                sentence = []
        if sentence:
            sentences.append(sentence)
    return sentences

train_data = read_conll_data('train.txt')
valid_data = read_conll_data('valid.txt')
test_data = read_conll_data('test.txt')


In [12]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

In [None]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, chunk, label in sent]

def sent2tokens(sent):
    return [token for token, postag, chunk, label in sent]

In [None]:
X_train = [sent2features(s) for s in train_data]
y_train = [sent2labels(s) for s in train_data]
X_valid = [sent2features(s) for s in valid_data]
y_valid = [sent2labels(s) for s in valid_data]
X_test = [sent2features(s) for s in test_data]
y_test = [sent2labels(s) for s in test_data]

In [14]:

crf_model = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

crf_model.fit(X_train, y_train)

y_pred = crf_model.predict(X_test)

In [17]:
labels = list(crf_model.classes_)
labels.remove('O')
labels

['O', 'B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']

In [15]:
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-LOC      0.851     0.814     0.832      1668
       I-LOC      0.744     0.646     0.692       257
      B-MISC      0.817     0.764     0.789       702
      I-MISC      0.702     0.667     0.684       216
       B-ORG      0.772     0.728     0.749      1661
       I-ORG      0.684     0.734     0.708       835
       B-PER      0.828     0.851     0.839      1617
       I-PER      0.866     0.949     0.905      1156

   micro avg      0.805     0.801     0.803      8112
   macro avg      0.783     0.769     0.775      8112
weighted avg      0.805     0.801     0.802      8112



In [23]:
# Sample sentence for prediction
input_sentence = 'EU rejects German call to boycott British lamb.'
tokens = input_sentence.split()

# Preprocess the sample sentence
sample_features = sent2features(tokens)

# Predict using the trained CRF model
sample_prediction = crf_model.predict([sample_features])[0]

# Display the results
for word, label in zip(tokens, sample_prediction):
    print(f"{word}: {label}")


EU: O
rejects: O
German: O
call: O
to: O
boycott: O
British: O
lamb.: O
