# Project : Web scrapping, knowledge base construction

## Part 1 : Web scrapping and knowledge base construction

In [None]:
from datasets import load_dataset
import nltk
import string
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import inflect

from nltk.stem.porter import PorterStemmer

from nltk.stem import WordNetLemmatizer

import sklearn_crfsuite
from sklearn_crfsuite import metrics

# nltk.download()

import sklearn_crfsuite
from sklearn_crfsuite import metrics
from datasets import load_dataset

import spacy
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support

import pandas as pd

import json

### Environment Setup

#### Datasets
We laod the CoNLL-2003 from Hugging Face

In [85]:
dataset = load_dataset("conll2003", trust_remote_code=True)

# Access the training, validation, and test sets
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

# Print the first example from the training set
print(train_dataset[0])

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


### Task 1 : Model for NER

#### 1. Text cleaning and prepocessing

In [86]:
def text_lowercase(text):
    return text.lower()

def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

p = inflect.engine()
# convert number into words
def convert_number(text):
    # split string into list of words
    temp_str = text.split()
    # initialise empty list
    new_string = []

    for word in temp_str:
        # if word is a digit, convert the digit
        # to numbers and append into the new_string list
        if word.isdigit():
            temp = p.number_to_words(word)
            new_string.append(temp)

        # append the word as it is
        else:
            new_string.append(word)

    # join the words of new_string to form a string
    temp_str = ' '.join(new_string)
    return temp_str

def replace_non_alphabetic_with_whitespace(text):
    modified_text = re.sub(r'[^a-zA-Z]', ' ', text)
    
    return modified_text

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

def remove_whitespace(text):
    return  " ".join(text.split())

nltk.download('stopwords')

def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text


stemmer = PorterStemmer()

def stem_words(text):
    word_tokens = word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return stems


nltk.download('punkt')
nltk.download('wordnet') 
lemmatizer = WordNetLemmatizer()

def lemma_words(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word) for word in word_tokens]
    return lemmas

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\auria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\auria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\auria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [87]:
def preprocess_pipeline(example):
    text = " ".join(example['tokens'])

    text = text_lowercase(text)
    text = convert_number(text)
    text = replace_non_alphabetic_with_whitespace(text)
    text = remove_punctuation(text)
    text = remove_whitespace(text)

    # Convertir la liste en chaîne après avoir supprimé les stopwords
    text = " ".join(remove_stopwords(text))
    text = " ".join(stem_words(text))
    text = " ".join(lemma_words(text))

    processed_example = {'tokens': text.split(), 'ner_tags': example['ner_tags']}
    return processed_example

In [88]:
train_processed = train_dataset.map(preprocess_pipeline)
validation_processed = validation_dataset.map(preprocess_pipeline)
test_processed = test_dataset.map(preprocess_pipeline)

print(train_processed[:5])

{'id': ['0', '1', '2', '3', '4'], 'tokens': [['eu', 'reject', 'german', 'call', 'boycott', 'british', 'lamb'], ['peter', 'blackburn'], ['brussel'], ['european', 'commiss', 'said', 'thursday', 'disagre', 'german', 'advic', 'consum', 'shun', 'british', 'lamb', 'scientist', 'determin', 'whether', 'mad', 'cow', 'diseas', 'transmit', 'sheep'], ['germani', 'repres', 'european', 'union', 'veterinari', 'committe', 'werner', 'zwingmann', 'said', 'wednesday', 'consum', 'buy', 'sheepmeat', 'countri', 'britain', 'scientif', 'advic', 'clearer']], 'pos_tags': [[22, 42, 16, 21, 35, 37, 16, 21, 7], [22, 22], [22, 11], [12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 35, 24, 35, 37, 16, 21, 15, 24, 41, 15, 16, 21, 21, 20, 37, 40, 35, 21, 7], [22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 22, 38, 15, 22, 24, 20, 37, 21, 15, 24, 16, 15, 22, 15, 12, 16, 21, 38, 17, 7]], 'chunk_tags': [[11, 21, 11, 12, 21, 22, 11, 12, 0], [11, 12], [11, 12], [11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 13, 11, 21, 22, 11, 12, 17

#### 2. Named entity recognition (NER)

##### Conditional Random Field (CRF) with sklearn_crfsuite

In [89]:
# Define features and labels for training
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.istitle()': word.istitle(),
        'word.isupper()': word.isupper(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        if isinstance(word1, str):  # Vérifiez que word1 est une chaîne de caractères
            features.update({
                '+1:word.lower()': word1.lower(),
                '+1:word.istitle()': word1.istitle(),
                '+1:word.isupper()': word1.isupper(),
                '+1:postag': postag1,
                '+1:postag[:2]': postag1[:2],
            })
        else:
            print(f"Warning: word1 is not a string at index {i+1}: {word1}")
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

pos_tag_dict = dataset["train"].features["pos_tags"].feature.int2str
ner_tag_dict = dataset["train"].features["ner_tags"].feature.int2str

# Convert dataset into structured format with string POS and NER tags
train_sents = [
    list(zip(tokens, map(pos_tag_dict, pos_tags), map(ner_tag_dict, ner_tags)))
    for tokens, pos_tags, ner_tags in zip(train_dataset["tokens"], train_dataset["pos_tags"], train_dataset["ner_tags"])
]

test_sents = [
    list(zip(tokens, map(pos_tag_dict, pos_tags), map(ner_tag_dict, ner_tags)))
    for tokens, pos_tags, ner_tags in zip(test_dataset["tokens"], test_dataset["pos_tags"], test_dataset["ner_tags"])
]

X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]


# Train the CRF model
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False
)
crf.fit(X_train, y_train)


# Evaluate the model
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B-LOC       0.86      0.80      0.83      1668
      B-MISC       0.83      0.75      0.79       702
       B-ORG       0.77      0.73      0.75      1661
       B-PER       0.83      0.85      0.84      1617
       I-LOC       0.82      0.66      0.73       257
      I-MISC       0.71      0.68      0.69       216
       I-ORG       0.69      0.76      0.72       835
       I-PER       0.87      0.95      0.91      1156
           O       0.99      0.99      0.99     38323

    accuracy                           0.96     46435
   macro avg       0.82      0.80      0.80     46435
weighted avg       0.96      0.96      0.96     46435



##### spaCy

In [90]:
# Load spaCy's pre-trained NER model
nlp = spacy.load("./best_ner_model")

# Example text
text = "Apple was founded by Steve Jobs."

# Process the text with spaCy
doc = nlp(text)

# Extract named entities
entities = [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents]
print("Extracted Entities:", entities)



Extracted Entities: [('Apple', 'ORG', 0, 5), ('Steve Jobs', 'PER', 21, 31)]


##### Comparition of the performances

###### CRF model

In [91]:
from sklearn_crfsuite import metrics

# Calcul des métriques
precision_crf = metrics.flat_precision_score(y_test, y_pred, average='weighted')
recall_crf = metrics.flat_recall_score(y_test, y_pred, average='weighted')
f1_score_crf = metrics.flat_f1_score(y_test, y_pred, average='weighted')

print(f"CRF Model - Precision: {precision_crf:.4f}, Recall: {recall_crf:.4f}, F1-score: {f1_score_crf:.4f}")

CRF Model - Precision: 0.9557, Recall: 0.9559, F1-score: 0.9556


###### spaCy

In [92]:
true_entities = []
pred_entities = []

for text, true_labels in zip(test_dataset["tokens"], test_dataset["ner_tags"]):
    text_str = " ".join(text)
    doc = nlp(text_str)
    
    # Convert true labels to named entity format
    true_labels = [ner_tag_dict(label) for label in true_labels]  
    true_entities.append(true_labels)
    
    # Initialiser les labels prédits avec 'O' (Outside)
    predicted_labels = ["O"] * len(text)
    
    # Associer les entités détectées aux tokens
    for ent in doc.ents:
        ent_text = ent.text.split()  # Séparer en tokens
        for token in ent_text:
            if token in text:  # Vérifier si le token est bien dans le texte d'origine
                idx = text.index(token)  # Trouver l'index du token
                predicted_labels[idx] = ent.label_  # Assigner l'entité
        
    pred_entities.append(predicted_labels)

precision_spaCy, recall_spaCy, f1_spaCy, _ = precision_recall_fscore_support(
    [label for sent in true_entities for label in sent], 
    [label for sent in pred_entities for label in sent], 
    average='weighted'
)

print(f"spaCy NER Model - Precision: {precision_spaCy:.4f}, Recall: {recall_spaCy:.4f}, F1-score: {f1_spaCy:.4f}")

spaCy NER Model - Precision: 0.8090, Recall: 0.8168, F1-score: 0.8129


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [93]:
metrics_df = pd.DataFrame({
    'Metric': ['Precision', 'Recall', 'F1-Score'],
    'spaCy': [precision_spaCy, recall_spaCy, f1_spaCy],
    'CRF': [precision_crf, recall_crf, f1_score_crf]
})

metrics_df

Unnamed: 0,Metric,spaCy,CRF
0,Precision,0.808966,0.955725
1,Recall,0.816776,0.955938
2,F1-Score,0.812852,0.95562


We can see that the CRF model has a better score for these 3 metrics, which means that CRF performs better than spaCy overall for the named entity recognition task on our dataset.

##### Saving the extracted entities along their positions

In [96]:
import json

# Extraire les entités prédites par CRF
crf_entities = []
for sent, labels in zip(test_sents, y_pred):
    for i, label in enumerate(labels):
        if label != 'O':  # Exclure les tokens non annotés
            crf_entities.append({
                "text": sent[i][0],
                "entity": label,
                "position": i
            })

# Sauvegarde en JSON
with open("crf_entities.json", "w") as f:
    json.dump(crf_entities, f, indent=4)

print("Entities extracted from CRF model saved in crf_entities.json")

Entities extracted from CRF model saved in crf_entities.json


In [97]:
spacy_entities = []

for text in test_dataset["tokens"]:
    sentence = " ".join(text)
    doc = nlp(sentence)
    for ent in doc.ents:
        spacy_entities.append({
            "text": ent.text,
            "entity": ent.label_,
            "start": ent.start_char,
            "end": ent.end_char
        })

# Sauvegarde en JSON
with open("spacy_entities.json", "w") as f:
    json.dump(spacy_entities, f, indent=4)

print("Entities extracted from spaCy model saved in spacy_entities.json")

Entities extracted from spaCy model saved in spacy_entities.json


#### 3. Relation Extraction (RE)

#### 4. Knowledge graph building

### Task 2 : Pipeline for knowledge graph construction

#### 1. Fetch news articles

#### 2. Use methods from Task 1