# Project : Web scraping, knowledge base construction

## Part 1 : Web scraping and knowledge base construction

In [17]:
from datasets import load_dataset
import nltk
import string
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import inflect

from nltk.stem.porter import PorterStemmer

from nltk.stem import WordNetLemmatizer

import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Fetch only the NLTK data packages this notebook uses. A bare nltk.download()
# opens the interactive downloader, which stalls an unattended
# "Restart Kernel -> Run All".
for nltk_resource in ("stopwords", "punkt", "wordnet"):
    nltk.download(nltk_resource, quiet=True)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### Environment Setup

#### Datasets
We load the CoNLL-2003 dataset from Hugging Face.

In [18]:
# Load the CoNLL-2003 dataset through the Hugging Face `datasets` loader.
dataset = load_dataset("conll2003", trust_remote_code=True)

# Bind each split to its own name for the cells that follow.
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

# Sanity check: show the first training example.
print(train_dataset[0])

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


### Task 1 : Model for NER

#### 1. Text cleaning and preprocessing

In [19]:
def text_lowercase(text):
    """Return a lowercase copy of ``text`` (the result of ``text.lower()``)."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Delete every run of digits from ``text`` and return the remaining string."""
    return re.sub(r'\d+', '', text)

# Module-level inflect engine; convert_number() calls its number_to_words method.
p = inflect.engine()
# Spell out all-digit words as number words.
def convert_number(text):
    """Replace every all-digit word of ``text`` with its spelled-out form.

    ``text`` is split on whitespace. Words for which ``str.isdigit()`` is true
    are passed to ``p.number_to_words``; every other word is kept unchanged.
    The words are re-joined with single spaces, so runs of whitespace in the
    input are collapsed and leading/trailing whitespace is dropped.
    """
    return ' '.join(
        p.number_to_words(word) if word.isdigit() else word
        for word in text.split()
    )

def replace_non_alphabetic_with_whitespace(text):
    """Return ``text`` with each character outside a-z / A-Z replaced by one space.

    The substitution is one-for-one, so the length of the string is unchanged.
    """
    return re.sub(r'[^a-zA-Z]', ' ', text)

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table whose third argument lists the characters to drop.
    deletion_table = str.maketrans('', '', string.punctuation)
    stripped = text.translate(deletion_table)
    return stripped

def remove_whitespace(text):
    """Collapse whitespace runs in ``text`` to single spaces and trim both ends."""
    words = text.split()
    return " ".join(words)

nltk.download('stopwords')

# Lazily-built cache of the English stop-word set (filled on first use).
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, reading it from NLTK only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(text):
    """Tokenize ``text`` and drop the English stop words.

    Args:
        text: String to filter. Tokens are compared to the stop-word list by
            exact string match; no case folding is done here.

    Returns:
        list[str]: The tokens produced by ``word_tokenize(text)`` that are not
        in NLTK's English stop-word list, in their original order.
    """
    # The stop-word set never changes between calls, so it is built once and
    # reused instead of being re-read from the corpus on every call.
    stop_words = _english_stop_words()
    return [word for word in word_tokenize(text) if word not in stop_words]


# Porter stemmer instance used by stem_words().
stemmer = PorterStemmer()

def stem_words(text):
    """Tokenize ``text`` and return the list of ``stemmer.stem`` results, one per token."""
    stemmed = []
    for token in word_tokenize(text):
        stemmed.append(stemmer.stem(token))
    return stemmed


# NLTK data packages fetched ahead of the tokenizing / lemmatizing helpers
# (presumably the resources word_tokenize and WordNetLemmatizer rely on).
nltk.download('punkt')
nltk.download('wordnet')

# WordNet lemmatizer instance used by lemma_words().
lemmatizer = WordNetLemmatizer()

def lemma_words(text):
    """Tokenize ``text`` and return the list of ``lemmatizer.lemmatize`` results, one per token."""
    lemmatized = []
    for token in word_tokenize(text):
        lemmatized.append(lemmatizer.lemmatize(token))
    return lemmatized

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\auria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\auria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\auria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
def preprocess_pipeline(example):
    """Clean one example while keeping ``tokens`` and ``ner_tags`` aligned.

    Each original token goes through the same cleaning steps as before
    (lowercase, digits to words, non-alphabetic characters to spaces,
    punctuation removal, whitespace normalisation, stop-word removal, stemming,
    lemmatization), but one token at a time, with its NER tag carried along.
    The previous version cleaned the joined sentence and returned the original
    ``ner_tags`` unchanged, so as soon as a token was dropped or expanded the
    tag list no longer matched the token list.

    Args:
        example: Mapping with a ``'tokens'`` list of strings and a parallel
            ``'ner_tags'`` list (one tag per token).

    Returns:
        dict: ``{'tokens': [...], 'ner_tags': [...]}`` where both lists have
        the same length. A token that cleans to nothing (punctuation, stop
        word, ...) is dropped together with its tag; a token that expands into
        several words (e.g. a number spelled out) repeats its tag for every
        resulting word.

    Note:
        Only ``tokens`` and ``ner_tags`` are returned; any other per-token
        field of ``example`` is not realigned by this function.
        NOTE(review): if the tags follow a B-/I- scheme, repeating a B- tag on
        an expanded token or dropping the first token of an entity may need
        extra handling - confirm against the tag set before training.
    """
    cleaned_tokens = []
    cleaned_tags = []

    for token, tag in zip(example['tokens'], example['ner_tags']):
        text = text_lowercase(token)
        text = convert_number(text)
        text = replace_non_alphabetic_with_whitespace(text)
        text = remove_punctuation(text)
        text = remove_whitespace(text)

        # Nothing alphabetic is left (e.g. pure punctuation): drop token and tag.
        if not text:
            continue

        # The stop-word, stemming and lemmatization helpers return lists, so
        # turn the result back into a string between stages.
        text = " ".join(remove_stopwords(text))
        text = " ".join(stem_words(text))
        pieces = lemma_words(text)

        cleaned_tokens.extend(pieces)
        # One copy of the tag per surviving piece keeps both lists parallel.
        cleaned_tags.extend([tag] * len(pieces))

    processed_example = {'tokens': cleaned_tokens, 'ner_tags': cleaned_tags}
    return processed_example

In [21]:
# Run the preprocessing function over every split, in train/validation/test order.
train_processed, validation_processed, test_processed = (
    split.map(preprocess_pipeline)
    for split in (train_dataset, validation_dataset, test_dataset)
)

# Inspect the first five processed training examples.
print(train_processed[:5])

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

{'id': ['0', '1', '2', '3', '4'], 'tokens': [['eu', 'reject', 'german', 'call', 'boycott', 'british', 'lamb'], ['peter', 'blackburn'], ['brussel'], ['european', 'commiss', 'said', 'thursday', 'disagre', 'german', 'advic', 'consum', 'shun', 'british', 'lamb', 'scientist', 'determin', 'whether', 'mad', 'cow', 'diseas', 'transmit', 'sheep'], ['germani', 'repres', 'european', 'union', 'veterinari', 'committe', 'werner', 'zwingmann', 'said', 'wednesday', 'consum', 'buy', 'sheepmeat', 'countri', 'britain', 'scientif', 'advic', 'clearer']], 'pos_tags': [[22, 42, 16, 21, 35, 37, 16, 21, 7], [22, 22], [22, 11], [12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 35, 24, 35, 37, 16, 21, 15, 24, 41, 15, 16, 21, 21, 20, 37, 40, 35, 21, 7], [22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 22, 38, 15, 22, 24, 20, 37, 21, 15, 24, 16, 15, 22, 15, 12, 16, 21, 38, 17, 7]], 'chunk_tags': [[11, 21, 11, 12, 21, 22, 11, 12, 0], [11, 12], [11, 12], [11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 13, 11, 21, 22, 11, 12, 17

#### 2. Named entity recognition (NER)

#### 3. Relation Extraction (RE)

#### 4. Knowledge graph building

### Task 2 : Pipeline for knowledge graph construction

#### 1. Fetch news articles

#### 2. Use methods from Task 1