## Evaluation Criteria

In [2]:
from datasets import load_dataset
import nltk
import string
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import inflect

from nltk.stem.porter import PorterStemmer

from nltk.stem import WordNetLemmatizer

import sklearn_crfsuite
from sklearn_crfsuite import metrics

# nltk.download()

import sklearn_crfsuite
from sklearn_crfsuite import metrics
from datasets import load_dataset

import spacy
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support

import pandas as pd

import json

from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


### Basic Implementation (3 points)

#### Functional web scraping script with at least 10 articles

In [None]:
def scrape_article():
    """Return the article text consumed by the rest of the pipeline.

    Placeholder implementation: no scraping is performed, a fixed greeting
    string is returned instead.
    """
    return "Bonjour !"

#### Clean text preprocessing (no HTML tags, proper sentence structure)

In [3]:
def text_lowercase(text):
    """Return a copy of *text* with all cased characters lowercased."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Return *text* with every run of digits deleted (nothing is inserted)."""
    return re.sub(r'\d+', '', text)

p = inflect.engine()


def convert_number(text):
    """Replace every digit-only word of *text* with its spelled-out form.

    The text is split on whitespace. Words for which ``str.isdigit()`` is
    true are passed to ``p.number_to_words``; every other word is kept as
    is (so a token such as ``1.5`` is left untouched). The words are then
    re-joined with single spaces, which also collapses any run of
    whitespace in the input.
    """
    spelled_out = [
        p.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    ]
    return ' '.join(spelled_out)

def replace_non_alphabetic_with_whitespace(text):
    """Return *text* with each character outside a-z / A-Z turned into a space.

    The replacement is one space per character, so the length of the
    string is preserved.
    """
    non_letter = re.compile(r'[^a-zA-Z]')
    return non_letter.sub(' ', text)

def remove_punctuation(text):
    """Return *text* with every character of ``string.punctuation`` removed."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

def remove_whitespace(text):
    """Collapse each whitespace run to one space and trim both ends."""
    words = text.split()
    return " ".join(words)

nltk.download('stopwords')


def remove_stopwords(text):
    """Tokenize *text* and return the tokens that are not English stopwords.

    The result is a list of tokens, not a string. Tokens are compared
    as-is against NLTK's English stopword list, so the match is
    case-sensitive.
    """
    english_stopwords = set(stopwords.words("english"))
    return [
        token
        for token in word_tokenize(text)
        if token not in english_stopwords
    ]


stemmer = PorterStemmer()


def stem_words(text):
    """Tokenize *text* and return the Porter stem of every token as a list."""
    return [stemmer.stem(token) for token in word_tokenize(text)]


# Resources needed by word_tokenize ('punkt') and WordNetLemmatizer ('wordnet').
nltk.download('punkt')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def lemma_words(text):
    """Tokenize *text* and return the WordNet lemma of every token as a list.

    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's default is used for every token.
    """
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]


def preprocess_pipeline(text):
    """Run the full text-cleaning chain and return the resulting tokens.

    Stages, in order: lowercase, spell out digit-only words, replace
    non-letters with spaces, strip punctuation, collapse whitespace,
    remove stopwords, stem, lemmatize.

    Returns:
        A dict with a single key ``'tokens'`` holding the list of
        processed tokens.
    """
    # String -> string cleaning stages, applied in this exact order.
    for clean in (
        text_lowercase,
        convert_number,
        replace_non_alphabetic_with_whitespace,
        remove_punctuation,
        remove_whitespace,
    ):
        text = clean(text)

    # These stages return token lists; join back into a string so the next
    # stage can tokenize it again.
    for to_tokens in (remove_stopwords, stem_words, lemma_words):
        text = " ".join(to_tokens(text))

    return {'tokens': text.split()}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\auria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\auria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\auria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Sample paragraph used to demonstrate the preprocessing pipeline.
text_star_wars = (
    "Star Wars IV is a Movie where there are different kinds of creatures, like humans and wookies. "
    "Some creatures are Jedis; for instance, the human Luke is a Jedi, and Master Yoda - for whom the species is not known - is also a Jedi. "
    "The wookie named Chewbacca is Han's co-pilot on the Millennium Falcon starship. "
    "The speed of Millennium Falcon is 1.5 (above the speed of light!)"
)

print(preprocess_pipeline(text_star_wars))

{'tokens': ['star', 'war', 'iv', 'movi', 'differ', 'kind', 'creatur', 'like', 'human', 'wooki', 'creatur', 'jedi', 'instanc', 'human', 'luke', 'jedi', 'master', 'yoda', 'speci', 'known', 'also', 'jedi', 'wooki', 'name', 'chewbacca', 'han', 'co', 'pilot', 'millennium', 'falcon', 'starship', 'speed', 'millennium', 'falcon', 'speed', 'light']}


#### Complete pipeline from scraping to graph construction

In [None]:
from spacy import displacy

def pipeline_scrap_to_graph():
    """Scrape an article and render its dependency parse in the notebook.

    NOTE(review): despite the name, no graph is built here yet; the
    function only calls ``displacy.render`` and returns ``None``.
    """
    article = scrape_article()
    parser = spacy.load("en_core_web_sm")
    parsed = parser(article)
    render_options = {"compact": True, "distance": 100}
    displacy.render(parsed, style="dep", jupyter=True, options=render_options)

### Named Entity Recognition (3 points)

#### Number of unique entities identified (>50 entities: full points)

### Relation Extraction (3 points)

#### Basic relation extraction implementation

In [None]:
def extract_relations_from_text(text, nlp=None):
    """Extract (subject, predicate, object) triples from *text*.

    Three rules are applied to every token of the parsed text:

    1. A subject (``nsubj`` / ``nsubjpass``) whose head is a verb or
       auxiliary yields one triple per ``dobj`` / ``attr`` / ``acomp``
       child of that head.
    2. For the same subject, each ``prep`` / ``agent`` child of the head
       yields one triple per ``pobj`` below it, with the preposition
       appended to the predicate (e.g. ``"is on"``).
    3. An ``attr`` / ``appos`` token attached directly to a subject token
       yields ``(subject, "is", token)``.

    Args:
        text: Raw text to analyse.
        nlp: Optional callable that turns ``text`` into an iterable of
            tokens exposing ``text``, ``dep_``, ``pos_``, ``head`` and
            ``children`` (for example an already-loaded spaCy pipeline).
            When omitted, ``en_core_web_sm`` is loaded on each call.

    Returns:
        A list of ``(subject, predicate, object)`` string tuples, in the
        order they were found. Empty when nothing matches.
    """
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    relations = []
    for token in doc:
        # NOTE(review): "ROOT" looks like a dependency label rather than a
        # POS tag, so it probably never matches here; kept to preserve the
        # original condition.
        if token.dep_ in ("nsubj", "nsubjpass") and token.head.pos_ in ("VERB", "AUX", "ROOT"):
            subject = token.text
            predicate = token.head.text
            # Walk the verb's complements to find objects and attributes.
            for child in token.head.children:
                if child.dep_ in ("dobj", "attr", "acomp"):
                    relations.append((subject, predicate, child.text))
                elif child.dep_ in ("prep", "agent"):
                    # Prepositional relations: the object hangs below the
                    # preposition, which becomes part of the predicate.
                    for obj in child.children:
                        if obj.dep_ == "pobj":
                            relations.append((subject, f"{predicate} {child.text}", obj.text))

        # Direct attributive relations attached to a subject token.
        if token.dep_ in ("attr", "appos") and token.head.dep_ in ("nsubj", "nsubjpass"):
            relations.append((token.head.text, "is", token.text))

    # The original never returned the collected triples.
    return relations

#### At least 3 custom rules (with examples extracted from the text)

### Knowledge Graph Quality (2 points)

#### All triples in valid RDF format using proper namespaces

#### At least 50 meaningful triples in the graph

### Entity Linking (2 points)

#### Working DBpedia/Wikidata linking implementation

#### > 50% entities linked to external knowledge bases

#### Documentation of disambiguation strategy

### Knowledge Graph Embedding (3 points)

#### Model training with at least 2 different architectures

#### How the performance is improved by data enhancement

### Link Prediction (2 points)

#### Evaluation using standard metrics (MRR, Hits@k)