# Project : Web scrapping, knowledge base construction

## Part 1 : Web scrapping and knowledge base construction

In [1]:
from datasets import load_dataset
import nltk
import string
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import inflect

from nltk.stem.porter import PorterStemmer

from nltk.stem import WordNetLemmatizer

import sklearn_crfsuite
from sklearn_crfsuite import metrics

nltk.download()

import sklearn_crfsuite
from sklearn_crfsuite import metrics
from datasets import load_dataset

import spacy
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support

import pandas as pd

import json

from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


### Environment Setup

#### Datasets
We laod the CoNLL-2003 from Hugging Face

In [2]:
dataset = load_dataset("conll2003", trust_remote_code=True)

# Access the training, validation, and test sets
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

# Print the first example from the training set
print(train_dataset[0])

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


### Task 1 : Model for NER

#### 1. Text cleaning and prepocessing

In [3]:
def text_lowercase(text):
    return text.lower()

def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

p = inflect.engine()
# convert number into words
def convert_number(text):
    # split string into list of words
    temp_str = text.split()
    # initialise empty list
    new_string = []

    for word in temp_str:
        # if word is a digit, convert the digit
        # to numbers and append into the new_string list
        if word.isdigit():
            temp = p.number_to_words(word)
            new_string.append(temp)

        # append the word as it is
        else:
            new_string.append(word)

    # join the words of new_string to form a string
    temp_str = ' '.join(new_string)
    return temp_str

def replace_non_alphabetic_with_whitespace(text):
    modified_text = re.sub(r'[^a-zA-Z]', ' ', text)
    
    return modified_text

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

def remove_whitespace(text):
    return  " ".join(text.split())

nltk.download('stopwords')

def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text


stemmer = PorterStemmer()

def stem_words(text):
    word_tokens = word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return stems


nltk.download('punkt')
nltk.download('wordnet') 
lemmatizer = WordNetLemmatizer()

def lemma_words(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word) for word in word_tokens]
    return lemmas

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def preprocess_pipeline(example):
    text = " ".join(example['tokens'])

    text = text_lowercase(text)
    text = convert_number(text)
    text = replace_non_alphabetic_with_whitespace(text)
    text = remove_punctuation(text)
    text = remove_whitespace(text)

    # Convertir la liste en chaîne après avoir supprimé les stopwords
    text = " ".join(remove_stopwords(text))
    text = " ".join(stem_words(text))
    text = " ".join(lemma_words(text))
    
    processed_example = {'tokens': text.split(), 'ner_tags': example['ner_tags']}
    return processed_example

In [5]:
train_processed = train_dataset.map(preprocess_pipeline)
validation_processed = validation_dataset.map(preprocess_pipeline)
test_processed = test_dataset.map(preprocess_pipeline)

print(train_processed[:5])

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

{'id': ['0', '1', '2', '3', '4'], 'tokens': [['eu', 'reject', 'german', 'call', 'boycott', 'british', 'lamb'], ['peter', 'blackburn'], ['brussel'], ['european', 'commiss', 'said', 'thursday', 'disagre', 'german', 'advic', 'consum', 'shun', 'british', 'lamb', 'scientist', 'determin', 'whether', 'mad', 'cow', 'diseas', 'transmit', 'sheep'], ['germani', 'repres', 'european', 'union', 'veterinari', 'committe', 'werner', 'zwingmann', 'said', 'wednesday', 'consum', 'buy', 'sheepmeat', 'countri', 'britain', 'scientif', 'advic', 'clearer']], 'pos_tags': [[22, 42, 16, 21, 35, 37, 16, 21, 7], [22, 22], [22, 11], [12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 35, 24, 35, 37, 16, 21, 15, 24, 41, 15, 16, 21, 21, 20, 37, 40, 35, 21, 7], [22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 22, 38, 15, 22, 24, 20, 37, 21, 15, 24, 16, 15, 22, 15, 12, 16, 21, 38, 17, 7]], 'chunk_tags': [[11, 21, 11, 12, 21, 22, 11, 12, 0], [11, 12], [11, 12], [11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 13, 11, 21, 22, 11, 12, 17

#### 2. Named entity recognition (NER)

##### Conditional Random Field (CRF) with sklearn_crfsuite

In [6]:
from itertools import chain

# Define features and labels for training
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.istitle()': word.istitle(),
        'word.isupper()': word.isupper(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        if isinstance(word1, str):  # Vérifiez que word1 est une chaîne de caractères
            features.update({
                '+1:word.lower()': word1.lower(),
                '+1:word.istitle()': word1.istitle(),
                '+1:word.isupper()': word1.isupper(),
                '+1:postag': postag1,
                '+1:postag[:2]': postag1[:2],
            })
        else:
            print(f"Warning: word1 is not a string at index {i+1}: {word1}")
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

pos_tag_dict = dataset["train"].features["pos_tags"].feature.int2str
ner_tag_dict = dataset["train"].features["ner_tags"].feature.int2str

# Convert dataset into structured format with string POS and NER tags
train_sents = [
    list(zip(tokens, map(pos_tag_dict, pos_tags), map(ner_tag_dict, ner_tags)))
    for tokens, pos_tags, ner_tags in zip(train_dataset["tokens"], train_dataset["pos_tags"], train_dataset["ner_tags"])
]

test_sents = [
    list(zip(tokens, map(pos_tag_dict, pos_tags), map(ner_tag_dict, ner_tags)))
    for tokens, pos_tags, ner_tags in zip(test_dataset["tokens"], test_dataset["pos_tags"], test_dataset["ner_tags"])
]

X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]


# Train the CRF model
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False
)
crf.fit(X_train, y_train)


# Evaluate the model
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred))

y_test_flat = list(chain(*y_test))
y_pred_flat = list(chain(*y_pred))

# Calculer les métriques
precision_crf, recall_crf, f1_score_crf, _ = precision_recall_fscore_support(
    y_test_flat, y_pred_flat, average='weighted'
)

print(f"Precision: {precision_crf:.4f}, Recall: {recall_crf:.4f}, F1-score: {f1_score_crf:.4f}")

              precision    recall  f1-score   support

       B-LOC       0.86      0.80      0.83      1668
      B-MISC       0.83      0.75      0.79       702
       B-ORG       0.77      0.73      0.75      1661
       B-PER       0.83      0.85      0.84      1617
       I-LOC       0.82      0.66      0.73       257
      I-MISC       0.71      0.68      0.69       216
       I-ORG       0.69      0.76      0.72       835
       I-PER       0.87      0.95      0.91      1156
           O       0.99      0.99      0.99     38323

    accuracy                           0.96     46435
   macro avg       0.82      0.80      0.80     46435
weighted avg       0.96      0.96      0.96     46435

Precision: 0.9557, Recall: 0.9559, F1-score: 0.9556


##### spaCy

In [7]:
true_entities = []
pred_entities = []
nlp = spacy.load("./best_ner_model")

for text, true_labels in zip(test_dataset["tokens"], test_dataset["ner_tags"]):
    text_str = " ".join(text)
    doc = nlp(text_str)
    
    # Convert true labels to named entity format
    true_labels = [ner_tag_dict(label) for label in true_labels]  
    true_entities.append(true_labels)
    
    # Initialiser les labels prédits avec 'O' (Outside)
    predicted_labels = ["O"] * len(text)
    
    # Associer les entités détectées aux tokens
    for ent in doc.ents:
        ent_text = ent.text.split()  # Séparer en tokens
        for token in ent_text:
            if token in text:  # Vérifier si le token est bien dans le texte d'origine
                idx = text.index(token)  # Trouver l'index du token
                predicted_labels[idx] = ent.label_  # Assigner l'entité
        
    pred_entities.append(predicted_labels)

precision_spaCy, recall_spaCy, f1_spaCy, _ = precision_recall_fscore_support(
    [label for sent in true_entities for label in sent], 
    [label for sent in pred_entities for label in sent], 
    average='weighted'
)

print(f"spaCy NER Model - Precision: {precision_spaCy:.4f}, Recall: {recall_spaCy:.4f}, F1-score: {f1_spaCy:.4f}")



spaCy NER Model - Precision: 0.8090, Recall: 0.8168, F1-score: 0.8129


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


##### Comparition of the performances

In [8]:
metrics_df = pd.DataFrame({
    'Metric': ['Precision', 'Recall', 'F1-Score'],
    'CRF': [precision_crf, recall_crf, f1_score_crf],
    'spaCy': [precision_spaCy, recall_spaCy, f1_spaCy]
})

metrics_df

Unnamed: 0,Metric,CRF,spaCy
0,Precision,0.955725,0.808966
1,Recall,0.955938,0.816776
2,F1-Score,0.95562,0.812852


We can see that the CRF model has a better score for these 3 metrics, which means that CRF performs better than spaCy overall for the named entity recognition task on our dataset.

##### Saving the extracted entities along their positions

In [9]:
import json

# Extraire les entités prédites par CRF
crf_entities = []
for sent, labels in zip(test_sents, y_pred):
    for i, label in enumerate(labels):
        if label != 'O':  # Exclure les tokens non annotés
            crf_entities.append({
                "text": sent[i][0],
                "entity": label,
                "position": i
            })

# Sauvegarde en JSON
with open("crf_entities.json", "w") as f:
    json.dump(crf_entities, f, indent=4)

print("Entities extracted from CRF model saved in crf_entities.json")

Entities extracted from CRF model saved in crf_entities.json


In [10]:
spacy_entities = []

for text in test_dataset["tokens"]:
    sentence = " ".join(text)
    doc = nlp(sentence)
    for ent in doc.ents:
        spacy_entities.append({
            "text": ent.text,
            "entity": ent.label_,
            "start": ent.start_char,
            "end": ent.end_char
        })

# Sauvegarde en JSON
with open("spacy_entities.json", "w") as f:
    json.dump(spacy_entities, f, indent=4)

print("Entities extracted from spaCy model saved in spacy_entities.json")

Entities extracted from spaCy model saved in spacy_entities.json


#### 3. Relation Extraction (RE)

In [11]:
def extract_relations_from_dataset(dataset):
    nlp = spacy.load("en_core_web_sm")
    relations = []
    
    for example in dataset:
        text = " ".join(example["tokens"])  # Reconstituer la phrase à partir des tokens
        doc = nlp(text)
        
        for token in doc:
            if (token.dep_ == "nsubj" or token.dep_ == "nsubjpass") and token.head.dep_ == "ROOT":
                subject = token.text
                predicate = token.head.text  # Verbe principal                
                for child in token.head.children:
                    if child.dep_ in ["prep", "agent"]:  # Préposition ou agent
                        for obj in child.children:
                            if obj.dep_ == "pobj":  # Objet de la préposition
                                relations.append((subject, predicate, obj.text))
    
    return relations

# Exemple d'utilisation avec un dataset
from datasets import load_dataset

dataset = load_dataset("conll2003", trust_remote_code=True)
train_dataset = dataset['train']

extracted_relations = extract_relations_from_dataset(train_dataset)
print("Relations extraites:", extracted_relations[:10])

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

#### 4. Knowledge graph building

In [None]:
def knowledge_graph_from_relations(extracted_relations, word_to_find):
    # Create a new RDF graph
    g = Graph()

    # Define namespaces
    EX = Namespace("http://example.org/")

    # Ajouter les triplets au graphe
    for subj, pred, obj in extracted_relations:
        g.add((URIRef(EX[subj]), URIRef(EX[pred]), URIRef(EX[obj])))

    # Afficher les triplets RDF
    # print(g.serialize(format="turtle"))


    # Perform a SPARQL query
    query = """
    PREFIX ex: <http://example.org/>
    SELECT ?subject ?predicate
    WHERE {
        ?subject ?predicate <http://example.org/""" + word_to_find + """>
    }
    """
    for row in g.query(query):
        print(f"{row.subject} {row.predicate} ")

In [None]:
knowledge_graph_from_relations(extracted_relations, "Fischler")

http://example.org/He http://example.org/said 


In [39]:
text_star_wars = "Star Wars IV is a Movie where there are different kinds of creatures, like humans and wookies. Some creatures are Jedis; for instance, the human Luke is a Jedi, and Master Yoda - for whom the species is not known - is also a Jedi. The wookie named Chewbacca is Han's co-pilot on the Millennium Falcon starship. The speed of Millennium Falcon is 1.5 (above the speed of light!)"


In [40]:
# Load spaCy's pre-trained NER model
nlp = spacy.load("./best_ner_model")

# Process the text with spaCy
doc = nlp(text_star_wars)

# Extract named entities
entities = [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents]
print("Extracted Entities:", entities)



Extracted Entities: [('Star Wars IV', 'ORG', 0, 12), ('Movie', 'PER', 18, 23), ('Jedis', 'PER', 114, 119), ('Luke', 'PER', 145, 149), ('Jedi', 'PER', 155, 159), ('Master Yoda', 'PER', 165, 176), ('Jedi', 'PER', 225, 229), ('Chewbacca', 'LOC', 248, 257), ('Millennium Falcon', 'ORG', 283, 300), ('Millennium Falcon', 'PER', 324, 341)]


In [41]:
def extract_relations_from_text(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    relations = []
    for token in doc:
        if token.dep_ in ["nsubj", "nsubjpass"] and token.head.pos_ in ["VERB", "AUX", "ROOT"]:
            subject = token.text
            predicate = token.head.text
            # Explorer les compléments pour identifier les objets et attributs
            for child in token.head.children:
                if child.dep_ in ["dobj", "attr", "acomp"]:
                    relations.append((subject, predicate, child.text))
                
                elif child.dep_ == "prep" or child.dep_ == "agent":  # Gérer les relations prépositionnelles
                    for obj in child.children:
                        if obj.dep_ == "pobj":
                            relations.append((subject, f"{predicate} {child.text}", obj.text))

        # Gérer les relations attributives directes (ex: "Luke is a Jedi")
        if token.dep_ in ["attr", "appos"] and token.head.dep_ in ["nsubj", "nsubjpass"]:
            relations.append((token.head.text, "is", token.text))
    
    return relations

extracted_relations = extract_relations_from_text(text_star_wars)
print("Relations extraites:", extracted_relations)

Relations extraites: [('IV', 'is', 'Movie'), ('creatures', 'are', 'Jedis'), ('Luke', 'is for', 'instance'), ('Luke', 'is', 'Jedi'), ('Yoda', 'is', 'Jedi'), ('species', 'known for', 'whom'), ('wookie', 'is', 'co'), ('wookie', 'is', '-'), ('wookie', 'is', 'pilot'), ('wookie', 'is on', 'starship'), ('speed', 'is', '1.5')]


### Task 2 : Pipeline for knowledge graph construction

#### 1. Fetch news articles

In [13]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import os

NB_BOOKS = 10  # Number of books to scrape
EXPORT_PATH = "./books/"

url = "https://www.goodreads.com/list/show/1.Best_Books_Ever"  # Best Books Ever list

options = webdriver.FirefoxOptions()
options.headless = True  
browser = webdriver.Firefox(options=options)

try:
    browser.get(url)
    time.sleep(5)  # To allow time for page to load

    books_data = []
    while len(books_data) < NB_BOOKS:
        soup = BeautifulSoup(browser.page_source, 'html.parser')

        # Find book listings
        for book_item in soup.select('a.bookTitle')[:NB_BOOKS - len(books_data)]:
            title = book_item.get_text(strip=True)
            link = "https://www.goodreads.com" + book_item['href']

            books_data.append((title, link))

        # If we haven't scraped enough books, go to the next page
        next_page = soup.select_one('.next a')
        if next_page:
            next_url = "https://www.goodreads.com" + next_page['href']
            browser.get(next_url)
            time.sleep(5)
        else:
            break

    
    if not os.path.exists(EXPORT_PATH):
        os.makedirs(EXPORT_PATH)

    for idx, (title, link) in enumerate(books_data[:NB_BOOKS]):
        browser.get(link)
        time.sleep(3) 

        book_soup = BeautifulSoup(browser.page_source, 'html.parser')

        author_tag = book_soup.select_one('.BookPageMetadataSection__contributor .ContributorLink__name')
        author = author_tag.get_text(strip=True) if author_tag else "Unknown"

        date_tag = book_soup.select_one('.BookDetails .FeaturedDetails p[data-testid="publicationInfo"]')
        release_date = date_tag.get_text(strip=True) if date_tag else "Unknown"

        summary_tag = book_soup.select_one('.BookPageMetadataSection__description')
        summary = summary_tag.get_text("\n", strip=True) if summary_tag else "No summary available."
        
        summary = summary.replace("Show more", "").strip()
        release_date=release_date.replace("First published", "").strip()

        # Save to file
        filename = f"{EXPORT_PATH}book_{idx+1}.txt"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(f"Title: {title}\n")
            f.write(f"Author: {author}\n")
            f.write(f"Release Date: {release_date}\n")
            f.write(f"URL: {link}\n\n")
            f.write(f"Summary:\n{summary}")

        print(f"Saved: {filename}")

finally:
    browser.quit()

print("Scraping complete!")

Saved: ./books/book_1.txt
Saved: ./books/book_2.txt
Saved: ./books/book_3.txt
Saved: ./books/book_4.txt
Saved: ./books/book_5.txt
Saved: ./books/book_6.txt
Saved: ./books/book_7.txt
Saved: ./books/book_8.txt
Saved: ./books/book_9.txt
Saved: ./books/book_10.txt
Scraping complete!


#### 2. Use methods from Task 1

In [15]:
#Pre-processing

INPUT_DIR = "./books/"

original_summaries = []
processed_summaries = []

def preprocess_pipeline_for_str(text):
    text = text_lowercase(text)
    text = convert_number(text)
    text = replace_non_alphabetic_with_whitespace(text)
    text = remove_punctuation(text)
    text = remove_whitespace(text)

    # Convertir la liste en chaîne après avoir supprimé les stopwords
    text = " ".join(remove_stopwords(text))
    text = " ".join(stem_words(text))
    text = " ".join(lemma_words(text))
    return text

for filename in os.listdir(INPUT_DIR):
    if filename.endswith(".txt"):
        file_path = os.path.join(INPUT_DIR, filename)

        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Extract the summary 
        summary_start = content.find("Summary:")
        if summary_start != -1:
            summary_text = content[summary_start + len("Summary:"):].strip()
        else:
            summary_text = "No summary available."

        original_summaries.append(summary_text)

        cleaned_summary = preprocess_pipeline_for_str(summary_text)

        processed_summaries.append(cleaned_summary)

# Print results
for i in range(len(original_summaries)):
    print(f"\nBook {i+1}:")
    print(f"\nOriginal Summary:\n{original_summaries[i]}")
    print(f"\nProcessed Summary:\n{processed_summaries[i]}")




Book 1:

Original Summary:
Could you survive on your own in the wild, with every one out to make sure you don't live to see the morning?
In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and one girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV.
Sixteen-year-old Katniss Everdeen, who lives alone with her mother and younger sister, regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against humanity and life against love.

Processed Summary:
could surviv wild everi one make sure l

In [37]:
nlp = spacy.load("./best_ner_model") 


extracted_entities = []

for summary in original_summaries:
    doc = nlp(summary) 

    entities = [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents]
    extracted_entities.append(entities)

for i in range(len(extracted_entities)): 
    print(f"Book {i+1}: ",extracted_entities[i])


Book 1:  [('North America', 'LOC', 148, 161), ('Panem', 'PER', 181, 186), ('Capitol', 'PER', 198, 205), ('Hunger Games', 'MISC', 428, 440), ('Katniss Everdeen', 'PER', 492, 508), ('Games', 'MISC', 650, 655), ('Katniss', 'MISC', 661, 668)]
Book 2:  [('Hazel', 'ORG', 77, 82), ('Augustus Waters', 'ORG', 202, 217), ('Hazel', 'ORG', 264, 269), ('Insightful', 'MISC', 315, 325), ('The Fault', 'ORG', 354, 363), ('John Green', 'PER', 401, 411)]
Book 3:  [('Harry Potter', 'PER', 0, 12), ('Hogwarts', 'LOC', 49, 57), ('Wizardry', 'PER', 83, 91), ('Harry', 'PER', 117, 122), ('Dursleys', 'MISC', 203, 211), ('Hermione', 'PER', 289, 297), ('Harry', 'PER', 326, 331), ('Harry', 'PER', 503, 508), ('Hogwarts', 'PER', 549, 557)]
Book 4:  [('Pride', 'ORG', 37, 42), ('English', 'MISC', 108, 115), ('Jane Austen', 'PER', 126, 137), ('Elizabeth Bennet', 'PER', 216, 232), ('Elizabeth', 'ORG', 331, 340), ('Darcy', 'PER', 365, 370), ('Jane Austen', 'PER', 425, 436), ('Regency England', 'LOC', 587, 602)]
Book 5:  [

In [44]:
extracted_relations=[]
for summary in original_summaries:
    relations=extract_relations_from_text(summary)
    extracted_relations.append(relations)
    
for i in range(len(extracted_relations)):    
    print(f"Relations extraites for book {i+1}: ", extracted_relations[i])

Relations extraites for book 1:  [('you', 'survive on', 'own'), ('you', 'survive in', 'wild'), ('Capitol', 'is', 'harsh'), ('Everdeen', 'regards', 'it'), ('Everdeen', 'regards as', 'sentence'), ('who', 'lives', 'alone'), ('Katniss', 'been', 'close'), ('survival', 'is for', 'her'), ('survival', 'is', 'nature'), ('she', 'becomes', 'contender')]
Relations extraites for book 2:  [('that', 'bought', 'her'), ('Hazel', 'been Despite', 'miracle'), ('Hazel', 'been', 'anything'), ('chapter', 'inscribed upon', 'diagnosis'), ('twist', 'appears at', 'Group'), ('story', 'is', 'about'), ('irreverent', 'is', 'work'), ('irreverent', 'is', 'Fault')]
Relations extraites for book 3:  [('Potter', 'is', 'about'), ('Harry', 'enjoys Unlike', 'schoolboys'), ('Harry', 'enjoys', 'holidays'), ('summer', 'is', 'worse'), ('Dursleys', 'making of', 'course'), ('Harry', 'had', 'enough'), ('he', 'do', 'something'), ('holidays', 'come to', 'end'), ('holidays', 'come in', 'fashion'), ('Harry', 'is', 'about')]
Relations e

In [46]:
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
displacy.render(nlp(original_summaries[0]), style="dep", jupyter=True, options={"compact": True, "distance": 100})

In [54]:
import os

# Define the folder where your txt files are located
folder_path = 'books'

# Initialize a list to store the book titles
book_titles = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            for line in file:
                if line.startswith("Title:"):
                    # Extract the title after 'Title: ' and strip newline characters
                    title = line.split("Title:")[1].strip()
                    book_titles.append(title)
                    break  # Stop reading after the title is found

# Print the list of extracted book titles
print(book_titles)


['The Hunger Games (The Hunger Games, #1)', 'The Fault in Our Stars', 'Harry Potter and the Order of the Phoenix (Harry Potter, #5)', 'Pride and Prejudice', 'To Kill a Mockingbird', 'The Book Thief', 'Twilight (The Twilight Saga, #1)', 'Animal Farm', 'J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings', 'The Chronicles of Narnia (The Chronicles of Narnia, #1-7)']


In [58]:
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF

# Initialize graph
g = Graph()

# Define namespace
EX = Namespace("http://example.org/")
g.bind("ex", EX)


# Loop over each book's relations
for i, relations in enumerate(extracted_relations):
    book_name = book_titles[i]
    book_uri = URIRef(EX[book_name.replace(" ", "_")])

    for relation in relations:
        if len(relation) == 3:  # Make sure the relation has subject, predicate, and object
            subject, predicate, obj = relation

            # Create URIs for subject, predicate, and object
            subject_uri = URIRef(EX[subject.replace(" ", "_")])
            predicate_uri = URIRef(EX[predicate.replace(" ", "_")])
            object_uri = URIRef(EX[obj.replace(" ", "_")])

            # Add triples to the graph
            g.add((subject_uri, predicate_uri, object_uri))
            g.add((book_uri, URIRef(EX["contains_relation"]), subject_uri))


print(f"Number of triples in the graph: {len(g)}")
# Serialize the graph to Turtle format (readable)
print(g.serialize(format="turtle"))

Number of triples in the graph: 169
@prefix ex: <http://example.org/> .

ex:Animal_Farm ex:contains_relation ex:Russia,
        ex:comedy,
        ex:farm,
        ex:it,
        ex:stage,
        ex:that,
        ex:they .

<http://example.org/Harry_Potter_and_the_Order_of_the_Phoenix_(Harry_Potter,_#5)> ex:contains_relation ex:Dursleys,
        ex:Harry,
        ex:Potter,
        ex:he,
        ex:holidays,
        ex:summer .

<http://example.org/J.R.R._Tolkien_4-Book_Boxed_Set:_The_Hobbit_and_The_Lord_of_the_Rings> ex:contains_relation ex:Baggins,
        ex:Lord,
        ex:himself,
        ex:masterpiece,
        ex:set .

ex:Pride_and_Prejudice ex:contains_relation ex:Austen,
        ex:Pride,
        ex:characters,
        ex:clash .

ex:The_Book_Thief ex:contains_relation ex:Death,
        ex:It,
        ex:Zusak,
        ex:country,
        ex:family,
        ex:it,
        ex:life,
        ex:she,
        ex:that,
        ex:these,
        ex:title .

<http://example.org/Th

## Part 2: Knowledge Graph Embedding

### Knowledge Graph Embedding

#### 1. Setup PyKEEN