In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json
from tqdm.auto import tqdm
import hashlib

## Extract each article's URL, title and category from the sitemap page

In [2]:
def extract_articles_from_sitemap(sitemap_url, timeout=10):
    """Collect the category, title and URL of every article listed on the sitemap page.

    The page is searched for ``<ul class="wsp-posts-list">`` lists. Each list is
    attributed to the closest preceding ``<strong class="wsp-category-title">``
    element, whose link text is used as the category name. Lists without such
    a heading (or whose heading has no link) are ignored.

    Args:
        sitemap_url: URL of the HTML sitemap page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{'category', 'title', 'url'}`` dicts (possibly empty), or
        ``None`` when the page could not be retrieved.
    """
    try:
        # Without a timeout, requests can wait forever on an unresponsive server.
        response = requests.get(sitemap_url, timeout=timeout)
    except requests.RequestException as exc:
        # Same failure contract as a bad status code: report and return None.
        print(f"Failed to retrieve the sitemap. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the sitemap. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # List to store the extracted articles
    articles = []

    for category_section in soup.find_all('ul', class_='wsp-posts-list'):
        # The category heading is the nearest matching <strong> before the list.
        category_title_tag = category_section.find_previous('strong', class_='wsp-category-title')
        category_link = category_title_tag.find('a') if category_title_tag else None
        if not category_link:
            continue
        category_name = category_link.get_text(strip=True)

        for article_item in category_section.find_all('li', class_='wsp-post'):
            article_link = article_item.find('a')
            # An anchor without href would raise KeyError with ['href']; skip it instead.
            article_url = article_link.get('href') if article_link else None
            if not article_url:
                continue

            articles.append({
                'category': category_name,
                'title': article_link.get_text(strip=True),
                'url': article_url
            })

    return articles


## Fetches and cleans the content of an article from the given URL

In [3]:
def extract_article_info(article_url, timeout=10):
    """Download an article page and return its cleaned body text and its tags.

    The body is taken from ``<div class="entry-content">`` after removing the
    table-of-contents container, any ``heateor`` divs and the
    ``elementor-col-33`` columns. Tags are read from the ``rel="tag"`` links
    of the second ``<span class="entry-meta">`` inside
    ``<footer class="entry-footer">``.

    Args:
        article_url: URL of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``{'text': str, 'tags': str}`` where ``tags`` is a comma-separated
        string (empty when no tags are found), or ``None`` when the page could
        not be retrieved or has no ``entry-content`` div.
    """
    try:
        # Without a timeout, one unresponsive page can stall the whole crawl.
        response = requests.get(article_url, timeout=timeout)
    except requests.RequestException as exc:
        # Same failure contract as a bad status code: report and return None.
        print(f"Failed to retrieve the article. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the article. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    content_div = soup.find('div', class_='entry-content')
    if not content_div:
        print("No content found with the class 'entry-content'.")
        return None

    # Remove <div id="ez-toc-container"...> (table of contents)
    toc_container = content_div.find('div', id='ez-toc-container')
    if toc_container:
        toc_container.decompose()

    # Remove all <div> elements where the class contains "heateor"
    for heateor_div in content_div.find_all('div', class_=lambda x: x and 'heateor' in x):
        heateor_div.decompose()

    # Remove the <div class="elementor-column elementor-col-33..."> columns
    for container_div in content_div.find_all('div', class_=lambda x: x and 'elementor-column elementor-col-33' in x):
        container_div.decompose()

    # Tabs are deleted outright (as before); every other whitespace run,
    # including newlines and non-breaking spaces, collapses to a single space.
    content = content_div.get_text(separator="\n").replace('\t', '')
    cleaned_content = re.sub(r'\s+', ' ', content).strip()

    tags = ''
    footer = soup.find('footer', class_='entry-footer')
    if footer:
        tags_span = footer.find_all('span', class_='entry-meta')
        if len(tags_span) > 1:
            # The second 'entry-meta' span is the one read for tags.
            tags_links = tags_span[1].find_all('a', rel='tag')
            tags = ', '.join(tag.get_text() for tag in tags_links)

    return {
        'text': cleaned_content,
        'tags': tags
    }

In [4]:
# Spot check on one live article: display the comma-separated tags string.
# NOTE: extract_article_info returns None on failure, in which case this
# subscript raises TypeError.
extract_article_info("https://arsonor.com/comment-eduquer-loreille-a-lart-du-mixage-part-3/")['tags']

'écoute critique, espace sonore, mono, pan, profondeur, psycho-acoustique, reverb, stéréo'

## Chunk processing: first method (fixed-size word windows with overlap)

In [5]:
def chunk_text(text, chunk_size=240, overlap_size=20):
    """Split text into overlapping chunks of whitespace-separated words.

    Each chunk holds at most ``chunk_size`` words and starts
    ``chunk_size - overlap_size`` words after the previous one, so consecutive
    chunks share ``overlap_size`` words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap_size: Number of words shared by consecutive chunks; must
            satisfy ``0 <= overlap_size < chunk_size``.

    Returns:
        The list of chunks, each a single-space-joined string. Empty when
        ``text`` contains no words.

    Raises:
        ValueError: If the sizes would prevent the window from advancing
            (previously an infinite loop) or would skip words.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap_size < chunk_size:
        raise ValueError("overlap_size must satisfy 0 <= overlap_size < chunk_size")

    words = text.split()
    text_length = len(words)
    step = chunk_size - overlap_size

    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))

        # Stop once a window has reached the last word; a further window
        # would only repeat the overlap tail.
        if end >= text_length:
            break

    return chunks

## Chunk processing: second method (larger, sentence-aware chunks)

In [78]:
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer
from typing import List

# Uncomment the following line to download NLTK data if not already present
# nltk.download('punkt')

def chunk_dynamictext(text: str, chunk_size: int = 300, overlap_size: int = 50) -> List[str]:
    """Split text into chunks made of whole sentences, with sentence-level overlap.

    Sentences are accumulated until adding the next one would push the chunk
    past ``chunk_size`` words. The chunk is then emitted and the next chunk
    starts with the trailing sentences of the previous one: from the sentence
    containing the word located ``overlap_size`` words before the end.

    Sentences are never split, so a single sentence longer than ``chunk_size``
    still yields a chunk larger than ``chunk_size``.

    Args:
        text: The text to split.
        chunk_size: Target maximum number of words per chunk.
        overlap_size: Approximate number of trailing words repeated at the
            start of the next chunk. ``0`` or less disables the overlap.

    Returns:
        The list of chunks, each made of sentences joined by single spaces.
    """
    # NOTE(review): the tokenizer is created without training text or
    # language-specific parameters - confirm this is adequate for the corpus.
    sentences = PunktSentenceTokenizer().tokenize(text)

    chunks = []
    current_chunk = []
    current_chunk_size = 0

    for sentence in sentences:
        sentence_size = len(sentence.split())

        # Close the current chunk when the next sentence would overflow it.
        if current_chunk and current_chunk_size + sentence_size > chunk_size:
            chunks.append(" ".join(current_chunk))

            # Index of the first sentence carried over into the next chunk.
            # Default: carry nothing (this is the outcome for overlap_size <= 0;
            # previously the whole chunk was carried over in that case).
            carry_start = len(current_chunk)
            if overlap_size > 0:
                overlap_point = max(0, current_chunk_size - overlap_size)
                word_count = 0
                for i, sent in enumerate(current_chunk):
                    word_count += len(sent.split())
                    if word_count > overlap_point:
                        carry_start = i
                        break
                # Always drop at least the first sentence. Carrying the whole
                # chunk forward makes no progress and turns every later chunk
                # into a growing superset of the previous ones.
                carry_start = max(carry_start, 1)

            current_chunk = current_chunk[carry_start:]
            current_chunk_size = sum(len(sent.split()) for sent in current_chunk)

        current_chunk.append(sentence)
        current_chunk_size += sentence_size

    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

## Create the JSON file, adding a unique ID for each article and each chunk

In [85]:
def create_json_from_sitemap(sitemap_url, output_file, chunk_size=350, overlap_size=30):
    """Crawl every article listed on the sitemap, chunk it and dump all chunks to JSON.

    Each article gets an 8-character id taken from the MD5 hex digest of its
    title joined with the first 10 characters of its text; each chunk id is
    that article id followed by ``-<1-based position>``.

    Args:
        sitemap_url: URL passed to ``extract_articles_from_sitemap``.
        output_file: Path of the JSON file to write (UTF-8, indent of 4).
        chunk_size: Forwarded to ``chunk_dynamictext``.
        overlap_size: Forwarded to ``chunk_dynamictext``.

    Returns:
        None. Nothing is written when the sitemap yields no articles.
    """
    sitemap_entries = extract_articles_from_sitemap(sitemap_url)
    if not sitemap_entries:
        print("No articles found.")
        return

    records = []
    for entry in tqdm(sitemap_entries):
        info = extract_article_info(entry['url'])
        if not info:
            print(f"Skipping article: {entry['title']} due to missing content.")
            continue

        body = info['text']
        fingerprint = f"{entry['title']}-{body[:10]}"
        article_id = hashlib.md5(fingerprint.encode()).hexdigest()[:8]

        pieces = chunk_dynamictext(body, chunk_size=chunk_size, overlap_size=overlap_size)

        # One record per chunk, numbered from 1 (article_id-1, article_id-2, ...).
        records.extend(
            {
                'article_id': article_id,
                'url': entry['url'],
                'title': entry['title'],
                'category': entry['category'],
                'tags': info['tags'],
                'chunk_id': f"{article_id}-{position}",
                'chunk_text': piece,
            }
            for position, piece in enumerate(pieces, start=1)
        )

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(records, sink, ensure_ascii=False, indent=4)

    print(f"Data saved to {output_file}")

In [86]:
# Crawl the live sitemap and write all chunks with the function's default
# chunk_size=350 / overlap_size=30.
# NOTE(review): the cells below read '../data/arsonor_chunks_350_30.json',
# not this output file - confirm how that file is produced from this one.
sitemap_url = 'https://arsonor.com/plan-du-site/'
output_file = 'arsonor_chunks_improved.json'

create_json_from_sitemap(sitemap_url, output_file)

  0%|          | 0/58 [00:00<?, ?it/s]

Data saved to arsonor_chunks_improved.json


In [88]:
# Load the chunk records (parsed JSON) into `documents`.
with open('../data/arsonor_chunks_350_30.json', 'r', encoding='utf-8') as file:
    documents = json.load(file)

In [32]:
# print(json.dumps(documents, indent=4, ensure_ascii=False))

### ID uniqueness check

In [89]:
from collections import defaultdict

# Group the records by chunk_id; a bucket holding more than one record means
# two chunks were given the same id.
hashes = defaultdict(list)
for doc in documents:
    doc_id = doc['chunk_id']
    hashes[doc_id].append(doc)

# Fail loudly instead of relying on the reader to compare the two counts below.
duplicate_ids = sorted(chunk_id for chunk_id, docs in hashes.items() if len(docs) > 1)
assert not duplicate_ids, f"Duplicate chunk ids: {duplicate_ids}"

# Number of distinct ids vs. number of records (equal when ids are unique).
len(hashes), len(documents)

(443, 443)

In [91]:
import pandas as pd
# Same file as loaded into `documents` above, read this time as a DataFrame.
json_file_path = '../data/arsonor_chunks_350_30.json'
df = pd.read_json(json_file_path)
# Display every chunk row belonging to one article, selected by its article_id.
df[df['article_id'] == 'f0da0852']

Unnamed: 0,article_id,url,title,category,tags,chunk_id,chunk_text
429,f0da0852,https://arsonor.com/10-logiciels-incontournabl...,10 logiciels incontournables pour le sound design,LE SOUND DESIGN,"plugin, sound design",f0da0852-1,Le design sonore (ou plus communément appelé s...
430,f0da0852,https://arsonor.com/10-logiciels-incontournabl...,10 logiciels incontournables pour le sound design,LE SOUND DESIGN,"plugin, sound design",f0da0852-2,Ces derniers ont d’ailleurs souvent pour vocat...
431,f0da0852,https://arsonor.com/10-logiciels-incontournabl...,10 logiciels incontournables pour le sound design,LE SOUND DESIGN,"plugin, sound design",f0da0852-3,"Car ce qui sépare Live d’une autre DAW, outre ..."
432,f0da0852,https://arsonor.com/10-logiciels-incontournabl...,10 logiciels incontournables pour le sound design,LE SOUND DESIGN,"plugin, sound design",f0da0852-4,"Eric Persing, le boss de Spectrasonics, a su e..."
433,f0da0852,https://arsonor.com/10-logiciels-incontournabl...,10 logiciels incontournables pour le sound design,LE SOUND DESIGN,"plugin, sound design",f0da0852-5,Car même si cela peut sembler destiné à des so...
434,f0da0852,https://arsonor.com/10-logiciels-incontournabl...,10 logiciels incontournables pour le sound design,LE SOUND DESIGN,"plugin, sound design",f0da0852-6,"Et la particularité de Kontakt, c’est que nomb..."
435,f0da0852,https://arsonor.com/10-logiciels-incontournabl...,10 logiciels incontournables pour le sound design,LE SOUND DESIGN,"plugin, sound design",f0da0852-7,"Et si cela t’effraies toujours, tu peux simple..."
436,f0da0852,https://arsonor.com/10-logiciels-incontournabl...,10 logiciels incontournables pour le sound design,LE SOUND DESIGN,"plugin, sound design",f0da0852-8,Tu utilises leurs algorithmes associés dans to...
437,f0da0852,https://arsonor.com/10-logiciels-incontournabl...,10 logiciels incontournables pour le sound design,LE SOUND DESIGN,"plugin, sound design",f0da0852-9,"De plus, son éditeur de son (dans la version c..."
438,f0da0852,https://arsonor.com/10-logiciels-incontournabl...,10 logiciels incontournables pour le sound design,LE SOUND DESIGN,"plugin, sound design",f0da0852-10,"En quelques années, l’ensemble de plugins RX d..."
