In [17]:
import requests
from bs4 import BeautifulSoup
import re
import json
from tqdm.auto import tqdm
import hashlib

## Extract each article's URL, title and category from the sitemap page

In [12]:
def extract_articles_from_sitemap(sitemap_url, timeout=30):
    """Collect the articles listed on an HTML sitemap page.

    The page is expected to contain ``<ul class="wsp-posts-list">`` lists, each
    preceded by a ``<strong class="wsp-category-title">`` element whose link
    text is used as the category name. Every ``<li class="wsp-post">`` inside
    such a list contributes one article.

    Args:
        sitemap_url: URL of the sitemap page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        A list of dicts with the keys ``'category'``, ``'title'`` and ``'url'``
        (possibly empty), or ``None`` when the page could not be retrieved.
    """
    try:
        response = requests.get(sitemap_url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures follow the same "report and return None"
        # path as a non-200 answer instead of raising to the caller.
        print(f"Failed to retrieve the sitemap. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the sitemap. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # List to store the extracted articles
    articles = []

    # Each post list belongs to the closest category title that precedes it
    for category_section in soup.find_all('ul', class_='wsp-posts-list'):
        category_title_tag = category_section.find_previous('strong', class_='wsp-category-title')
        category_link = category_title_tag.find('a') if category_title_tag else None

        # Lists without a linked category title are ignored
        if not category_link:
            continue
        category_name = category_link.get_text(strip=True)

        # Now find the articles under this category
        for article_item in category_section.find_all('li', class_='wsp-post'):
            article_link = article_item.find('a')

            # Skip entries without a usable link; indexing ['href'] on an
            # anchor that has no href attribute would raise KeyError.
            if not article_link or not article_link.get('href'):
                continue

            articles.append({
                'category': category_name,
                'title': article_link.get_text(strip=True),
                'url': article_link['href']
            })

    return articles


## Fetches and cleans the content of an article from the given URL

In [16]:
def extract_article_info(article_url, timeout=30):
    """Download an article page and return its cleaned text and its tags.

    The text is taken from ``<div class="entry-content">`` after removing the
    ``ez-toc-container`` div, every div whose class contains ``heateor`` and
    every div whose class contains ``elementor-column elementor-col-33``.
    Tags are read from the ``rel="tag"`` links of the second
    ``<span class="entry-meta">`` inside ``<footer class="entry-footer">``.

    Args:
        article_url: URL of the article page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        A dict ``{'text': str, 'tags': str}`` where ``tags`` is a
        comma-separated string (empty when no tags are found), or ``None``
        when the page could not be retrieved or has no content div.
    """
    try:
        response = requests.get(article_url, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable article is reported like any other failure so
        # that a caller looping over many URLs can skip it and carry on.
        print(f"Failed to retrieve the article. Error: {exc}")
        return None

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve the article. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the main content div
    content_div = soup.find('div', class_='entry-content')
    # If the content div is not found, return None
    if not content_div:
        print("No content found with the class 'entry-content'.")
        return None

    # Remove <div id="ez-toc-container"...>
    toc_container = content_div.find('div', id='ez-toc-container')
    if toc_container:
        toc_container.decompose()

    # Remove all <div> elements where the class contains "heateor"
    for heateor_div in content_div.find_all('div', class_=lambda x: x and 'heateor' in x):
        heateor_div.decompose()

    # Remove the <div class="elementor-column elementor-col-33..."> columns
    for container_div in content_div.find_all('div', class_=lambda x: x and 'elementor-column elementor-col-33' in x):
        container_div.decompose()

    # Extract the text and normalise whitespace. Tabs are deleted outright
    # (not turned into spaces), exactly as before; every other whitespace run,
    # including newlines and non-breaking spaces, collapses to one space.
    raw_text = content_div.get_text(separator="\n").replace('\t', '')
    cleaned_content = re.sub(r'\s+', ' ', raw_text).strip()

    # Find the footer where the tags are located
    footer = soup.find('footer', class_='entry-footer')
    tags = ''
    if footer:
        # The tags are expected in the second 'entry-meta' span of the footer
        tags_span = footer.find_all('span', class_='entry-meta')
        if len(tags_span) > 1:
            tags_links = tags_span[1].find_all('a', rel='tag')
            tags = ', '.join(tag.get_text() for tag in tags_links)

    # Return article info as a dictionary
    return {
        'text': cleaned_content,
        'tags': tags
    }

In [11]:
# Spot-check on a single article: download it and display only the extracted tags.
extract_article_info("https://arsonor.com/comment-eduquer-loreille-a-lart-du-mixage-part-3/")['tags']

'écoute critique, espace sonore, mono, pan, profondeur, psycho-acoustique, reverb, stéréo'

## Split text into overlapping chunks

In [35]:
def chunk_text(text, chunk_size=240, overlap_size=20):
    """Split a text into word-based chunks that overlap with their neighbours.

    The text is split on whitespace. Each chunk holds up to ``chunk_size``
    words and starts ``chunk_size - overlap_size`` words after the previous
    one, so consecutive chunks share ``overlap_size`` words. The last chunk
    may be shorter.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap_size: Number of words shared by consecutive chunks; must be
            non-negative and strictly smaller than ``chunk_size``.

    Returns:
        A list of chunks, each a string of space-joined words. Empty or
        whitespace-only text gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap_size`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # With overlap_size >= chunk_size the window would never move forward and
    # the loop would not terminate; a negative overlap would skip words.
    if not 0 <= overlap_size < chunk_size:
        raise ValueError("overlap_size must satisfy 0 <= overlap_size < chunk_size")

    words = text.split()
    text_length = len(words)
    step = chunk_size - overlap_size

    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))

        # Once a chunk reaches the last word, a further chunk would only
        # repeat the overlap, so stop here.
        if end >= text_length:
            break

    return chunks

## Create the JSON file, adding a unique ID for each article and each chunk

In [39]:
def create_json_from_sitemap(sitemap_url, output_file, chunk_size=240, overlap_size=20):
    """Crawl the sitemap, chunk every article and dump all chunks to a JSON file.

    For each article returned by ``extract_articles_from_sitemap`` the page is
    fetched with ``extract_article_info`` and its text is split with
    ``chunk_text``. An article id is derived from the first 8 hex digits of the
    MD5 of ``"<title>-<first 10 characters of the text>"``; chunk ids are
    ``"<article_id>-<n>"`` with ``n`` starting at 1.

    Args:
        sitemap_url: URL of the sitemap page.
        output_file: Path of the JSON file to write (UTF-8, indented).
        chunk_size: Forwarded to ``chunk_text``.
        overlap_size: Forwarded to ``chunk_text``.

    Returns:
        None. Nothing is written when the sitemap yields no articles.
    """
    sitemap_entries = extract_articles_from_sitemap(sitemap_url)
    if not sitemap_entries:
        print("No articles found.")
        return

    records = []

    for entry in tqdm(sitemap_entries):
        info = extract_article_info(entry['url'])

        # Articles whose page could not be parsed are reported and skipped
        if not info:
            print(f"Skipping article: {entry['title']} due to missing content.")
            continue

        body = info['text']

        # Short article id built from the title and the start of the text
        seed = f"{entry['title']}-{body[:10]}"
        article_id = hashlib.md5(seed.encode()).hexdigest()[:8]

        pieces = chunk_text(body, chunk_size=chunk_size, overlap_size=overlap_size)

        # One record per chunk, numbered from 1 within the article
        records.extend(
            {
                'article_id': article_id,
                'title': entry['title'],
                'category': entry['category'],
                'tags': info['tags'],
                'chunk_id': f"{article_id}-{position}",
                'chunk_text': piece
            }
            for position, piece in enumerate(pieces, start=1)
        )

    # Save the result to a JSON file
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(records, json_file, ensure_ascii=False, indent=4)

    print(f"Data saved to {output_file}")

In [40]:
# Sitemap page to crawl.
sitemap_url = 'https://arsonor.com/plan-du-site/'
# NOTE(review): this is a bare filename, whereas the cells below read
# '../data/arsonor_chunks_id.json' — confirm how the file ends up there.
output_file = 'arsonor_chunks_id.json'

# Build the chunk file using the default chunk_size / overlap_size.
create_json_from_sitemap(sitemap_url, output_file)

  0%|          | 0/58 [00:00<?, ?it/s]

Data saved to arsonor_chunks_id.json


In [41]:
# Load the chunk records back from disk for inspection.
with open('../data/arsonor_chunks_id.json', 'r', encoding='utf-8') as file:
    documents = json.load(file)

In [32]:
# print(json.dumps(documents, indent=4, ensure_ascii=False))

### Checking ID uniqueness

In [42]:
from collections import defaultdict

# Group the chunk records by chunk_id: if every id is unique, the number of
# groups equals the number of records.
hashes = defaultdict(list)
for doc in documents:
    doc_id = doc['chunk_id']
    hashes[doc_id].append(doc)
# Displayed as (number of distinct chunk ids, number of records).
len(hashes), len(documents)

(589, 589)

In [54]:
import pandas as pd
# Load the same JSON file into a DataFrame and display the rows of one article.
json_file_path = '../data/arsonor_chunks_id.json'
df = pd.read_json(json_file_path)
df[df['article_id'] == '4615db39']

Unnamed: 0,article_id,title,category,tags,chunk_id,chunk_text
0,4615db39,L’intelligence artificielle (IA) dans le studi...,LA POST-PROD,"collaboration IA, intelligence artificielle, p...",4615db39-1,Suite et fin du tour d’horizon des logiciels e...
1,4615db39,L’intelligence artificielle (IA) dans le studi...,LA POST-PROD,"collaboration IA, intelligence artificielle, p...",4615db39-2,"ce cas, les prochaines évolutions IA sont de v..."
2,4615db39,L’intelligence artificielle (IA) dans le studi...,LA POST-PROD,"collaboration IA, intelligence artificielle, p...",4615db39-3,ans! Il n’y a pas d’autre choix que d’ embrass...
3,4615db39,L’intelligence artificielle (IA) dans le studi...,LA POST-PROD,"collaboration IA, intelligence artificielle, p...",4615db39-4,outils utilisant l’intelligence artificielle s...
4,4615db39,L’intelligence artificielle (IA) dans le studi...,LA POST-PROD,"collaboration IA, intelligence artificielle, p...",4615db39-5,les outils IA de traitement du langage naturel...
