In [29]:
import requests
from bs4 import BeautifulSoup
import re
import json
from tqdm.auto import tqdm

In [38]:
# Fetches and cleans the content of an article from the given URL
def extract_article_info(article_url, timeout=10):
    """Download one article page and return its cleaned text and its tags.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. The
    article body is the ``<div class="entry-content">`` element; the table of
    contents, the "heateor" divs and the "elementor-col-33" column are removed
    from it before its text is extracted.

    Args:
        article_url: URL of the article page to fetch.
        timeout: Seconds ``requests.get`` may wait on the server before
            giving up. Defaults to 10.

    Returns:
        A dict ``{'text': str, 'tags': list[str]}``, or ``None`` (after
        printing a message) when the request raises, the status code is not
        200, or no ``entry-content`` div exists.
    """
    try:
        # Without a timeout a single stalled server would block the caller forever.
        response = requests.get(article_url, timeout=timeout)
    except requests.RequestException as error:
        # Report network failures the same way as a bad status code: print and
        # return None, so a crawl over many URLs is not aborted by one of them.
        print(f"Failed to retrieve the article. Error: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the article. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # The article body lives in the 'entry-content' div.
    content_div = soup.find('div', class_='entry-content')
    if content_div is None:
        print("No content found with the class 'entry-content'.")
        return None

    # Drop the <div id="ez-toc-container"> element if present.
    toc_container = content_div.find('div', id='ez-toc-container')
    if toc_container is not None:
        toc_container.decompose()

    # Drop every div whose class value contains "heateor".
    for heateor_div in content_div.find_all('div', class_=lambda x: x and 'heateor' in x):
        heateor_div.decompose()

    # Drop the "elementor-column elementor-col-33" column.
    # NOTE(review): the needle contains a space, so this presumably relies on
    # BeautifulSoup also testing the full space-joined class string -- confirm.
    for container_div in content_div.find_all('div', class_=lambda x: x and 'elementor-column elementor-col-33' in x):
        container_div.decompose()

    # Collapse every run of whitespace (newlines, tabs, non-breaking spaces...)
    # into a single space and trim both ends. Tabs are treated as separators
    # rather than deleted, so words split only by a tab are not glued together.
    cleaned_content = ' '.join(content_div.get_text(separator="\n").split())

    # Tags are read from the second 'entry-meta' span of the footer.
    tags = []
    footer = soup.find('footer', class_='entry-footer')
    if footer is not None:
        meta_spans = footer.find_all('span', class_='entry-meta')
        if len(meta_spans) > 1:
            tags = [tag.get_text() for tag in meta_spans[1].find_all('a', rel='tag')]

    return {
        'text': cleaned_content,
        'tags': tags
    }

In [40]:
# Spot-check the scraper on a single article and display only its tags.
# extract_article_info returns None on failure, in which case this subscript raises TypeError.
extract_article_info("https://arsonor.com/comment-eduquer-loreille-a-lart-du-mixage-part-3/")['tags']

['écoute critique',
 'espace sonore',
 'mono',
 'pan',
 'profondeur',
 'psycho-acoustique',
 'reverb',
 'stéréo']

In [11]:
def extract_articles_from_sitemap(sitemap_url, timeout=10):
    """List the articles referenced by the site's HTML sitemap page.

    Every ``<ul class="wsp-posts-list">`` is paired with the closest preceding
    ``<strong class="wsp-category-title">``; the text of the link inside that
    title is used as the category of each ``<li class="wsp-post">`` in the list.

    Args:
        sitemap_url: URL of the sitemap page to fetch.
        timeout: Seconds ``requests.get`` may wait on the server before
            giving up. Defaults to 10.

    Returns:
        A list of ``{'category': str, 'title': str, 'url': str}`` dicts in
        page order, or ``None`` (after printing a message) when the request
        raises or the status code is not 200.
    """
    try:
        # Without a timeout a stalled server would block the notebook forever.
        response = requests.get(sitemap_url, timeout=timeout)
    except requests.RequestException as error:
        # Report network failures the same way as a bad status code.
        print(f"Failed to retrieve the sitemap. Error: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the sitemap. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    articles = []
    for category_section in soup.find_all('ul', class_='wsp-posts-list'):
        # The category heading is the nearest title element before the list.
        category_title_tag = category_section.find_previous('strong', class_='wsp-category-title')
        category_link = category_title_tag.find('a') if category_title_tag is not None else None
        if category_link is None:
            # No usable category heading: skip the whole list.
            continue
        category_name = category_link.get_text(strip=True)

        for article_item in category_section.find_all('li', class_='wsp-post'):
            article_link = article_item.find('a')
            # .get() instead of ['href']: an anchor without (or with an empty)
            # href is skipped instead of raising KeyError mid-crawl.
            article_url = article_link.get('href') if article_link is not None else None
            if not article_url:
                continue

            articles.append({
                'category': category_name,
                'title': article_link.get_text(strip=True),
                'url': article_url
            })

    return articles


In [41]:
def create_json_from_sitemap(sitemap_url, output_file):
    """Scrape every article listed on the sitemap and dump the result as JSON.

    Each sitemap entry is fetched with ``extract_article_info``; entries for
    which it returns nothing are reported and left out. The output is a JSON
    array of ``{'title', 'category', 'text', 'tags'}`` objects written in
    UTF-8 with non-ASCII characters kept as-is.

    Args:
        sitemap_url: URL passed to ``extract_articles_from_sitemap``.
        output_file: Path of the JSON file to (over)write.

    Returns:
        None. Nothing is written when the sitemap yields no articles.
    """
    sitemap_entries = extract_articles_from_sitemap(sitemap_url)
    if not sitemap_entries:
        print("No articles found.")
        return

    records = []
    for entry in tqdm(sitemap_entries):
        scraped = extract_article_info(entry['url'])
        if not scraped:
            print(f"Skipping article: {entry['title']} due to missing content.")
            continue

        # Title and category come from the sitemap, text and tags from the page.
        records.append({
            'title': entry['title'],
            'category': entry['category'],
            'text': scraped['text'],
            'tags': scraped['tags'],
        })

    # ensure_ascii=False keeps accented characters readable in the file.
    with open(output_file, 'w', encoding='utf-8') as out_handle:
        json.dump(records, out_handle, ensure_ascii=False, indent=4)

    print(f"Data saved to {output_file}")

In [42]:
# Crawl configuration: the sitemap page to read and the JSON file to write.
sitemap_url = 'https://arsonor.com/plan-du-site/'
output_file = 'arsonor_data.json'

# Fetches every article listed on the sitemap (one HTTP request per article), so this cell is slow.
create_json_from_sitemap(sitemap_url, output_file)

100%|██████████| 58/58 [01:39<00:00,  1.72s/it]


Data saved to arsonor_data.json


In [43]:
# Reload the scraped articles from the JSON file written by create_json_from_sitemap.
with open('arsonor_data.json', 'r', encoding='utf-8') as file:
    documents = json.load(file)

In [83]:
# Display the first record to check the fields stored for each article.
documents[0]
# Uncomment to pretty-print the whole corpus instead (very large output):
# print(json.dumps(documents, indent=4, ensure_ascii=False))

{'title': 'L’intelligence artificielle (IA) dans le studio de production audio (6/6)',
 'category': 'LA POST-PROD',
 'text': "Suite et fin du tour d’horizon des logiciels et plug-ins IA dans la post-production audio et leur impact sur l’activité de ce secteur. Quel est l’impact de l’IA à venir dans la post-production audio et musicale? Les nouveaux outils et plug-ins audio sur le marché montrent clairement cette tendance: des interfaces épurées pour une utilisation simplifiée à l’extrême, et en même temps une capacité à résoudre efficacement des « problèmes » audio complexes. Ainsi, tout un chacun est de plus en plus capable d’obtenir un résultat audio professionnel sans avoir forcément fait d’études en ingénierie du son ou même sans une énorme expérience. En fait, cette tendance ne date pas d’hier. L’ère du home-studio , du « Do It Yourself » et des outils audio pro démocratisés au plus grand nombre d’amateurs en herbe a fait son chemin. La technologie IA ne fait qu’accélérer le proce