# Projet de lois 
Les projets de lois sont déposées par le gouvernement 

URL type des projets de lois 
https://www.assemblee-nationale.fr/dyn/15/textes/l15b4324_projet-loi#

In [572]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B4324.html'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
name = soup.find('style="font-variant:small-caps"', text='au nom de M. Jean CASTEX')
if name:
    next_sibling = name.find_next_sibling()
    if next_sibling:
        name = next_sibling.text
        print(name)
    else:
        print("Next sibling element not found")
else:
    print("Element not found")





Element not found


## Scraping des projets de lois


URL type 

https://www.assemblee-nationale.fr/dyn/15/textes/l15bXXXX_projet-loi#

### 1. Récupération des numéro de projets de lois, date et exposée des motifs

In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
from tqdm import tqdm

class TextInformation:
    def __init__(self, url):
        self.url = url
        response = requests.get(self.url)
        self.soup = BeautifulSoup(response.text, 'html.parser', from_encoding='utf-8')
    
    def get_information(self):
        information = self.soup.find('span', {'style': 'vertical-align:3pt'})

        if information:
            next_sibling = information.next_sibling
            try:
                return next_sibling.strip()
            except AttributeError:
                return "No next sibling found"
        else:
            return "Element not found"

    
    def get_date(self):
        date = self.soup.find('p', {'class': 'assnatenregistr'})

        if date:
            date_text = date.text.strip()
            words = date_text.split(" ")
            day = words[-3]
            month = words[-2]
            year = words[-1]
            return f"{day} {month} {year}"
        else:
            return "Element not found"
    
    def get_texts(self):
        texts = self.soup.find_all('p', {'class': 'assnatLoiTexte'})
        if texts:
            return [text.text.strip() for text in texts]
        else:
            return "Elements not found"
            
def clean_data(input_file, output_file):
    # lecture des données
    data = pd.read_csv(input_file)
    to_analz = data.replace(regex=[r'\\xa0'], value=' ')
    
    # suppression du fichier existant s'il existe
    if os.path.exists(output_file):
        os.remove(output_file)

    # enregistrement des données nettoyées dans un nouveau fichier csv
    to_analz.to_csv(output_file, index=False, encoding='utf-8-sig')

def main(urls):
    start_time = time.time()
    result = []
    with tqdm(total=len(urls), bar_format='{bar}') as pbar:
        for url in urls:
            pbar.update(1)
            print("Processing: ", url)
            text_info = TextInformation(url)
            information = text_info.get_information()
            if information == "Element not found":
                continue
            date = text_info.get_date()
            texts = text_info.get_texts()
            result.append([information, date, texts])

    df = pd.DataFrame(result, columns=['Information', 'Date', 'Texts'])
    df = df.replace(regex=[r'\\xa0'], value=' ')
    
    # suppression du fichier existant s'il existe
    if os.path.exists('raw_data.csv'):
        os.remove('raw_data.csv')

    # enregistrement des données dans un fichier csv
    df.to_csv('raw_data.csv', index=False, encoding='utf-8-sig')
    
    # nettoyage des données et enregistrement dans un nouveau fichier csv
    clean_data('raw_data.csv', 'to_analz.csv')
    
    elapsed_time = time.time() - start_time
    elapsed_time_minutes = round(elapsed_time / 60, 1)
    print("--- %s minutes ---" % (elapsed_time_minutes))



urls = [f"https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B{i:04d}.html" for i in range(0, 5200) if i != 3651]
main(urls)



### 2. Récupération des articles

In [28]:
# Importer les bibliothèques nécessaires
from bs4 import BeautifulSoup
import requests

# Obtenir le contenu HTML de la page web

response = requests.get(url)
html_content = response.text

# Analyser le contenu HTML avec BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Trouver la balise <div class="assnatSection3">
section3 = soup.find('div', {'class': 'assnatSection3'})

# Récupérer tout le texte contenu dans la balise
texte = section3.text

# Afficher le texte récupéré
print(texte)





– 1 –



projet de loi


				Le Premier ministre,
			

				Sur le rapport de la ministre des solidarités et de la santé,
			

				Vu l’article 39 de la Constitution,
			

				Décrète :
			

Le présent projet de loi ratifiant l’ordonnance n° 2017‑484 du 6 avril 2017 relative à la création d’organismes dédiés à l’exercice de l’activité de retraite professionnelle supplémentaire et à l’adaptation des régimes de retraite supplémentaire en unités de rente, délibéré en conseil des ministres après avis du Conseil d’État, sera présenté à l’Assemblée nationale par le ministre de l’économie et des finances, qui sera chargé d’en exposer les motifs et d’en soutenir la discussion.
			




Article 1er


				L’ordonnance n° 2017‑484 du 6 avril 2017 relative à la création d’organismes dédiés à l’exercice de l’activité de retraite professionnelle supplémentaire et à l’adaptation des régimes de retraite supplémentaire en unités de rente est ratifiée.
			

Article 2


Le chapitre III du titre II du li

In [53]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
from tqdm import tqdm


class TextInformation:
    def __init__(self, url):
        self.url = url
        response = requests.get(self.url)
        self.soup = BeautifulSoup(response.text, 'html.parser', from_encoding='utf-8')
    
    def get_information(self):
        """Get the information from the webpage"""
        try:
            soup = self.get_soup()
            information = soup.find("div", class_="titre")
            next_sibling = information.next_sibling
            while next_sibling is not None:
                if next_sibling.name == "h3":
                    break
                next_sibling = next_sibling.next_sibling
            if next_sibling is not None:
                try:
                    return next_sibling.text.strip()
                except AttributeError:
                    pass
            else:
                print("Error: next sibling is None")
        except:
            print(f"Error getting information from {self.url}")



    def get_date(self):
        date = self.soup.find('p', {'class': 'assnatenregistr'})

        if date:
            date_text = date.text.strip()
            words = date_text.split(" ")
            day = words[-3]
            month = words[-2]
            year = words[-1]
            return f"{day} {month} {year}"
        return None

    def get_texts(self):
        texts = self.soup.find_all('p', {'class': 'assnatLoiTexte'})
        if texts:
            return [text.text.strip() for text in texts]
        return None


def clean_data(input_file, output_file):
    # lecture des données
    data = pd.read_csv(input_file)
    to_analz_2 = data.replace(regex=[r'\\xa0'], value=' ')

    # suppression du fichier existant s'il existe
    if os.path.exists(output_file):
        os.remove(output_file)

    # enregistrement des données nettoyées dans un nouveau fichier csv
    to_analz_2.to_csv(output_file, index=False, encoding='utf-8-sig')


def main(urls):
    start_time = time.time()
    result = []
    with tqdm(total=len(urls), bar_format='{bar}') as pbar:
        for url in urls:
            pbar.update(1)
            print("Processing: ", url)
            text_info = TextInformation(url)
            information = text_info.get_information()
            date = text_info.get_date()
            texts = text_info.get_texts()
            if information is not None and date is not None and texts is not None:
                result.append([information, date, texts])
            else:
                continue

    df = pd.DataFrame(result, columns=['Information', 'Date', 'expose_lois'])
    df = df.replace(regex=[r'\\xa0'], value=' ')

    # suppression du fichier existant s'il existe
    if os.path.exists('raw_data_2.csv'):
        os.remove('raw_data_2.csv')

    # enregistrement des données dans un fichier csv
    df.to_csv('raw_data_2.csv', index=False, encoding='utf-8-sig')

    # nettoyage des données et enregistrement dans un nouveau fichier csv
    clean_data('raw_data_2.csv', 'to_analz_2.csv')

    elapsed_time = time.time() - start_time
    elapsed_time_minutes = round(elapsed_time / 60, 1)
    print("--- %s minutes ---" % (elapsed_time_minutes))


urls = [f"https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B{i:04d}.html" for i in range(3735, 3740)]
main(urls)


████      

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B3735.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B3735.html
Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B3736.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B3736.html
Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B3737.html


████████  

Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B3737.html
Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B3738.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B3738.html
Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B3739.html


██████████

Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B3739.html
--- 0.0 minutes ---





In [16]:
import pandas as pd
df = pd.read_csv('/Users/camille/repo/Hetic/projet_demo/ia_theme_French National Assembly/scraping/cleaned_data.csv')
df = df.replace(regex=[r'\"\[\''], value='')
df = df.replace(regex=[r'\"'], value='')

In [56]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm


class TextInformation:
    def __init__(self, url):
        self.url = url
        response = requests.get(self.url)
        self.soup = BeautifulSoup(
            response.text,
            'html.parser',
            from_encoding='utf-8')

    def get_information(self):
        """Get the information from the webpage"""
        try:
            soup = self.soup
            # On a déjà stocké self.soup dans l'initialisation, on peut donc le réutiliser directement
            information = self.soup.find('span', {'style': 'vertical-align:3pt'})

            next_sibling = information.next_sibling
            if next_sibling is not None:
                try:
                    return next_sibling.text.strip()
                except AttributeError:
                    pass
            else:
                print("Error: next sibling is None")
        except BaseException:
            print(f"Error getting information from {self.url}")

    def get_date(self):
        date = self.soup.find('p', {'class': 'assnatenregistr'})

        if date:
            date_text = date.text.strip()
            words = date_text.split(" ")
            day = words[-3]
            month = words[-2]
            year = words[-1]
            return f"{day} {month} {year}"
        return None

    def get_texts(self):
        texts = self.soup.find_all('p', {'class': 'assnatLoiTexte'})
        if texts:
            return [text.text.strip() for text in texts]
        return None
    
    def get_helo(self):
        section3 = self.soup.find('div', {'class': 'assnatSection3'})
        if section3:
            return   [text.text.strip() for text in section3]
        return None
    
    def clean_data(self, input_file, output_file):
        # lecture des données
        data = pd.read_csv(input_file)
        to_analz_2 = data.replace(regex=[r'\"\[\''], value='').replace(regex=[r'\'\]\"'], value='').replace(regex=[r'\\xa0'], value=' ')

        # suppression du fichier existant s'il existe
        if os.path.exists(output_file):
            os.remove(output_file)

        # appliquer la même transformation sur to_analz_2
        to_analz_2 = to_analz_2.replace(regex=[r'\"\[\''], value='').replace(regex=[r'\'\]\"'], value='').replace(regex=[r'\\xa0'], value=' ')

        # enregistrement des données nettoyées dans un nouveau fichier csv
        to_analz_2.to_csv(output_file, index=False, encoding='utf-8-sig')
        
    def main(self):
        start_time = time.time()
        result = []
        for url in tqdm([self.url]):
            print("Processing: ", url)
            text_info = TextInformation(url)
            information = text_info.get_information()
            date = text_info.get_date()
            texts = text_info.get_texts()
            helo = text_info.get_helo()
            if information is not None and date is not None and texts is not None and helo is not None:
                result.append([information, date, texts, helo])
            else:
                continue

        df = pd.DataFrame(result, columns=['Information', 'Date', 'exposee', 'texte_lois'])
        df


100%|██████████| 1/1 [00:00<00:00,  8.80it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJANR5L15B0003.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJANR5L15B0003.html
Processing time: 0.12 seconds





In [57]:
urls = [    'https://www.assemblee-nationale.fr/dyn/opendata/PRJANR5L15B0003.html',    'https://www.assemblee-nationale.fr/dyn/opendata/PRJANR5L15B0004.html',    'https://www.assemblee-nationale.fr/dyn/opendata/PRJANR5L15B0005.html']

text_info = TextInformation(urls[0])
text_info.main(urls)


 33%|███▎      | 1/3 [00:00<00:00,  6.42it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJANR5L15B0003.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJANR5L15B0003.html
Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJANR5L15B0004.html


100%|██████████| 3/3 [00:00<00:00,  5.39it/s]

Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJANR5L15B0004.html
Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJANR5L15B0005.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJANR5L15B0005.html
Processing time: 0.56 seconds





In [52]:
urls =  'https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0003.html'

In [66]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm

class TextInformation:
    def __init__(self, url):
        self.url = url
        response = requests.get(self.url)
        self.soup = BeautifulSoup(
            response.text,
            'html.parser',
            from_encoding='utf-8')

    def get_information(self):
        """Get the information from the webpage"""
        try:
            soup = self.soup
            # On a déjà stocké self.soup dans l'initialisation, on peut donc le réutiliser directement
            information = self.soup.find('span', {'style': 'vertical-align:3pt'})

            next_sibling = information.next_sibling
            if next_sibling is not None:
                try:
                    return next_sibling.text.strip()
                except AttributeError:
                    pass
            else:
                print("Error: next sibling is None")
        except BaseException:
            print(f"Error getting information from {self.url}")

    def get_date(self):
        date = self.soup.find('p', {'class': 'assnatenregistr'})

        if date:
            date_text = date.text.strip()
            words = date_text.split(" ")
            day = words[-3]
            month = words[-2]
            year = words[-1]
            return f"{day} {month} {year}"
        return None

    def get_texts(self):
        texts = self.soup.find_all('p', {'class': 'assnatLoiTexte'})
        if texts:
            return [text.text.strip() for text in texts]
        return None
    
    def get_helo(self):
        section3 = self.soup.find('div', {'class': 'assnatSection3'})
        if section3:
            return   [text.text.strip() for text in section3]
        return None
    
class DataCleaner:
    def __init__(self, input_file, output_file):
        self.input_file = input_file
        self.output_file = output_file
        
    def clean_data(self):
        # lecture des données
        data = pd.read_csv(self.input_file)
        to_analz_2 = data.replace(regex=[r'\"\[\''], value='').replace(regex=[r'\'\]\"'], value='').replace(regex=[r'\\xa0'], value=' ')

        # suppression du fichier existant s'il existe
        if os.path.exists(self.output_file):
            os.remove(self.output_file)

        # appliquer la même transformation sur to_analz_2
        to_analz_2 = to_analz_2.replace(regex=[r'\"\[\''], value='').replace(regex=[r'\'\]\"'], value='').replace(regex=[r'\\xa0'], value=' ')

        # enregistrement des données nettoyées dans un nouveau fichier csv
        to_analz_2.to_csv(self.output_file, index=False, encoding='utf-8-sig')

class WebScraper:
    def __init__(self, urls):
        self.urls = urls
        self.result = []
        
    def scrape(self):
        for url in tqdm(self.urls):
            print("Processing: ", url)
            text_info = TextInformation(url)
            information = text_info.get_information()
            date = text_info.get_date()
            texts = text_info.get_texts()
            helo = text_info.get_helo()
            if information is not None and date is not None and texts is not None and helo is not None:
                df = pd.DataFrame(result, columns=['Information', 'Date', 'exposee', 'texte_lois'])
                df = df.replace(regex=[r'\"\[\''], value='').replace(regex=[r'\'\]\"'], value='').replace(regex=[r'\\xa0'], value=' ')
                header = ['url', 'information', 'date', 'texts', 'helo']
                df = pd.DataFrame(scraper.result, columns=header)
                df.to_csv('result.csv', mode='a', index=False, header=False, encoding='utf-8-sig')


urls = [f"https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B{i:04d}.html" for i in range(
    0, 30)]
scraper = WebScraper(urls)
scraper.scrape()


  3%|▎         | 1/30 [00:00<00:03,  8.18it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0000.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0000.html
Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0001.html


 10%|█         | 3/30 [00:00<00:03,  8.77it/s]

Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0001.html
Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0002.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0002.html
Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0003.html


 10%|█         | 3/30 [00:00<00:04,  6.31it/s]


NameError: name 'result' is not defined

In [64]:

urls = [f"https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B{i:04d}.html" for i in range(
    0, 30)]

In [67]:
df = pd.read_csv('/Users/camille/repo/Hetic/projet_demo/ia_theme_French National Assembly/scraping/cleaned_data.csv')
df

Unnamed: 0,Information,Date,exposee,texte_lois
0,3,29 juin 2017.,['L’article 114 de la loi n° 2016‑1691 du 9 dé...,"['', '– 1 –', '', 'projet de loi', '', 'Le Pre..."
1,4,29 juin 2017.,['Conformément à la volonté du Président de la...,"['', '– 1 –', '', 'projet de loi', '', 'Le Pre..."
2,6,29 juin 2017.,"['', 'Le présent projet de loi ratifiant l’ord...","['', 'projet de loi', '', 'Le Premier ministre..."
3,7,29 juin 2017.,['L’article 1er du projet de loi ratifie l’ord...,"['', '– 1 –', '', 'projet de loi', '', 'Le Pre..."
4,8,29 juin 2017.,['L’article unique du projet de loi procède à ...,"['', 'projet de loi', '', 'Le Premier ministre..."
...,...,...,...,...
266,,2 février 2022.,['L’article 74 de la loi n° 2019‑1428 du 24 dé...,"['', 'projet de loi', '', 'Le Premier ministre..."
267,,23 février 2022.,['L’article 55 de la loi n° 2019‑828 du 6 août...,"['', '– 1 –', '', 'projet de loi', '', 'Le Pre..."
268,,7 mars 2022.,['ratifiant l’ordonnance n° 2021‑1200 du 15 se...,"['', 'projet de loi', '', 'Le Premier ministre..."
269,,9 mars 2022.,['L’article 108 de la loi n° 2020‑1576 du 14 d...,"['', 'projet de loi', '', 'Le Premier ministre..."


In [75]:
df.replace(regex=[r'\[\''], value='').replace(regex=[r'\'\]\"'], value='')

Unnamed: 0,Information,Date,exposee,texte_lois
0,3,29 juin 2017.,L’article 114 de la loi n° 2016‑1691 du 9 déce...,"', '– 1 –', '', 'projet de loi', '', 'Le Premi..."
1,4,29 juin 2017.,Conformément à la volonté du Président de la R...,"', '– 1 –', '', 'projet de loi', '', 'Le Premi..."
2,6,29 juin 2017.,"', 'Le présent projet de loi ratifiant l’ordon...","', 'projet de loi', '', 'Le Premier ministre,'..."
3,7,29 juin 2017.,L’article 1er du projet de loi ratifie l’ordon...,"', '– 1 –', '', 'projet de loi', '', 'Le Premi..."
4,8,29 juin 2017.,L’article unique du projet de loi procède à la...,"', 'projet de loi', '', 'Le Premier ministre,'..."
...,...,...,...,...
266,,2 février 2022.,L’article 74 de la loi n° 2019‑1428 du 24 déce...,"', 'projet de loi', '', 'Le Premier ministre,'..."
267,,23 février 2022.,L’article 55 de la loi n° 2019‑828 du 6 août 2...,"', '– 1 –', '', 'projet de loi', '', 'Le Premi..."
268,,7 mars 2022.,ratifiant l’ordonnance n° 2021‑1200 du 15 sept...,"', 'projet de loi', '', 'Le Premier ministre,'..."
269,,9 mars 2022.,L’article 108 de la loi n° 2020‑1576 du 14 déc...,"', 'projet de loi', '', 'Le Premier ministre,'..."


In [76]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm

class TextInformation:
    def __init__(self, url):
        self.url = url
        response = requests.get(self.url)
        self.soup = BeautifulSoup(
            response.text,
            'html.parser',
            from_encoding='utf-8')

    def get_information(self):
        """Get the information from the webpage"""
        try:
            soup = self.soup
            # On a déjà stocké self.soup dans l'initialisation, on peut donc le réutiliser directement
            information = self.soup.find('span', {'style': 'vertical-align:3pt'})

            next_sibling = information.next_sibling
            if next_sibling is not None:
                try:
                    return next_sibling.text.strip()
                except AttributeError:
                    pass
            else:
                print("Error: next sibling is None")
        except BaseException:
            print(f"Error getting information from {self.url}")

    def get_date(self):
        date = self.soup.find('p', {'class': 'assnatenregistr'})

        if date:
            date_text = date.text.strip()
            words = date_text.split(" ")
            day = words[-3]
            month = words[-2]
            year = words[-1]
            return f"{day} {month} {year}"
        return None

    def get_texts(self):
        texts = self.soup.find_all('p', {'class': 'assnatLoiTexte'})
        if texts:
            return [text.text.strip() for text in texts]
        return None
    
    def get_helo(self):
        section3 = self.soup.find('div', {'class': 'assnatSection3'})
        if section3:
            return   [text.text.strip() for text in section3]
        return None
    
class DataCleaner:
    def __init__(self, input_file, output_file):
        self.input_file = input_file
        self.output_file = output_file
        
    def clean_data(self):
        # lecture des données
        data = pd.read_csv(self.input_file)
        to_analz_2 = data.replace(regex=[r'\"\[\''], value='').replace(regex=[r'\'\]\"'], value='').replace(regex=[r'\\xa0'], value=' ')

        # suppression du fichier existant s'il existe
        if os.path.exists(self.output_file):
            os.remove(self.output_file)

        # appliquer la même transformation sur to_analz_2
        to_analz_2 = to_analz_2.replace(regex=[r'\"\[\''], value='').replace(regex=[r'\'\]\"'], value='').replace(regex=[r'\\xa0'], value=' ')

        # enregistrement des données nettoyées dans un nouveau fichier csv
        to_analz_2.to_csv(self.output_file, index=False, encoding='utf-8-sig')

class WebScraper:
    def __init__(self, urls):
        self.urls = urls
        self.result = []
        
    def scrape(self):
        for url in tqdm(self.urls):
            print("Processing: ", url)
            text_info = TextInformation(url)
            information = text_info.get_information()
            date = text_info.get_date()
            texts = text_info.get_texts()
            helo = text_info.get_helo()
            self.result.append({
                'url': url,
                'information': information,
                'date': date,
                'texts': texts,
                'helo': helo
            })
            time.sleep(0.1) # Pause de 0.1 seconde entre chaque requête pour éviter de surcharger le serveur



In [77]:
# create a WebScraper instance with a list of URLs to scrape
scraper = WebScraper(urls)

# call the scrape() method to scrape the URLs
scraper.scrape()


# create a DataFrame from the scraper result
df = pd.DataFrame(scraper.result)

# save the DataFrame to a CSV file
df.to_csv('scraped_data.csv', index=False)
cleaner = DataCleaner('scraped_data.csv', 'cleaned_data.csv')
cleaner.clean_data()




Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0000.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0000.html


  3%|▎         | 1/30 [00:00<00:07,  3.94it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0001.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0001.html


  7%|▋         | 2/30 [00:00<00:06,  4.21it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0002.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0002.html


 13%|█▎        | 4/30 [00:00<00:05,  4.78it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0003.html
Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0004.html


 17%|█▋        | 5/30 [00:01<00:05,  4.46it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0005.html


 20%|██        | 6/30 [00:01<00:08,  2.75it/s]

Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0005.html
Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0006.html


 23%|██▎       | 7/30 [00:01<00:07,  3.20it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0007.html


 27%|██▋       | 8/30 [00:02<00:06,  3.52it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0008.html


 33%|███▎      | 10/30 [00:02<00:04,  4.39it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0009.html
Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0010.html


 37%|███▋      | 11/30 [00:02<00:04,  4.53it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0011.html


 40%|████      | 12/30 [00:02<00:03,  4.58it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0012.html


 47%|████▋     | 14/30 [00:03<00:03,  4.86it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0013.html
Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0014.html


 50%|█████     | 15/30 [00:03<00:03,  4.52it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0015.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0015.html


 53%|█████▎    | 16/30 [00:03<00:03,  4.47it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0016.html


 57%|█████▋    | 17/30 [00:04<00:02,  4.55it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0017.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0017.html


 60%|██████    | 18/30 [00:04<00:02,  4.48it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0018.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0018.html


 63%|██████▎   | 19/30 [00:04<00:02,  4.44it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0019.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0019.html


 67%|██████▋   | 20/30 [00:04<00:02,  4.41it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0020.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0020.html


 70%|███████   | 21/30 [00:04<00:02,  4.38it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0021.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0021.html


 73%|███████▎  | 22/30 [00:05<00:01,  4.18it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0022.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0022.html


 77%|███████▋  | 23/30 [00:05<00:01,  4.28it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0023.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0023.html


 80%|████████  | 24/30 [00:05<00:01,  4.33it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0024.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0024.html


 83%|████████▎ | 25/30 [00:05<00:01,  4.40it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0025.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0025.html


 87%|████████▋ | 26/30 [00:06<00:00,  4.43it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0026.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0026.html


 90%|█████████ | 27/30 [00:06<00:00,  4.41it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0027.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0027.html


 93%|█████████▎| 28/30 [00:06<00:00,  4.41it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0028.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0028.html


 97%|█████████▋| 29/30 [00:06<00:00,  4.42it/s]

Processing:  https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0029.html
Error getting information from https://www.assemblee-nationale.fr/dyn/opendata/PRJLANR5L15B0029.html


100%|██████████| 30/30 [00:07<00:00,  4.24it/s]


In [1]:
cleaner = DataCleaner('scraped_data.csv', 'cleaned_data.csv')
cleaner.clean_data()

NameError: name 'DataCleaner' is not defined

In [4]:
# create a DataFrame from the scraper result
import pandas as pd

df = pd.DataFrame(scraper.result)

# clean the data and save to a new CSV file
cleaner = DataCleaner('scraped_data.csv', 'cleaned_data.csv')
cleaner.clean_data()


ModuleNotFoundError: No module named 'DataCleaner'

In [66]:
import scraper

ModuleNotFoundError: No module named 'scraper'

In [6]:
import scraper


# Define a list of URLs to scrape
my_urls = ['https://example.com/page1', 'https://example.com/page2', 'https://example.com/page3']

# Create a new instance of WebScraper and scrape the data
my_scraper = WebScraper(my_urls)
my_scraper.scrape()

# Access the scraped data
for data in my_scraper.result:
    print(data['url'])
    print(data['information'])
    print(data['date'])
    print(data['texts'])
    print(data['helo'])


ModuleNotFoundError: No module named 'scraper'