In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

In [2]:
DRIVER_PATH = './chromedriver' 
options = Options()
options.add_argument('--headless')

In [3]:
def find_attributes_from_article(url):
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    driver.get(url)
    # Click on the 'Annuler' button that discards the tutorial
    driver.find_element_by_xpath('//button[contains(text(), "Annuler")]').click()
    # Wait for the text to be fully loaded
    time.sleep(2)
    # Html of the full page
    html = driver.page_source
    page_soup = BeautifulSoup(html, 'html.parser')
    header = page_soup.find_all("title")[0].text.replace("— e-newspaperarchives.ch", "")
    date = ' '.join(header.split()[-3:])
    journal = ' '.join(header.split()[:-3])
    # Find div where the OCRed text is
    article = page_soup.find_all("div", {"id": "documentdisplayleftpanesectiontextcontainer"})
    article_soup = BeautifulSoup(str(article[0]), 'html.parser')
    # Get the text
    t = article_soup.find_all("p")
    # Remove html tags
    text = ""
    for p in t:
        p = str(p).replace('<p>', ' ').replace('</p>', ' ').replace('<b class="highlightcolor">', ' ').replace('</b>', ' ')
        text += p
    return text, journal, date

In [4]:
def scrap_20_article(url_search):
    
    page = requests.get(url_search)
    soup = BeautifulSoup(page.content, 'html.parser')
    raw_list = soup.find_all("div", {"class": "vlistentrymaincell"})
    raw_list_soup = BeautifulSoup(str(raw_list), 'html.parser')
    links = raw_list_soup.find_all('a', href=True)

    rows_list = []
    for l in links:
            dict1 = {}
            title = l.text.replace("[ARTICLE]", "").replace('[ARTICLE+ILLUSTRATION]', '')
            url = "https://www.e-newspaperarchives.ch" + l['href']
            text, journal, date = find_attributes_from_article(url)
            dict1.update({'Article Title': title, 'Journal': journal, 'Date': date, 'Url': url, 'Text': text}) 
            rows_list.append(dict1)

    article_df = pd.DataFrame(rows_list)     
    return article_df

In [5]:
def find_all_articles():
    complete_df = pd.DataFrame()
    for i in tqdm(range(1, 143)): 
        url_search = "https://www.e-newspaperarchives.ch/?a=q&r="+ str(20*i + 1) +"&results=1&e=-------fr-20--41--img-txIN-écologie-ARTICLE------0-----"
        twenty_article_df = scrap_20_article(url_search)
        complete_df = complete_df.append(twenty_article_df)
    return complete_df

In [6]:
complete_df = find_all_articles()

100%|██████████| 142/142 [4:57:00<00:00, 125.50s/it]  


In [7]:
complete_df.to_csv('articles.csv')

In [8]:
len(complete_df.index)

2810

In [14]:
complete_df.sample(10)

Unnamed: 0,Article Title,Journal,Date,Url,Text
13,Agriculture et environnement à la fois s...,La Gazette,14 février 1985,https://www.e-newspaperarchives.ch/?a=d&d=GDM1...,Agriculture et environnement à la fois source...
12,"COMMENT REAGIR? : LA BERCE, L'AMBROISIE ...",Le Nouvelliste,11 juin 2005,https://www.e-newspaperarchives.ch/?a=d&d=NVE2...,"COMMENT REAGIR ? : LA BERCE , LAMBROISIE ET L..."
9,"Trochulus piccardi, un nouvel escargot d...",La Liberté,18 avril 2006,https://www.e-newspaperarchives.ch/?a=d&d=LLE2...,"Trochulus piccardi , un nouvel escargot décou..."
13,«Quelle place la société fait-elle aux j...,La Liberté,7 mars 1983,https://www.e-newspaperarchives.ch/?a=d&d=LLE1...,« Quelle place la société fait-elle aux jeune...
14,unis,Le Peuple valaisan,6 mai 2005,https://www.e-newspaperarchives.ch/?a=d&d=PEV2...,"unis verts « L écologie , ca ne coûte pas..."
15,L'agriculture et l'élevage suisse poussé...,Le Confédéré,15 février 2008,https://www.e-newspaperarchives.ch/?a=d&d=LCE2...,Lagriculture et lélevage suisse poussé la Sui...
15,ENTRETIEN Il y a huit heures de barre de...,La Liberté,10 mai 1997,https://www.e-newspaperarchives.ch/?a=d&d=LLE1...,ENTRETIEN Il y a huit heures de barre derrièr...
7,Un projet signe Monthey et Ciba-Geigy,Le Nouvelliste,29 juin 1988,https://www.e-newspaperarchives.ch/?a=d&d=NVE1...,Un projet signe Monthey et Ciba-Geigy Motion...
16,A l'image ¦ i aes récentes,Le Nouvelliste,17 mars 2001,https://www.e-newspaperarchives.ch/?a=d&d=NVE2...,A limage ¦ i aes récentes f 1 &lt; * Z ^ y &...
14,VOS LETTRES L'écologie et ses aberration...,La Liberté,4 août 1997,https://www.e-newspaperarchives.ch/?a=d&d=LLE1...,VOS LETTRES Lécologie et ses aberrations Ce ...
