In [17]:
import glob
import json
import os
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
import requests
from tqdm import tqdm

from web_scraping_utils import download_save_html_data, from_url_to_filename
from web_scraping_utils import document_string_from_source, from_filename_to_url
from setup_data import WEBSITES_DIR as DATA_FOLDER
from setup_data import DOCUMENTS_DIR as DOCUMENTS_FOLDER
from setup_data import check_directory

ModuleNotFoundError: No module named 'setup_data'

## Scrape HTML data from a URL and save it
Here we collect every link (`<a>` tag) found on "https://www.total.com/" and download the HTML content of each linked page.

You should adapt this part to the structure of the website you want to scrape.

In [13]:
# Entry page of the website; the scraping cell fetches it and follows its links.
url = "https://www.total.com/"

# DATA_FOLDER and DOCUMENTS_FOLDER are imported from setup_data.
# To work with local folders instead, uncomment and adapt:
# DATA_FOLDER = "html_data_dir/"
# DOCUMENTS_FOLDER = "documents/"

# NOTE(review): check_directory presumably creates the folder when it is
# missing -- confirm in setup_data.
for _folder in (DATA_FOLDER, DOCUMENTS_FOLDER):
    check_directory(_folder)

In [14]:
# Scrape only when nothing has been downloaded yet, so that re-running the
# notebook does not hit the website again.
data_folder_is_empty = len(glob.glob(os.path.join(DATA_FOLDER, "*"))) == 0
if data_folder_is_empty:
    # A timeout avoids hanging forever; raise_for_status avoids silently
    # parsing an HTTP error page as if it were the home page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    all_a_tags = soup.find_all("a")

    # --- Website-specific settings: change them for another website. ---
    site_root = "https://total.com"  # base used to resolve relative links
    allowed_domain = "total.com"     # this domain and its sub-domains are kept
    seen_links = set()               # absolute URLs already downloaded

    for a_tag in tqdm(all_a_tags):
        href = a_tag.get("href", None)
        if href is None:
            continue
        href = href.strip()
        # Empty links and in-page anchors do not point to another document.
        if not href or href.startswith("#"):
            continue

        # urljoin keeps absolute URLs as they are and correctly resolves
        # relative ("/a/b", "a/b") and protocol-relative ("//host/a") ones.
        link = urljoin(site_root, href)
        parsed_link = urlparse(link)
        if parsed_link.scheme not in ("http", "https"):
            continue  # mailto:, tel:, javascript:, ...

        # Compare the real host name rather than searching the whole URL, so
        # that third-party links merely mentioning the domain are rejected.
        host = parsed_link.hostname or ""
        if host != allowed_domain and not host.endswith("." + allowed_domain):
            continue

        # The same link usually appears several times on a page (menu, footer).
        if link in seen_links:
            continue
        seen_links.add(link)

        filename = os.path.join(DATA_FOLDER, from_url_to_filename(link))
        download_save_html_data(link, filename)

## Process the downloaded HTML data and save it as JSON files
Each JSON file is a document that our ODQA framework can process. It has three key-value pairs: "title", "text" and "url".

In [16]:
NB_SENTENCES_PER_JSON_DOCUMENT = 5  # Number of sentences per json document.

# Turn every downloaded file into several small json documents, each holding
# at most NB_SENTENCES_PER_JSON_DOCUMENT sentences of the source text.
for filepath in tqdm(glob.glob(os.path.join(DATA_FOLDER, "*"))):
    json_file_path = os.path.join(DOCUMENTS_FOLDER, os.path.basename(filepath))

    # The first chunk is always written with suffix "_0.json", so its presence
    # means this source file was already processed.
    if os.path.isfile(json_file_path + "_0.json"):
        print("{} file(s) already exist(s)".format(json_file_path))
        continue

    try:
        doc_text = document_string_from_source(filepath)
        doc_title = doc_text[:20]  # first 20 characters, shared by all chunks
        doc_url = from_filename_to_url(os.path.basename(filepath))
        doc_text_sentences = doc_text.split(". ")
        nb_sentences = len(doc_text_sentences)

        # i is the index of the first sentence of the chunk; it is also used as
        # the file suffix (_0, _5, _10, ... with the default chunk size).
        for i in range(0, nb_sentences, NB_SENTENCES_PER_JSON_DOCUMENT):
            chunk = doc_text_sentences[i:i + NB_SENTENCES_PER_JSON_DOCUMENT]
            chunk_text = ". ".join(chunk)
            # split(". ") removed the separator after the last sentence of
            # every chunk but the final one: put its period back.
            if i + NB_SENTENCES_PER_JSON_DOCUMENT < nb_sentences:
                chunk_text += "."

            # A fresh dict per chunk: no state is shared between documents.
            doc_json = {"title": doc_title, "text": chunk_text, "url": doc_url}
            sub_doc_json_file_path = json_file_path + "_{}.json".format(i)
            with open(sub_doc_json_file_path, "w", encoding="utf-8") as json_file:
                json.dump(doc_json, json_file)
    except UnicodeDecodeError as e:
        # Raised for sources that are not valid text (e.g. a downloaded PDF);
        # report the file and carry on with the next one.
        print("Error : ", e, json_file_path)

  1%|          | 1/136 [00:00<00:21,  6.17it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_latest-news
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_


  3%|▎         | 4/136 [00:00<00:12, 10.82it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_check-out-our-official-channels-social-media
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_protecting-people_industrial-safety_culture
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_environmental-issues-challenges_environment-protection_waste


  4%|▍         | 6/136 [00:00<00:10, 12.28it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_shares-and-dividends_dividends


  7%|▋         | 10/136 [00:00<00:10, 12.52it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_results-investor-presentations_main-indicators
Reading doc from disk  ../data/ext_data/websites/total_com/www_sustainable-performance_total_com_en
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_environmental-issues-challenges_environment-protection_environmental-engineering
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_customers_consumers_fuels-and-lubricants


  9%|▉         | 12/136 [00:00<00:08, 13.82it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_protecting-people_industrial-safety
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_news_communiques-presse_go-ahead-for-the-northern-lights-project-in-norway
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_infographics_total-100000-people-more-130-countries


 10%|█         | 14/136 [00:01<00:09, 12.39it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/www_acs_total_com_en
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_energy-expertise_ship-market_products-services


 13%|█▎        | 18/136 [00:01<00:09, 11.95it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_strength
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_protecting-people_health_workplace
Reading doc from disk  ../data/ext_data/websites/total_com/www_solar_total_com_en
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_customers_consumers


 16%|█▌        | 22/136 [00:01<00:09, 12.23it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_energy-expertise
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_climate-change_low-carbon-electricity
Reading doc from disk  ../data/ext_data/websites/total_com/www_careers_total_com_en_our-commitments_recruitment-your-application-induction


 19%|█▉        | 26/136 [00:02<00:07, 14.13it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_individual-shareholders_buy-shares
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_individual-shareholders_shareholder-publications
Reading doc from disk  ../data/ext_data/websites/total_com/www_careers_total_com_fr
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_individual-shareholders


 22%|██▏       | 30/136 [00:02<00:07, 14.47it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_results-investor-presentations
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_fr
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_shared-development_local-economic-development
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_questions-contacts
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_publications-and-regulated-information_reports-and-publications


 24%|██▎       | 32/136 [00:02<00:09, 10.64it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_energy-expertise_transformation-development_specialty-chemicals_hutchinson-elastomers
Reading doc from disk  ../data/ext_data/websites/total_com/www_total_com_getting-net-zero


 26%|██▋       | 36/136 [00:03<00:09, 10.72it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_individual-shareholders_shareholders-club
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_customers_businesses_solar-energy
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_energy-expertise_exploration-production_committed-future-bioenergies


 28%|██▊       | 38/136 [00:03<00:08, 11.40it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_video_total-carbon-neutrality-businesses
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_projects
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_protecting-people_health_local-communities


 31%|███       | 42/136 [00:03<00:07, 11.80it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_news_electric-mobility-in-paris-total-wins
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_individual-shareholders_dedicated-team
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_individual-shareholders_employee-shareholders


 32%|███▏      | 44/136 [00:03<00:08, 10.29it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_en_privacy
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_environmental-issues-challenges_environment-protection_anti-pollution-measures


 34%|███▍      | 46/136 [00:03<00:08, 10.02it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_contact-form
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_identity
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group


 35%|███▌      | 48/136 [00:04<00:08, 10.12it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_news
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_protecting-people_industrial-safety_risk-management


 38%|███▊      | 52/136 [00:04<00:08, 10.08it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_colonne-3_info_calendar
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_news_communiques-presse_reduction-of-methane-emissions-with-OGMP-2
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_legal


 40%|███▉      | 54/136 [00:04<00:08,  9.48it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_environmental-issues-challenges_environment-protection_protecting-biodiversity
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_energy-expertise_exploration-production_oil-gas


 41%|████      | 56/136 [00:05<00:09,  8.06it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_customers_businesses_fuels
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_shared-development_social-engineering


 43%|████▎     | 58/136 [00:05<00:09,  7.99it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_results-investor-presentations_results
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_news_ccus-total-and-its-partners-release-next-gen-co2-storage-simulator


 44%|████▍     | 60/136 [00:05<00:09,  8.23it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_en_projets
Reading doc from disk  ../data/ext_data/websites/total_com/www_careers_total_com_en


 46%|████▌     | 62/136 [00:05<00:09,  7.92it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_identity_governance_biographies
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_climate-change


 47%|████▋     | 64/136 [00:06<00:09,  7.67it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_why-invest-in-total
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_results-investor-presentations_investor-presentations


 49%|████▉     | 67/136 [00:06<00:07,  9.54it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_environmental-issues-challenges_environment-protection_water
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_identity_history
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_energy-expertise_exploration-production_renewable-energies


 51%|█████     | 69/136 [00:06<00:06, 10.10it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_strength_employees
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_environmental-issues-challenges_environment-protection


 53%|█████▎    | 72/136 [00:06<00:05, 11.08it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_energy-expertise_ship-market_our-trading-and-shipping-operations
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_news_communiques-presse_total-enters-the-eca-lng-project
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_strength_integrated-business-model


 54%|█████▍    | 74/136 [00:07<00:05, 10.65it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_strength_deep-geographic-roots
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_customers_businesses_heating


 56%|█████▌    | 76/136 [00:07<00:06,  9.11it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_site-map
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_customers_consumers_service-stations
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_en_media_special-features


 58%|█████▊    | 79/136 [00:07<00:06,  9.23it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_special-features
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_press-releases


 60%|██████    | 82/136 [00:07<00:05, 10.24it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/www_aviation_total_com_
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_climate-change_carbon-neutrality
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_identity_governance


 62%|██████▏   | 84/136 [00:08<00:05, 10.31it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_environment-social-governance
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_energy-expertise_transformation-development_polymers


 63%|██████▎   | 86/136 [00:08<00:04, 10.09it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_customers_consumers_solar-energy
Reading doc from disk  ../data/ext_data/websites/total_com/www_ep_total_com_en


 65%|██████▍   | 88/136 [00:08<00:05,  9.59it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_news_communiques-presse_total-once-again-selected-in-2020-in-the-dow-jones-sustainability-indices
Reading doc from disk  ../data/ext_data/websites/total_com/www_careers_total_com_en_five-great-reasons-join-us


 67%|██████▋   | 91/136 [00:08<00:04,  9.79it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_shares-and-dividends_ownership-structure
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_publications-and-regulated-information_regulated-information


 68%|██████▊   | 93/136 [00:09<00:04, 10.54it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_shared-development_supplier-relationships-underpinned-by-ethics-and-sustainability
Reading doc from disk  ../data/ext_data/websites/total_com/www_bitumen_total_com_
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_en_accessibility


 71%|███████▏  | 97/136 [00:09<00:03, 11.76it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/www_polymers_total_com_
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_worldwide-presence
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_total-a-major-energy-operator


 73%|███████▎  | 99/136 [00:09<00:03,  9.96it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_shares-and-dividends
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_individual-shareholders_shareholders-events


 74%|███████▍  | 101/136 [00:09<00:03,  9.45it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_media
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_commitment_climate-change_climate-our-vision


 76%|███████▌  | 103/136 [00:10<00:03,  8.59it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_publications
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_energy-expertise_transformation-development_total-present-across-entire-low-carbon-electricity-value-chain


 77%|███████▋  | 105/136 [00:10<00:03,  9.17it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_shared-development_access-to-energy
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_customers_businesses_energy-efficiency
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_ambition_commitments


 80%|████████  | 109/136 [00:10<00:02, 11.18it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_news_communiques-presse_paris-total-to-operate-2300-ev-charge-points-of-the-belib-network
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_publications-and-regulated-information
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_customers_businesses


 82%|████████▏ | 111/136 [00:10<00:02, 10.06it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_energy-expertise_transformation-development_refining-petrochemical
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_publications-and-regulated-information_other-information


 83%|████████▎ | 113/136 [00:11<00:02,  9.74it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_ambition_challenges
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_environmental-issues-challenges_environment-protection_air


 85%|████████▍ | 115/136 [00:11<00:02,  9.46it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_shares-and-dividends_Total-shares
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_protecting-people_health_products


 86%|████████▌ | 117/136 [00:11<00:02,  8.56it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_ambition
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_home-media


 88%|████████▊ | 120/136 [00:11<00:01,  9.49it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_customers_consumers_heating
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_customers_businesses_natural-gas-and-power
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_identity_five-strong-values-embedded-our-dna


 90%|█████████ | 123/136 [00:12<00:01,  9.99it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/www_marinefuels_total_com_
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_customers_consumers_natural-gas-and-power
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_customers


 92%|█████████▏| 125/136 [00:12<00:01,  8.60it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_climate-change_growing-natural-gas
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_individual-shareholders_shareholders-advisory-committee


 93%|█████████▎| 127/136 [00:12<00:00,  9.91it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_shared-development
Reading doc from disk  ../data/ext_data/websites/total_com/www_total_com_sites_g_files_nytnzq111_files_documents_2020-10_total-climate-report-2020_pdf
Error :  'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte ../data/odqa_data/documents/www_total_com_sites_g_files_nytnzq111_files_documents_2020-10_total-climate-report-2020_pdf
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_media_media-relations
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_jobseekers


 96%|█████████▌| 130/136 [00:12<00:00, 11.52it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_our-commitment_being-responsible-employer
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_ethics_exemplary-behavior


 97%|█████████▋| 132/136 [00:13<00:00, 10.02it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_protecting-people_health
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_group_commitment_climate-change_petroleum-products-energy-efficiency-biofuels


 99%|█████████▉| 135/136 [00:13<00:00,  8.02it/s]

Reading doc from disk  ../data/ext_data/websites/total_com/www_lubricants_total_com_
Reading doc from disk  ../data/ext_data/websites/total_com/total_com_investors_shareholders-meetings


100%|██████████| 136/136 [00:13<00:00,  9.93it/s]
