***
# Car prices
***

1. Répertorier l'ensemble des marques de véhicules
2. Répertorier l'ensemble des modèles de véhicules
3. Répertorier l'ensemble des prix des véhicules
***

In [16]:
# Module that sets paths
import path

path_data =     path.path_data
path_df =       path.path_df
path_output =   path.path_output

***

In [18]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import warnings
from datetime import date
import re

requests.packages.urllib3.disable_warnings()

warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_colwidth', 1000) 

In [19]:
# ====== Connect to url ====== #
link_original        = 'https://www.guideautoweb.com'
link_constructeur   = 'https://www.guideautoweb.com/constructeurs/'
response_url = requests.get(link_constructeur, verify=False)

if response_url.status_code == 200:
    print('Connection réussi')
else :
    print('Erreur de connexion')

Connection réussi


In [20]:
# ====== 1ere page : Constructeur ====== #
# Extraire toutes les marques et tous leurs liens 

# Get the data from url
html_constructeur = response_url.text
soup_constructeur = BeautifulSoup(html_constructeur, 'html.parser')

# Extraire les marques de la page principale
ul_element = soup_constructeur.find('ul', id='brands-index-list')

data_marque = {
    'marque' : [],
    'link_model' : []
}

# Extract information from <a> elements within the <ul> element
for a_element in ul_element.find_all('a'):
    a_text = a_element.get_text().strip()
    a_href = a_element.get('href')
    link_model = link_original + a_href

    data_marque['marque'].append(a_text)
    data_marque['link_model'].append(link_model)

df = pd.DataFrame(data_marque)
df.head(3)

Unnamed: 0,marque,link_model
0,Acura,https://www.guideautoweb.com/constructeurs/acura/
1,Alfa Romeo,https://www.guideautoweb.com/constructeurs/alfa-romeo/
2,Allard,https://www.guideautoweb.com/constructeurs/allard/


In [21]:
# ====== 2e page : Modèle====== #
# Extraire tous les modèles et leurs liens

# Loop pour Extraire modèle de la seconde page
count = 0
nombre_iteration = 4

data_model = {
    'model' : [],
    'link_model' : [],
    'production' : []
    }

for link in df['link_model']:
    if count >= nombre_iteration:
        break 

    model_url = requests.get(link, verify=False)
    count += 1
    html_model = model_url.text
    soup_model = BeautifulSoup(html_model, 'html.parser')



    # Extract model en production
    section_EnProduction = soup_model.find_all('div', class_='s')

    extracted_text_list = []
    extracted_ref_list = []

    for section in section_EnProduction:
        try:                                                        # Some model have no element, that cause an error
            a_elements = section.find_all('a', class_='e-a e-t')

            for a_element in a_elements:
                extracted_text_list.append(a_element.get_text())
                extracted_ref_list.append(a_element.get('href'))

        except AttributeError:
            continue

    for text, ref in zip(extracted_text_list, extracted_ref_list):
        data_model['model'].append(text)
        data_model['link_model'].append(link_original + ref)
        data_model['production'].append(True)




    # Extracting model non en production
    section_production_autre = soup_model.find('ul', class_='eg eg-t1 eg-sz-s')
    if section_production_autre:
        for a_element in section_production_autre.find_all('a', class_='txt'):
            try:
                text = a_element.get_text()
                data_model['model'].append(text)
                data_model['link_model'].append(link_original + a_element.get('href'))
                data_model['production'].append(False)
                
            except AttributeError:
                continue

    df_model = pd.DataFrame(data_model)

# df_model.to_excel(path_output + 'df_model.xlsx')

df_model.head(10)     

Unnamed: 0,model,link_model,production
0,Acura Integra,https://www.guideautoweb.com/constructeurs/acura/integra/2024/,True
1,Acura MDX,https://www.guideautoweb.com/constructeurs/acura/mdx/2024/,True
2,Acura RDX,https://www.guideautoweb.com/constructeurs/acura/rdx/2024/,True
3,Acura TLX,https://www.guideautoweb.com/constructeurs/acura/tlx/2024/,True
4,Acura ZDX,https://www.guideautoweb.com/constructeurs/acura/zdx/2024/,True
5,Acura CL,https://www.guideautoweb.com/constructeurs/acura/cl/,False
6,Acura Concept,https://www.guideautoweb.com/constructeurs/acura/concept/,False
7,Acura CSX,https://www.guideautoweb.com/constructeurs/acura/csx/,False
8,Acura EL,https://www.guideautoweb.com/constructeurs/acura/el/,False
9,Acura ILX,https://www.guideautoweb.com/constructeurs/acura/ilx/,False


In [22]:
# ====== Extraire le liens vers années et marques et modèles ====== #
df_ModelAndYear = df_model

data_df_ModelAndYear = {
    'link_model_an' : []
}

for index, row in df_ModelAndYear.iterrows():
    AnMod_url = requests.get(row['link_model'], verify=False)
    soup_AnMod = BeautifulSoup(AnMod_url.text, 'html.parser')

    annee_list = []

    h1_section = soup_AnMod.find('h1', class_='st st-s3')
    
    for option in h1_section.find_all('option'):
        data_df_ModelAndYear['link_model_an'].append(link_original + option.get('value'))     

df_model2 = pd.DataFrame(data_df_ModelAndYear)

df_model2
# 11m

Unnamed: 0,link_model_an
0,https://www.guideautoweb.com/constructeurs/acura/integra/2024/
1,https://www.guideautoweb.com/constructeurs/acura/integra/2023/
2,https://www.guideautoweb.com/constructeurs/acura/mdx/2024/
3,https://www.guideautoweb.com/constructeurs/acura/mdx/2023/
4,https://www.guideautoweb.com/constructeurs/acura/mdx/2022/
...,...
177,https://www.guideautoweb.com/constructeurs/aston-martin/vanquish/2017/
178,https://www.guideautoweb.com/constructeurs/aston-martin/vanquish/2016/
179,https://www.guideautoweb.com/constructeurs/aston-martin/vanquish/2015/
180,https://www.guideautoweb.com/constructeurs/aston-martin/vanquish/2014/


In [23]:
# ====== lien vers la table specification ====== #

# spec_tab = df_model2

spec_tab_dic = {
    'spec_tab' : []
}

for index, row in df_model2.iterrows():
    # print(row['link_model_an'])
    tab_url = requests.get(row['link_model_an'], verify=False)
    soup_tab = BeautifulSoup(tab_url.text, 'html.parser')

    tab = soup_tab.find_all('a', class_='tab')
    tab_link = tab[2].get('href').split('/')      # toujours le 3e élément car il y a 4 onglets
    url_total = row['link_model_an'] + tab_link[5] + '/' + tab_link[6] + '/' 
    spec_tab_dic['spec_tab'].append(url_total)
    
df_tab = pd.DataFrame(spec_tab_dic)
df_tab
# 32m   

Unnamed: 0,spec_tab
0,https://www.guideautoweb.com/constructeurs/acura/integra/2024/specifications/a-spec/
1,https://www.guideautoweb.com/constructeurs/acura/integra/2023/specifications/base/
2,https://www.guideautoweb.com/constructeurs/acura/mdx/2024/specifications/tech/
3,https://www.guideautoweb.com/constructeurs/acura/mdx/2023/specifications/base/
4,https://www.guideautoweb.com/constructeurs/acura/mdx/2022/specifications/base/
...,...
177,https://www.guideautoweb.com/constructeurs/aston-martin/vanquish/2017/specifications/coupe/
178,https://www.guideautoweb.com/constructeurs/aston-martin/vanquish/2016/specifications/coupe/
179,https://www.guideautoweb.com/constructeurs/aston-martin/vanquish/2015/specifications/coupe/
180,https://www.guideautoweb.com/constructeurs/aston-martin/vanquish/2014/specifications/coupe/


In [24]:
# ====== lien vers les specifications (Trim) ====== #

spec_link_dict = {
    'spec_link' : []
}

for spec_link in df_tab['spec_tab']:
    spec_url = requests.get(spec_link, verify=False)
    soup_spec = BeautifulSoup(spec_url.text, 'html.parser')

    select_section = soup_spec.find('select', {'name': 'trim'})

    for option in select_section.find_all('option'):
        spec_link_dict['spec_link'].append(link_original + option.get('value'))

spec_link_df = pd.DataFrame(spec_link_dict)

spec_link_df
# 29

Unnamed: 0,spec_link
0,https://www.guideautoweb.com/constructeurs/acura/integra/2024/specifications/a-spec/
1,https://www.guideautoweb.com/constructeurs/acura/integra/2024/specifications/elite-a-spec-auto/
2,https://www.guideautoweb.com/constructeurs/acura/integra/2024/specifications/elite-a-spec-man/
3,https://www.guideautoweb.com/constructeurs/acura/integra/2024/specifications/type-s/
4,https://www.guideautoweb.com/constructeurs/acura/integra/2023/specifications/base/
...,...
623,https://www.guideautoweb.com/constructeurs/aston-martin/vanquish/2015/specifications/coupe/
624,https://www.guideautoweb.com/constructeurs/aston-martin/vanquish/2015/specifications/volante/
625,https://www.guideautoweb.com/constructeurs/aston-martin/vanquish/2014/specifications/coupe/
626,https://www.guideautoweb.com/constructeurs/aston-martin/virage/2012/specifications/coupe/


In [14]:
# spec_link_df.to_hdf(path_df + 'spec_link_df.h5', key='s')

In [37]:
# ====== Prendre toute l'info pertinante ====== #
import numpy as np
# fonction pour extraire l'info
def extraire_info(string_input):
    try:
        row = info_table.find('th', string=string_input).find_parent('tr')
        value = row.find('td').get_text(strip=True).replace('\xa0', '')
        # print(f'{string_input} : ', value)
        return value
    except AttributeError:
        return np.nan


# Créer un dictionnaire pour storer l'info extraite
list_col_name = ['pdsf', 'moteur', 'type_carbu', 'type_vehicule', 'classement', 'eff_ville', 'eff_autoroute' , 'eff_combi', 'eff_autonomie', 'eff_ges', 'masse']
list_value_extract = ['PDSF', 'Moteur', 'Type de carburant', 'Type de véhicule', 'Classement', 'Ville', 'Autoroute' , 'Combinée', 'Autonomie', 'Émissions CO₂', 'Poids']

dict_info = {}
for element in list_col_name:
    dict_info[element] = []
dict_info['lien']= []

info_link = spec_link_df.copy()

for link in info_link['spec_link']:
    info_url = requests.get(link, verify=False)
    soup_info = BeautifulSoup(info_url.text, 'html.parser')

    # Table qui contient toute l'info
    info_table = soup_info.find('table', {'class': 'dt dt-s1'})

    for key,  element in zip(list_col_name, list_value_extract):
        dict_info[key].append(extraire_info(element))
    dict_info['lien'].append(link)

df_info = pd.DataFrame(dict_info)
df_info.head()

# 127

Unnamed: 0,pdsf,moteur,type_carbu,type_vehicule,classement,eff_ville,eff_autoroute,eff_combi,eff_autonomie,eff_ges,masse,lien
0,38850$,"4L 1,5l",Super,Hatchback,Sous-compacte de luxe,"8,1L/100km","6,5L/100km","7,4L/100km",621km,172g/km,1415kg (3120lb),https://www.guideautoweb.com/constructeurs/acura/integra/2024/specifications/a-spec/
1,44350$,4L 1.5l,Super,Hatchback,Sous-compacte de luxe,"8,1L/100km","6,5L/100km","7,4L/100km",621km,172g/km,1434kg (3161lb),https://www.guideautoweb.com/constructeurs/acura/integra/2024/specifications/elite-a-spec-auto/
2,44350$,"4L 1,5l",Super,Hatchback,Sous-compacte de luxe,"8,9L/100km","6,5L/100km","7,8L/100km",589km,181g/km,1399kg (3084lb),https://www.guideautoweb.com/constructeurs/acura/integra/2024/specifications/elite-a-spec-man/
3,55600$,4L 2.0l,Super,Hatchback,"Compacte sportive, Sous-compacte de luxe","11,1L/100km","8,3L/100km","9,9L/100km",464km,230g/km,1460kg (3219lb),https://www.guideautoweb.com/constructeurs/acura/integra/2024/specifications/type-s/
4,34350$,"4L 1,5l",Super,Hatchback,Sous-compacte de luxe,"7,9L/100km","6,3L/100km","7,2L/100km",638km,167g/km,1402kg (3091lb),https://www.guideautoweb.com/constructeurs/acura/integra/2023/specifications/base/


In [34]:
list_col_name = ['pdsf', 'moteur', 'type_carbu', 'type_vehicule', 'classement', 'eff_ville', 'eff_autoroute' , 'eff_combi', 'eff_autonomie', 'eff_ges', 'masse']
list_value_extract = ['PDSF', 'Moteur', 'Type de carburant', 'Type de véhicule', 'Classement', 'Ville', 'Autoroute' , 'Combinée', 'Autonomie', 'Émissions CO₂', 'Poids']

dict_info = {}
for element in list_col_name:
    dict_info[element] = []
dict_info['lien']= []

info_link = spec_link_df.copy()

for link in info_link['spec_link']:
    info_url = requests.get(link, verify=False)
    soup_info = BeautifulSoup(info_url.text, 'html.parser')

    # Table qui contient toute l'info
    info_table = soup_info.find('table', {'class': 'dt dt-s1'})

    for key,  element in zip(list_col_name, list_value_extract):
        dict_info[key].append(extraire_info(element))

    dict_info['lien'] = link

dict_info

{'pdsf': ['38850$'],
 'moteur': ['4L 1,5l'],
 'type_carbu': ['Super'],
 'type_vehicule': ['Hatchback'],
 'classement': ['Sous-compacte de luxe'],
 'eff_ville': ['8,1L/100km'],
 'eff_autoroute': ['6,5L/100km'],
 'eff_combi': ['7,4L/100km'],
 'eff_autonomie': ['621km'],
 'eff_ges': ['172g/km'],
 'masse': ['1415kg (3120lb)'],
 'lien': 'https://www.guideautoweb.com/constructeurs/acura/integra/2024/specifications/a-spec/'}

In [15]:
df_info

Unnamed: 0,marque,modele,annee,spec,pdsf,moteur,type_carbu,type_vehicule,classement,eff_ville,eff_autoroute,eff_combi,eff_autonomie,eff_ges,masse,lien
0,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,38850$,"4L 1,5l",Super,Hatchback,Sous-compacte de luxe,"8,1L/100km","6,5L/100km","7,4L/100km",621km,172g/km,1415kg (3120lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
1,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,44350$,"4L 1,5l",Super,Hatchback,Sous-compacte de luxe,"8,1L/100km","6,5L/100km","7,4L/100km",621km,172g/km,1434kg (3161lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
2,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,44350$,"4L 1,5l",Super,Hatchback,Sous-compacte de luxe,"8,9L/100km","6,5L/100km","7,8L/100km",589km,181g/km,1399kg (3084lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
3,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,55600$,"4L 2,0l",Super,Hatchback,"Compacte sportive, Sous-compacte de luxe","11,1L/100km","8,3L/100km","9,9L/100km",464km,230g/km,1460kg (3219lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
4,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,34350$,"4L 1,5l",Super,Hatchback,Sous-compacte de luxe,"7,9L/100km","6,3L/100km","7,2L/100km",638km,167g/km,1402kg (3091lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30048,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,49995$,,,Familiale,n.d.,"13,8L/100km","8,9L/100km","11,6L/100km",,267g/km,1892kg (4171lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
30049,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,55995$,"6L 3,0l","Ordinaire, Super",Familiale,n.d.,"13,0L/100km","9,0L/100km","11,2L/100km",625km,258g/km,1892kg (4171lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
30050,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,44095$,,,Familiale,n.d.,"13,7L/100km","8,8L/100km","11,5L/100km",,n.d.,1892kg (4171lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
30051,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,45595$,,,Familiale,n.d.,"13,7L/100km","8,8L/100km","11,5L/100km",,n.d.,1892kg (4171lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/


In [10]:
df_info.to_hdf(path_df + 'df_info.h5', key='s')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['pdsf', 'moteur', 'type_carbu', 'type_vehicule', 'classement',
       'eff_ville', 'eff_autoroute', 'eff_combi', 'eff_autonomie', 'eff_ges',
       'masse', 'lien'],
      dtype='object')]

  df_info.to_hdf(path_df + 'df_info.h5', key='s')


In [11]:
# ====== Nettoyage ====== #
col_dict = {
    'marque' : 4,
    'modele' : 5,
    'annee' : 6,
    'spec' : 8
} 

df_info['split'] = [x.split('/') for x in df_info['lien']]

for key, value in col_dict.items():
    df_info[key] = [x[value] for x in df_info['split']]


order_columns = ['marque', 'modele', 'annee', 'spec', 'pdsf', 'moteur', 'type_carbu', 'type_vehicule', 'classement', 'eff_ville', 'eff_autoroute', 'eff_combi', 'eff_autonomie', 'eff_ges', 'masse', 'lien']
df_info = df_info[order_columns]
df_info

Unnamed: 0,marque,modele,annee,spec,pdsf,moteur,type_carbu,type_vehicule,classement,eff_ville,eff_autoroute,eff_combi,eff_autonomie,eff_ges,masse,lien
0,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,38850$,"4L 1,5l",Super,Hatchback,Sous-compacte de luxe,"8,1L/100km","6,5L/100km","7,4L/100km",621km,172g/km,1415kg (3120lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
1,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,44350$,"4L 1,5l",Super,Hatchback,Sous-compacte de luxe,"8,1L/100km","6,5L/100km","7,4L/100km",621km,172g/km,1434kg (3161lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
2,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,44350$,"4L 1,5l",Super,Hatchback,Sous-compacte de luxe,"8,9L/100km","6,5L/100km","7,8L/100km",589km,181g/km,1399kg (3084lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
3,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,55600$,"4L 2,0l",Super,Hatchback,"Compacte sportive, Sous-compacte de luxe","11,1L/100km","8,3L/100km","9,9L/100km",464km,230g/km,1460kg (3219lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
4,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,34350$,"4L 1,5l",Super,Hatchback,Sous-compacte de luxe,"7,9L/100km","6,3L/100km","7,2L/100km",638km,167g/km,1402kg (3091lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30048,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,49995$,,,Familiale,n.d.,"13,8L/100km","8,9L/100km","11,6L/100km",,267g/km,1892kg (4171lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
30049,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,55995$,"6L 3,0l","Ordinaire, Super",Familiale,n.d.,"13,0L/100km","9,0L/100km","11,2L/100km",625km,258g/km,1892kg (4171lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
30050,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,44095$,,,Familiale,n.d.,"13,7L/100km","8,8L/100km","11,5L/100km",,n.d.,1892kg (4171lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/
30051,volvo,xc70,2009,5prt-wgn-3.0-l-avec-toit-ouvrant,45595$,,,Familiale,n.d.,"13,7L/100km","8,8L/100km","11,5L/100km",,n.d.,1892kg (4171lb),https://www.guideautoweb.com/constructeurs/volvo/xc70/2009/specifications/5prt-wgn-3.0-l-avec-toit-ouvrant/


In [12]:
# ====== Sauveguarde ====== #
df_info.to_hdf(path_df + 'scrap_data.h5', key='s')
df_info.to_excel(path_output + 'scrap_data.xlsx')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['marque', 'modele', 'annee', 'spec', 'pdsf', 'moteur', 'type_carbu',
       'type_vehicule', 'classement', 'eff_ville', 'eff_autoroute',
       'eff_combi', 'eff_autonomie', 'eff_ges', 'masse', 'lien'],
      dtype='object')]

  df_info.to_hdf(path_df + 'scrap_data.h5', key='s')
