***
# Car prices
***

1. Répertorier l'ensemble des marques de véhicules
2. Répertorier l'ensemble des modèles de véhicules
3. Répertorier l'ensemble des prix des véhicules
***

In [1]:
# Create the working folders and build the path variables used by later cells
import os
cwd = os.getcwd()

# Sub-folders of the current working directory used by the notebook
folder_names = ['Dataframe', 'Output', 'Data']

# Maps each folder name to its absolute path
folders = {}
for folder_name in folder_names:
    folder_path = os.path.join(cwd, folder_name)
    folders[folder_name] = folder_path

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f'Le dossier « {folder_name} » a été créé')

    else:
        print(f'Le dossier « {folder_name} » est existant')


# Path variables. Each one ends with a path separator so that a file name can
# be appended by plain string concatenation (e.g. path_output + 'file.xlsx').

# os.sep is '\\' on Windows and '/' on POSIX. Unlike the previous if/elif on
# os.name, it is defined on every platform.
slash = os.sep

path_prog = cwd + slash
path_data = folders['Data'] + slash
path_df = folders['Dataframe'] + slash
path_output = folders['Output'] + slash

# List of every path variable, kept for backward compatibility.
# The former loop that tried to double the backslashes only rebound its loop
# variable, so it never changed any path; it has been removed.
path_dict = [path_prog, path_data, path_df, path_output]


# Show the resolved paths
print()
print(path_prog)
print(path_data)
print(path_df)
print(path_output)

Le dossier « Dataframe » est existant
Le dossier « Output » est existant
Le dossier « Data » est existant

c:\Users\Charles_tour\Documents\GitHub\car_sales_forcast\
c:\Users\Charles_tour\Documents\GitHub\car_sales_forcast\Data\
c:\Users\Charles_tour\Documents\GitHub\car_sales_forcast\Dataframe\
c:\Users\Charles_tour\Documents\GitHub\car_sales_forcast\Output\


***

In [2]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import warnings
from datetime import date
import re

# Silence urllib3 warnings (the requests in this notebook are sent with
# verify=False).
requests.packages.urllib3.disable_warnings()

# Ignore warnings of category FutureWarning.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Raise pandas' display.max_colwidth option to 1000.
pd.set_option('display.max_colwidth', 1000) 

In [3]:
# Base URL of the site and URL of the page listing every manufacturer
link_original = 'https://www.guideautoweb.com'
link_constructeur = 'https://www.guideautoweb.com/constructeurs/'

# NOTE(review): verify=False disables TLS certificate verification; kept as in
# the original notebook, confirm it is really required for this site.
# The timeout prevents a stalled connection from hanging the notebook forever
# (requests waits indefinitely when no timeout is given).
response_api = requests.get(link_constructeur, verify=False, timeout=30)
print(response_api.status_code)     # 200 is ok

200


In [4]:
# Page 1: manufacturers
# Collect every brand name together with the link to its page

# Parse the HTML returned by the manufacturers request
html_constructeur = response_api.text
soup_constructeur = BeautifulSoup(html_constructeur, 'html.parser')

# The brands are listed in the <ul id="brands-index-list"> element
ul_element = soup_constructeur.find('ul', id='brands-index-list')

# One entry per <a> element of the list: stripped text and absolute link
brand_anchors = ul_element.find_all('a')
data_marque = {
    'marque': [anchor.get_text().strip() for anchor in brand_anchors],
    'link_model': [link_original + anchor.get('href') for anchor in brand_anchors],
}

df = pd.DataFrame(data_marque)
df.head(3)

Unnamed: 0,marque,link_model
0,Acura,https://www.guideautoweb.com/constructeurs/acura/
1,Alfa Romeo,https://www.guideautoweb.com/constructeurs/alfa-romeo/
2,Allard,https://www.guideautoweb.com/constructeurs/allard/


In [5]:
# Page 2: models
# For every brand page, collect each model name and its link, and flag whether
# the model comes from the "in production" sections (True) or from the list of
# other models (False).

data_model = {
    'model' : [],
    'link_model' : [],
    'production' : []
    }

for link in df['link_model']:
    # NOTE(review): verify=False disables TLS certificate verification; kept as
    # in the original notebook. The timeout avoids hanging on a stalled request.
    model_api = requests.get(link, verify=False, timeout=30)
    html_model = model_api.text
    soup_model = BeautifulSoup(html_model, 'html.parser')

    # Models in production: <a class="e-a e-t"> inside each <div class="s">.
    # A section without such anchors simply yields an empty list.
    for section in soup_model.find_all('div', class_='s'):
        for a_element in section.find_all('a', class_='e-a e-t'):
            href = a_element.get('href')
            if href is None:
                # Anchor without a target: skipping it keeps the three lists
                # aligned instead of failing on `str + None`.
                continue
            data_model['model'].append(a_element.get_text())
            data_model['link_model'].append(link_original + href)
            data_model['production'].append(True)

    # Models not in production: <a class="txt"> inside <ul class="eg eg-t1 eg-sz-s">
    section_production_autre = soup_model.find('ul', class_='eg eg-t1 eg-sz-s')
    if section_production_autre:
        for a_element in section_production_autre.find_all('a', class_='txt'):
            href = a_element.get('href')
            if href is None:
                continue
            data_model['model'].append(a_element.get_text())
            data_model['link_model'].append(link_original + href)
            data_model['production'].append(False)

# Build the frame once, after the loop. It was previously rebuilt on every
# iteration and left undefined when there was no brand to visit.
df_model = pd.DataFrame(data_model)

df_model.to_excel(path_output + 'df_model.xlsx')

df_model.head(10)

Unnamed: 0,model,link_model,production
0,Acura Integra,https://www.guideautoweb.com/constructeurs/acura/integra/2023/,True
1,Acura MDX,https://www.guideautoweb.com/constructeurs/acura/mdx/2023/,True
2,Acura RDX,https://www.guideautoweb.com/constructeurs/acura/rdx/2023/,True
3,Acura TLX,https://www.guideautoweb.com/constructeurs/acura/tlx/2023/,True
4,Acura CL,https://www.guideautoweb.com/constructeurs/acura/cl/,False
5,Acura Concept,https://www.guideautoweb.com/constructeurs/acura/concept/,False
6,Acura CSX,https://www.guideautoweb.com/constructeurs/acura/csx/,False
7,Acura EL,https://www.guideautoweb.com/constructeurs/acura/el/,False
8,Acura ILX,https://www.guideautoweb.com/constructeurs/acura/ilx/,False
9,Acura NSX,https://www.guideautoweb.com/constructeurs/acura/nsx/,False


In [6]:
# Collect, for every model page, the link to each of its model-year pages
df_ModelAndYear = df_model

data_df_ModelAndYear = {
    'link_model_an' : []
}

for model_url in df_ModelAndYear['link_model']:
    # NOTE(review): verify=False disables TLS certificate verification; kept as
    # in the original notebook. The timeout avoids hanging on a stalled request.
    AnMod_api = requests.get(model_url, verify=False, timeout=30)
    soup_AnMod = BeautifulSoup(AnMod_api.text, 'html.parser')

    # The <option> elements are looked up inside <h1 class="st st-s3">
    h1_section = soup_AnMod.find('h1', class_='st st-s3')
    if h1_section is None:
        # Page without the expected header: skip it rather than aborting the
        # whole (long) crawl with an AttributeError.
        continue

    for option in h1_section.find_all('option'):
        option_value = option.get('value')
        if option_value:
            # Options with a missing or empty value carry no link to follow
            data_df_ModelAndYear['link_model_an'].append(link_original + option_value)

df_model2 = pd.DataFrame(data_df_ModelAndYear)

df_model2
# 8m

Unnamed: 0,link_model_an
0,https://www.guideautoweb.com/constructeurs/acura/integra/2023/
1,https://www.guideautoweb.com/constructeurs/acura/mdx/2023/
2,https://www.guideautoweb.com/constructeurs/acura/mdx/2022/
3,https://www.guideautoweb.com/constructeurs/acura/mdx/2020/
4,https://www.guideautoweb.com/constructeurs/acura/mdx/2019/
...,...
4342,https://www.guideautoweb.com/constructeurs/volvo/xc70/2013/
4343,https://www.guideautoweb.com/constructeurs/volvo/xc70/2012/
4344,https://www.guideautoweb.com/constructeurs/volvo/xc70/2011/
4345,https://www.guideautoweb.com/constructeurs/volvo/xc70/2010/


In [7]:
# Helpers used to build a frame with the brand, model and year of every
# model-year URL (expected shape: .../<brand>/<model>/<year>/).
#
# Both patterns are anchored on the END of the URL: the year is the last path
# segment and the model is the segment right before it. Searching for the
# first 4-digit word anywhere in the URL returned the model instead of the
# year for 4-digit model names (e.g. '.../ram/1500/2023/' gave '1500').
# The trailing slash is optional.
year_patern = r'/(\d{4})/?$'
model_patern = r'/([^/]+)/\d{4}/?$'


def extract_year(url):
    """Return the 4-digit year that ends `url`, as a string.

    Parameters
    ----------
    url : str
        A model-year URL such as '.../acura/mdx/2019/'.

    Returns
    -------
    str or None
        The last path segment when it is made of exactly four digits;
        None when it is not, or when `url` is not a string.
    """
    if not isinstance(url, str):
        return None
    match = re.search(year_patern, url)
    if match:
        return match.group(1)
    else:
        return None


def extract_model(url):
    """Return the path segment located just before the final 4-digit year.

    Parameters
    ----------
    url : str
        A model-year URL such as '.../acura/mdx/2019/'.

    Returns
    -------
    str or None
        The model segment ('mdx' in the example); None when `url` does not
        end with a 4-digit segment or is not a string.
    """
    if not isinstance(url, str):
        return None
    match = re.search(model_patern, url)
    if match:
        return match.group(1)
    else:
        return None

def extract_brand(url):
    """Return the fifth '/'-separated piece of `url` (index 4), or None.

    For 'https://host/constructeurs/acura/mdx/2019/' the pieces are
    'https:', '', 'host', 'constructeurs', 'acura', ... so 'acura' is
    returned. None is returned when the URL has fewer than five pieces.
    """
    segments = url.split('/')
    return segments[4] if len(segments) > 4 else None
    
# Derive the 'year', 'model' and 'brand' columns from the 'link_model_an' URLs
column_extractors = {
    'year': extract_year,
    'model': extract_model,
    'brand': extract_brand,
}
for column_name, extractor in column_extractors.items():
    df_model2[column_name] = df_model2['link_model_an'].apply(extractor)

df_model2

Unnamed: 0,link_model_an,year,model,brand
0,https://www.guideautoweb.com/constructeurs/acura/integra/2023/,2023,integra,acura
1,https://www.guideautoweb.com/constructeurs/acura/mdx/2023/,2023,mdx,acura
2,https://www.guideautoweb.com/constructeurs/acura/mdx/2022/,2022,mdx,acura
3,https://www.guideautoweb.com/constructeurs/acura/mdx/2020/,2020,mdx,acura
4,https://www.guideautoweb.com/constructeurs/acura/mdx/2019/,2019,mdx,acura
...,...,...,...,...
4342,https://www.guideautoweb.com/constructeurs/volvo/xc70/2013/,2013,xc70,volvo
4343,https://www.guideautoweb.com/constructeurs/volvo/xc70/2012/,2012,xc70,volvo
4344,https://www.guideautoweb.com/constructeurs/volvo/xc70/2011/,2011,xc70,volvo
4345,https://www.guideautoweb.com/constructeurs/volvo/xc70/2010/,2010,xc70,volvo


In [8]:
# Fetch every model-year page and store the min/max price and the min/max
# consumption in df_model2 (sequential version).
for index, row in df_model2.iterrows():
    # NOTE(review): verify=False disables TLS certificate verification; kept as
    # in the original notebook. The timeout avoids hanging on a stalled request.
    api = requests.get(row['link_model_an'], verify=False, timeout=30)
    html = api.text
    soup = BeautifulSoup(html, 'html.parser')

    # Elements of class "value": the first one is read as the price, the
    # second one as the consumption.
    value_element = soup.find_all(class_='value')

    # Default to None so that a page with fewer than two "value" elements
    # yields empty cells instead of an IndexError that aborts the whole crawl.
    price_min = price_max = consom_min = consom_max = None

    if len(value_element) > 0:
        # Min and max price. A number is either digit groups separated by one
        # whitespace character (e.g. '56 405', '1 200 000') or a plain run of
        # digits. Every whitespace character is removed before int(), not only
        # '\xa0', because `\s` also matches regular and narrow spaces.
        price_text = value_element[0].get_text().strip()
        prices = [
            int(re.sub(r'\s', '', token))
            for token in re.findall(r'\d{1,3}(?:\s\d{3})+|\d+', price_text)
        ]
        price_min = prices[0] if prices else None
        price_max = prices[1] if len(prices) > 1 else None

    if len(value_element) > 1:
        # Min and max consumption; decimal commas are turned into dots first.
        # NOTE(review): if the text contains another number (for instance in a
        # unit), it is picked up as the max when only one value is present.
        consom_text = value_element[1].get_text().strip().replace(',', '.')
        consom = [float(token) for token in re.findall(r'\d+(?:[.,]\d+)?', consom_text)]
        consom_min = consom[0] if consom else None
        consom_max = consom[1] if len(consom) > 1 else None

    df_model2.at[index, 'prix_min'] = price_min
    df_model2.at[index, 'prix_max'] = price_max
    df_model2.at[index, 'cons_min'] = consom_min
    df_model2.at[index, 'cons_max'] = consom_max

df_model2
# 19m

KeyboardInterrupt: 

In [306]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import concurrent.futures

# Assuming you have already loaded your DataFrame df_model2

def _parse_price_range(price_text):
    """Return (min, max) integer prices found in `price_text`; None when absent."""
    # A number is either digit groups separated by one whitespace character
    # ('56 405', '1 200 000') or a plain run of digits. All whitespace is
    # removed before int(), not only '\xa0', since `\s` matches other spaces.
    tokens = re.findall(r'\d{1,3}(?:\s\d{3})+|\d+', price_text)
    values = [int(re.sub(r'\s', '', token)) for token in tokens]
    price_min = values[0] if values else None
    price_max = values[1] if len(values) > 1 else None
    return price_min, price_max


def _parse_consumption_range(consom_text):
    """Return (min, max) float consumptions found in `consom_text`; None when absent."""
    # Decimal commas are turned into dots before the numbers are extracted.
    # NOTE(review): if the text contains another number (for instance in a
    # unit), it is picked up as the max when only one value is present.
    tokens = re.findall(r'\d+(?:[.,]\d+)?', consom_text.replace(',', '.'))
    values = [float(token) for token in tokens]
    consom_min = values[0] if values else None
    consom_max = values[1] if len(values) > 1 else None
    return consom_min, consom_max


def process_row(index, row):
    """Scrape one model-year page and store its price/consumption in df_model2.

    Parameters
    ----------
    index
        Label of the row of the global `df_model2` to update.
    row
        Row whose 'link_model_an' entry is the URL to fetch.

    Returns
    -------
    dict
        The four values written to `df_model2` ('prix_min', 'prix_max',
        'cons_min', 'cons_max'); a value is None when it was not found.
    """
    # NOTE(review): verify=False disables TLS certificate verification; kept as
    # in the original notebook. The timeout avoids hanging on a stalled request.
    api = requests.get(row['link_model_an'], verify=False, timeout=30)
    soup = BeautifulSoup(api.text, 'html.parser')

    # Elements of class "value": the first one is read as the price, the
    # second one as the consumption.
    value_element = soup.find_all(class_='value')

    # Pages with fewer than two "value" elements give None instead of raising
    # IndexError.
    price_min = price_max = consom_min = consom_max = None
    if len(value_element) > 0:
        price_min, price_max = _parse_price_range(value_element[0].get_text().strip())
    if len(value_element) > 1:
        consom_min, consom_max = _parse_consumption_range(value_element[1].get_text().strip())

    df_model2.at[index, 'prix_min'] = price_min
    df_model2.at[index, 'prix_max'] = price_max
    df_model2.at[index, 'cons_min'] = consom_min
    df_model2.at[index, 'cons_max'] = consom_max

    return {
        'prix_min': price_min,
        'prix_max': price_max,
        'cons_min': consom_min,
        'cons_max': consom_max,
    }

# Create the result columns up front, on the main thread, so that the worker
# threads only assign cells of existing columns instead of each trying to add
# new columns to the shared DataFrame.
for result_column in ('prix_min', 'prix_max', 'cons_min', 'cons_max'):
    if result_column not in df_model2.columns:
        df_model2[result_column] = float('nan')

# Create a ThreadPoolExecutor with a specified number of threads (adjust as needed)
# You can experiment with the number of threads to find the optimal performance
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(process_row, index, row) for index, row in df_model2.iterrows()]

    # Wait for all threads to finish
    concurrent.futures.wait(futures)

# An exception raised inside a worker is stored on its future and is otherwise
# silent, so report the rows that failed (their cells were left unfilled).
# `futures` is in the same order as df_model2.index.
failed_rows = [
    (row_label, future.exception())
    for row_label, future in zip(df_model2.index, futures)
    if future.exception() is not None
]
if failed_rows:
    print(f'{len(failed_rows)} row(s) could not be processed; first errors:')
    for row_label, error in failed_rows[:5]:
        print(f'  row {row_label}: {error!r}')

# df_model2 is now updated with the processed values
# 3m25

In [305]:
# Save the scraped frame as an Excel workbook in the output folder
# (path_output already ends with a path separator).
output_workbook = os.path.join(path_output, 'df_model2.xlsx')
df_model2.to_excel(output_workbook)

***
# Optimisation 
***

In [242]:
def get_soup(url, timeout=30):
    """Download `url` and return its content parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook.

    Returns
    -------
    BeautifulSoup or None
        The parsed page, or None when the request failed (connection error,
        timeout or HTTP error status); the error is printed in that case.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept as in the original notebook.
        response = requests.get(url, verify=False, timeout=timeout)
        # Treat HTTP error statuses as failures rather than parsing an error page
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'Error fetching data from {url}: {e}')
        return None
    # 'html.parser' is named explicitly, as everywhere else in this notebook,
    # instead of the generic 'html' feature which lets bs4 choose a parser.
    return BeautifulSoup(response.text, 'html.parser')
    

# Sample model-year page URL, passed to get_soup() further down in this cell
url_test = 'https://www.guideautoweb.com/constructeurs/acura/mdx/2018/'

def extract_numeric_value(text):
    """Read the stripped text of the first item of `text`.

    `text` is indexed with [0] and the item's get_text() method is called.

    NOTE(review): the function has no return statement, so it always returns
    None; it looks unfinished. Confirm the intended return value.
    """
    text = text[0].get_text().strip()


# Fetch and parse the sample page (get_soup returns None when the request fails)
test = get_soup(url_test)
# NOTE(review): `find_value` is not defined in the visible part of this
# notebook, so this line raises NameError on a fresh kernel. Confirm which
# function was meant.
test2 = find_value(test)
test2

In [152]:
# NOTE(review): `df_test` is not defined in the visible part of this notebook;
# this cell depends on kernel state left by code that is not shown here.
df_test['link_year'][1][0]

'https://www.guideautoweb.com/constructeurs/acura/mdx/2023/'