

[lacentrale](https://www.lacentrale.fr/listing)



# Les voitures selon [lacentrale.fr](https://www.lacentrale.fr/listing)

<img src= ''>

L'objectif de ce projet est de collecter les données des voitures présentes sur la page `https://www.lacentrale.fr/listing` du site lacentrale.fr.

Les données que nous collecterons sont les suivantes :

- `name`
- `price`
- `year`
- `origin`
- `registration_date`
- `technical_inspection`
- `first_hand`
- `mileage`
- `fuel_type`
- `transmission`
- `num_doors`
- `num_seats`
- `power`
- `co2_emission`
- `length`
- `trunk_volume`
- `critair_rating`
- `combined_consumption`

# Lacentrale

### Différentes méthodes utilisées pour le scraping

In [7]:
import requests
from lxml import etree
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm


# Function to fetch a page
def fetch_page(url, page=None, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        page: Optional page number. When given, it is sent as the ``page``
            query parameter of the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The BeautifulSoup tree built from the response body with the lxml
        parser.

    Raises:
        requests.RequestException: If the connection fails, the request times
            out, or the server answers with an HTTP error status.
    """
    # Let requests build the query string so an URL that already carries
    # parameters is extended correctly instead of getting a second "?".
    params = None if page is None else {"page": page}
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page as if it were
    # a listing, which would silently produce empty records.
    response.raise_for_status()
    return BeautifulSoup(response.content, features="lxml")


# CSS class strings of the detail-page nodes that hold the title and the price.
_NAME_CLASS = "Text_Text_text SummaryInformation_title__5CYhW Text_Text_headline3"
_PRICE_CLASS = "PriceInformation_classifiedPrice__b-Jae"

# Output field -> XPath of the node whose text holds the value.
# Insertion order defines the order of the keys in the returned record.
_XPATH_BY_FIELD = {
    "year": '//*[@id="year"]/span[2]/span',
    "origin": '//*[@id="origin"]/span[2]/span[1]',
    "registration_date": '//*[@id="firstCirculationDate"]/span[2]/span',
    "technical_inspection": '//*[@id="technicalControl"]/span[2]/span[1]',
    "first_hand": '//*[@id="firstHand"]/span[2]/span[1]',
    "mileage": '//*[@id="mileage"]/span[2]/span[1]',
    "fuel_type": '//*[@id="energy"]/span[2]/span',
    "transmission": '//*[@id="gearbox"]/span[2]/span',
    "num_doors": '//*[@id="doors"]/span[2]/span',
    "num_seats": '//*[@id="seats"]/span[2]/span',
    "power": '//*[@id="ratedHorsePower"]/span[2]/span[1]',
    "co2_emission": '//*[@id="co2"]/span[2]/div/div/div',
    "length": '//*[@id="length"]/span[2]/span',
    # NOTE(review): this path is positional (14th list item) rather than
    # id-based like the others, so it presumably only matches pages with that
    # exact layout -- confirm against the live markup.
    "trunk_volume": (
        '//*[@id="general-information"]/div/div[2]/div/div[2]/div[1]'
        "/section/ul/li[14]/div/button/span"
    ),
    "critair_rating": '//*[@id="critAir"]/span[2]/span[1]/span[1]',
    "combined_consumption": '//*[@id="consumption"]/span[2]',
}


def _first_text(dom, xpath):
    """Return the stripped text of the first node matching *xpath*, or None."""
    if dom is None:
        return None
    nodes = dom.xpath(xpath)
    if not nodes:
        return None
    # An lxml element's ``text`` is None when it has no leading text (for
    # example when it only wraps child elements); treat that as "missing".
    text = nodes[0].text
    return text.strip() if text is not None else None


def _tag_text(car_card, tag_name, css_class):
    """Return the stripped text of the first matching tag, or None."""
    tag = car_card.find(tag_name, class_=css_class)
    return tag.text.strip() if tag is not None else None


# Function to extract details of a car
def extract_car_details(car_card):
    """Build a car record from a parsed detail page.

    Args:
        car_card: BeautifulSoup tree of a car detail page.

    Returns:
        A dict with the keys ``name``, ``price`` and every key of
        ``_XPATH_BY_FIELD``, in that order. A value is the stripped text of
        the matching node, or None when the node is absent or has no text.
    """
    car = {"name": None, "price": None, **dict.fromkeys(_XPATH_BY_FIELD)}

    car["name"] = _tag_text(car_card, "div", _NAME_CLASS)
    car["price"] = _tag_text(car_card, "span", _PRICE_CLASS)

    # The remaining fields are located with XPath, so re-parse the markup
    # with lxml.
    try:
        dom = etree.HTML(str(car_card))
    except etree.XMLSyntaxError:
        # Empty or unparseable markup: lxml may return None or raise; either
        # way no XPath field can be extracted.
        dom = None

    for field, xpath in _XPATH_BY_FIELD.items():
        car[field] = _first_text(dom, xpath)

    return car


# Function to get cars from a page
def get_cars_from_page(soup):
    """Collect the records of every car advertised on a listing page.

    For each car card found in *soup*, the linked detail page is downloaded
    with ``fetch_page`` and converted with ``extract_car_details``.

    Args:
        soup: BeautifulSoup tree of a listing page.

    Returns:
        A list with one car dict per card whose detail page could be
        downloaded. Cards without a usable link, and cards whose detail page
        fails to download, are skipped.
    """
    from urllib.parse import urljoin

    data = []
    cards = soup.find_all("div", class_=["searchCardContainer", "boostVo__container"])
    for card in cards:
        link = card.find("a", class_="Vehiculecard_Vehiculecard_vehiculeCard")
        href = link.get("href") if link is not None else None
        if not href:
            # No detail link in this card: nothing to fetch. Skipping avoids
            # a TypeError/KeyError that would abort the whole crawl.
            continue

        # urljoin handles relative paths with or without a leading slash as
        # well as already-absolute links.
        car_url = urljoin("https://www.lacentrale.fr", href)
        try:
            detail_page = fetch_page(car_url)
        except requests.RequestException as e:
            # Keep the cars already collected for this listing page instead
            # of losing them all because one detail page failed.
            print(f"Failed to fetch car page {car_url}: {e}")
            continue
        data.append(extract_car_details(detail_page))
    return data

### Lancement du scraping

In [8]:
# Listing endpoint and number of result pages to walk through
url = "https://www.lacentrale.fr/listing"
max_page = 500
cars = []

# Walk the listing pages in order; a page whose download fails is reported
# and skipped so the crawl keeps going
for current_page in tqdm(range(1, max_page + 1)):
    try:
        page = fetch_page(url, str(current_page))
        car_data = get_cars_from_page(page)
    except requests.RequestException as e:
        print(f"Failed to fetch page {current_page}: {e}")
    else:
        cars.extend(car_data)

# One row per scraped car; the bare name displays the table in the notebook
df = pd.DataFrame(cars)
df

100%|██████████| 300/300 [36:22<00:00,  7.28s/it]


Unnamed: 0,name,price,year,origin,registration_date,technical_inspection,first_hand,mileage,fuel_type,transmission,num_doors,num_seats,power,co2_emission,length,trunk_volume,critair_rating,combined_consumption
0,RENAULT CAPTUR phase 2,17 910 €,2020,France,28/05/2020,Non requis,Oui,88 480 km,Essence,Automatique,5,5,8,C,"4,12 m",,1,53
1,RENAULT ARKANA,33 510 €,2023,Importé,10/09/2023,Non requis,Non,23 668 km,Hybride essence électrique,Automatique,5,,7,C,,,Euro 1 et avant,
2,FORD PUMA II,24 180 €,2022,France,05/02/2022,Non requis,Oui,32 021 km,Essence,Manuelle,5,5,6,C,"4,19 m",,1,53
3,RENAULT CLIO IV ESTATE phase 2,12 760 €,2019,France,04/08/2019,Requis,Non,98 278 km,Diesel,Automatique,5,5,4,A,"4,06 m",,2,33
4,CITROEN C3 AIRCROSS phase 2,17 310 €,2022,France,23/12/2022,Non requis,Non,40 723 km,Essence,Manuelle,5,5,6,C,"4,16 m",410,1,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5095,PEUGEOT 3008 II phase 2,32 760 €,2023,France,01/01/2024,Non requis,Oui,10 925 km,Essence,Manuelle,5,5,7,C,"4,45 m",520,1,62
5096,SEAT TARRACO,30 460 €,2020,France,31/05/2020,Non requis,Oui,70 331 km,Diesel,Manuelle,5,7,8,C,"4,74 m",230,2,59
5097,VOLKSWAGEN POLO VI,18 130 €,2019,France,10/11/2019,Non requis,Oui,81 740 km,Essence,Manuelle,5,5,5,B,"4,05 m",,1,46
5098,AUDI RS5 II,76 500 €,2018,France,14/12/2018,Requis,Non,73 111 km,Essence,Automatique,2,4,34,E,"4,72 m",,1,71


### Nettoyage et exploration des données

In [9]:
# Print the column names, non-null counts and dtypes of the scraped table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5100 entries, 0 to 5099
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   name                  5100 non-null   object
 1   price                 5100 non-null   object
 2   year                  5100 non-null   object
 3   origin                5100 non-null   object
 4   registration_date     5100 non-null   object
 5   technical_inspection  5100 non-null   object
 6   first_hand            5100 non-null   object
 7   mileage               5100 non-null   object
 8   fuel_type             5100 non-null   object
 9   transmission          5100 non-null   object
 10  num_doors             5054 non-null   object
 11  num_seats             4508 non-null   object
 12  power                 5095 non-null   object
 13  co2_emission          4826 non-null   object
 14  length                4454 non-null   object
 15  trunk_volume          872 non-null    

In [14]:
# Summary statistics per column (count, unique, top, freq for these text columns)
df.describe()

Unnamed: 0,name,price,year,origin,registration_date,technical_inspection,first_hand,mileage,fuel_type,transmission,num_doors,num_seats,power,co2_emission,length,trunk_volume,critair_rating,combined_consumption
count,5100,5100,5100,5100,5100,5100,5100,5100,5100,5100,5054,4508,5095,4826,4454,872,4969,3730
unique,960,2302,43,2,2296,2,2,3521,9,2,4,10,69,7,178,143,7,126
top,PEUGEOT 208 II,30 400 €,2020,France,16/08/2016,Non requis,Non,11 km,Essence,Automatique,5,5,7,C,"4,45 m",356,1,53
freq,122,64,900,4126,59,3632,2903,115,2650,2949,4010,3439,1010,1499,179,49,2566,181


### Optimisation du scraping

In [15]:
# from concurrent.futures import ThreadPoolExecutor

# max_page = 500
# cars = []

# # Retrieve car details from all pages using ThreadPoolExecutor for parallel processing
# with ThreadPoolExecutor() as executor:
#     pages = [fetch_page(url, str(current_page)) for current_page in range(1, max_page + 1)]
#     car_data = list(tqdm(executor.map(get_cars_from_page, pages), total=len(pages)))

# # Flatten the list of lists into a single list
# cars = [car for sublist in car_data for car in sublist]
# df_optimized = pd.DataFrame(cars)
# df_optimized

In [None]:
# df_optimized.info()

### Enregistrement en base

In [None]:
# api_url = "http://localhost:8000/api/car_data/"

# # Convert the DataFrame to a list of dictionaries
# car_data_list = df.to_dict(orient='records')

# # Send a POST request to the API endpoint
# try:
#     response = requests.post(api_url, json=car_data_list)
#     if response.status_code == 201:
#         print("Data successfully posted to the API.")
#     else:
#         print("Failed to post data. Status code:", response.status_code)
# except requests.RequestException as e:
#     print("An error occurred while sending the POST request:", str(e))