

[lacentrale](https://www.lacentrale.fr/listing)



# Les voitures selon [lacentrale.fr](https://www.lacentrale.fr/listing)

<img src= ''>

L'objectif de ce projet est de collecter les données des voitures présentes sur la page `https://www.lacentrale.fr/listing` du site lacentrale.fr.

Les données que nous collecterons seront les suivantes :
- name
- price
- year
- origin
- registration_date
- technical_inspection
- first_hand
- mileage
- fuel_type
- transmission
- num_doors
- num_seats
- power
- co2_emission
- length
- trunk_volume
- critair_rating
- combined_consumption

# Lacentrale

### Différentes méthodes utilisées pour le scraping

In [1]:
import requests
from lxml import etree
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
from tqdm import tqdm


# Function to fetch a page
def fetch_page(url, page=None, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to download.
        page: Optional page number; when given it is sent as the ``page``
            query-string parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection could hang the whole crawl.

    Returns:
        The response body parsed with the ``lxml`` parser.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Let requests build the query string so that a URL which already
    # carries parameters stays valid.
    params = None if page is None else {"page": page}
    with requests.Session() as session:
        response = session.get(url, params=params, timeout=timeout)
        # NOTE: the HTTP status is not checked here; an error page is parsed
        # like any other response.
        return BeautifulSoup(response.content, features="lxml")


# Function to extract details of a car
def extract_car_details(car_card):
    """Extract the attributes of one car from its parsed detail page.

    Args:
        car_card: BeautifulSoup object for the page. It is searched directly
            for the name and the price, and re-parsed with lxml for the
            XPath lookups.

    Returns:
        A dict with one entry per collected field, always in the same order.
        A field is ``None`` when its element is missing from the page or
        holds no direct text.
    """
    name_class = (
        "Text_Text_text SummaryInformation_title__5CYhW Text_Text_headline3"
    )
    price_class = "PriceInformation_classifiedPrice__b-Jae"

    # Field name -> XPath of the element whose text holds the value.
    # Insertion order defines the order of the keys in the returned dict.
    xpath_by_field = {
        "year": '//*[@id="year"]/span[2]/span',
        "origin": '//*[@id="origin"]/span[2]/span[1]',
        "registration_date": '//*[@id="firstCirculationDate"]/span[2]/span',
        "technical_inspection": '//*[@id="technicalControl"]/span[2]/span[1]',
        "first_hand": '//*[@id="firstHand"]/span[2]/span[1]',
        "mileage": '//*[@id="mileage"]/span[2]/span[1]',
        "fuel_type": '//*[@id="energy"]/span[2]/span',
        "transmission": '//*[@id="gearbox"]/span[2]/span',
        "num_doors": '//*[@id="doors"]/span[2]/span',
        "num_seats": '//*[@id="seats"]/span[2]/span',
        "power": '//*[@id="ratedHorsePower"]/span[2]/span[1]',
        "co2_emission": '//*[@id="co2"]/span[2]/div/div/div',
        "length": '//*[@id="length"]/span[2]/span',
        "trunk_volume": '//*[@id="general-information"]/div/div[2]/div/div[2]/div[1]/section/ul/li[14]/div/button/span',
        "critair_rating": '//*[@id="critAir"]/span[2]/span[1]/span[1]',
        "combined_consumption": '//*[@id="consumption"]/span[2]',
    }

    dom = etree.HTML(str(car_card))

    def soup_text(tag, css_class):
        # Single lookup: reuse the found node instead of searching twice.
        node = car_card.find(tag, class_=css_class)
        return None if node is None else node.text.strip()

    def xpath_text(expression):
        # Defensive: skip the lookup if lxml produced no tree at all.
        matches = dom.xpath(expression) if dom is not None else []
        # An lxml element's ``.text`` is None when it has no text before its
        # first child; calling ``.strip()`` on it would raise AttributeError.
        if not matches or matches[0].text is None:
            return None
        return matches[0].text.strip()

    car = {
        "name": soup_text("div", name_class),
        "price": soup_text("span", price_class),
    }
    for field, expression in xpath_by_field.items():
        car[field] = xpath_text(expression)
    return car


# Function to get cars from a page
def get_cars_from_page(soup):
    """Collect the details of every car advertised on one listing page.

    Each car card's link is followed, the detail page is downloaded with
    ``fetch_page`` and parsed with ``extract_car_details``.

    Args:
        soup: Parsed listing page (BeautifulSoup object).

    Returns:
        A list of dicts, one per car whose detail page could be downloaded.
        Cards without a link, and cars whose detail request fails, are
        skipped so the remaining cars of the page are still returned.
    """
    base_url = "https://www.lacentrale.fr"
    cards = soup.find_all(
        "div", class_=["searchCardContainer", "boostVo__container"]
    )
    data = []
    for card in cards:
        link = card.find("a", class_="Vehiculecard_Vehiculecard_vehiculeCard")
        href = link.get("href") if link is not None else None
        if not href:
            # No usable link on this card: there is no detail page to follow.
            continue
        car_url = urljoin(base_url, href)
        try:
            page = fetch_page(car_url)
        except requests.RequestException as e:
            # One unreachable car must not discard the cars already collected.
            print(f"Failed to fetch car {car_url}: {e}")
            continue
        data.append(extract_car_details(page))
    return data

### Lancement du scraping

In [2]:
# Listing URL and number of result pages to crawl
url = "https://www.lacentrale.fr/listing"
max_page = 500
cars = []

# Crawl the listing page by page; a page whose request fails is reported
# and skipped so the crawl carries on with the next one
for current_page in tqdm(range(1, max_page + 1)):
    try:
        page = fetch_page(url, f"{current_page}")
        car_data = get_cars_from_page(page)
    except requests.RequestException as e:
        print(f"Failed to fetch page {current_page}: {e}")
    else:
        cars += car_data

# Build the final table: one row per collected car
df = pd.DataFrame(cars)
df

100%|██████████| 500/500 [1:02:27<00:00,  7.50s/it]


Unnamed: 0,name,price,year,origin,registration_date,technical_inspection,first_hand,mileage,fuel_type,transmission,num_doors,num_seats,power,co2_emission,length,trunk_volume,critair_rating,combined_consumption
0,NISSAN NAVARA II,36 880 €,2018,France,27/11/2018,Requis,Non,23 395 km,Diesel,Manuelle,4,5,8,E,"5,30 m",,2,6
1,MINI MINI III 3P,18 620 €,2017,France,23/09/2017,Requis,Non,125 000 km,Diesel,Manuelle,3,4,5,A,"3,82 m",,2,31
2,FORD PUMA 2,31 820 €,2024,Importé,19/07/2024,Non requis,Non,12 km,Hybride essence électrique,Manuelle,5,,6,C,,,Euro 1 et avant,
3,VOLKSWAGEN COCCINELLE phase 2,30 990 €,2020,France,03/12/2020,Non requis,Non,64 210 km,Essence,Automatique,2,4,8,C,"4,29 m",,1,49
4,AUDI Q5 II SPORTBACK phase 2,82 860 €,2022,France,05/09/2022,Non requis,Oui,16 464 km,Hybride essence électrique,Automatique,5,5,16,A,"4,69 m",,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8494,VOLVO XC40,38 410 €,2021,France,25/03/2021,Non requis,Non,33 185 km,Diesel,Automatique,5,5,8,D,"4,42 m",460,2,6
8495,PEUGEOT 208 II,21 380 €,2022,France,23/02/2022,Non requis,Non,13 274 km,Essence,Manuelle,5,5,5,B,"4,05 m",311,1,53
8496,PEUGEOT 308 III,41 730 €,2023,France,10/05/2023,Non requis,Oui,19 360 km,Diesel,Automatique,5,5,7,B,"4,37 m",412,2,45
8497,PEUGEOT RIFTER,28 870 €,2021,Importé,21/01/2022,Non requis,Non,65 383 km,Diesel,Manuelle,5,,7,D,,,Euro 1 et avant,


### Nettoyage et exploration des données

In [4]:
# Print each column's dtype and non-null count to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8499 entries, 0 to 8498
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   name                  8499 non-null   object
 1   price                 8499 non-null   object
 2   year                  8499 non-null   object
 3   origin                8499 non-null   object
 4   registration_date     8499 non-null   object
 5   technical_inspection  8499 non-null   object
 6   first_hand            8499 non-null   object
 7   mileage               8499 non-null   object
 8   fuel_type             8499 non-null   object
 9   transmission          8499 non-null   object
 10  num_doors             8444 non-null   object
 11  num_seats             7725 non-null   object
 12  power                 8484 non-null   object
 13  co2_emission          7965 non-null   object
 14  length                7670 non-null   object
 15  trunk_volume          2409 non-null   

In [5]:
# Per-column summary; the output below reports count, unique, top and freq
df.describe()

Unnamed: 0,name,price,year,origin,registration_date,technical_inspection,first_hand,mileage,fuel_type,transmission,num_doors,num_seats,power,co2_emission,length,trunk_volume,critair_rating,combined_consumption
count,8499,8499,8499,8499,8499,8499,8499,8499,8499,8499,8444,7725,8484,7965,7670,2409,8353,6097
unique,1247,3209,42,2,2808,2,2,5448,8,2,4,10,70,7,201,236,6,139
top,PEUGEOT 208 II,34 980 €,2022,France,24/03/2016,Non requis,Non,12 km,Essence,Automatique,5,5,7,C,"4,45 m",380,1,51
freq,188,77,1380,7350,70,6335,4985,197,3961,5228,6763,6154,1474,2595,329,111,4273,252


### Enregistrement en base

In [None]:
# api_url = "http://localhost:8000/api/car_data/"

# # Convert the DataFrame to a list of dictionaries
# car_data_list = df.to_dict(orient='records')

# # Send a POST request to the API endpoint
# try:
#     response = requests.post(api_url, json=car_data_list)
#     if response.status_code == 201:
#         print("Data successfully posted to the API.")
#     else:
#         print("Failed to post data. Status code:", response.status_code)
# except requests.RequestException as e:
#     print("An error occurred while sending the POST request:", str(e))