# Data Extraction

In [None]:
import requests
import re
import sys
import unicodedata
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [None]:
import time
automobile_url = "https://www.automobile.it/usate/page-1"
# Seconds to wait for a single HTTP request before giving up. Without a timeout
# requests.get can block forever on a stalled connection.
REQUEST_TIMEOUT = 30


def _fetch_soup(url):
    """Download url and return its parsed HTML, or None if the request fails."""
    try:
        page_response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # One unreachable page should not throw away everything scraped so far
        print("Skipping {}: {}".format(url, error))
        return None
    return BeautifulSoup(page_response.text, 'html.parser')


# The first page is needed to discover the page count, so a failure here is fatal
response = requests.get(automobile_url, timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(response.text, 'html.parser')

# Initialize the empty car list, which will hold all the cars scraped from the website
car_list = []

# The website is structured like this: there is a list of all the cars that are on the market, divided in pages. Each page contains a few car ads.
# By clicking on an ad, you are then brought to a page with the details related to the car in that ad, like characteristics and price.

# Find the maximum number of pages: it is the text of the last pagination button
page_buttons = soup.find_all('button', class_="jsx-2138479547 font-base auto inline-circled styled value")
if not page_buttons:
    raise RuntimeError("No pagination buttons found on " + automobile_url + "; the page markup may have changed")
max_pages = int(page_buttons[-1].text)

# Iterate over the range 1 to max number of pages
for i in range(1, max_pages + 1):
    # Construct the url of the page, get the html and parse it
    page_url = "https://www.automobile.it/usate/page-" + str(i)
    print(page_url)
    page_soup = _fetch_soup(page_url)
    if page_soup is None:
        continue
    # Extract all the car ads on the current page and iterate over them
    car_ads_list = page_soup.find_all('a', class_="jsx-2059509079 Card hover-effect CardAd")
    for car_ad in car_ads_list:
        # An anchor without a link cannot be followed, so skip it
        ad_href = car_ad.get('href')
        if ad_href is None:
            continue
        # Create empty car object
        car = {}
        # Build the url of the ad details page, get the html and parse it
        car_details_page_url = "https://www.automobile.it" + ad_href
        print(car_details_page_url)
        car_details_page = _fetch_soup(car_details_page_url)
        if car_details_page is None:
            continue
        # Sometimes the links are broken and lead to a 404 page, if that is the case this iteration will be skipped
        if car_details_page.find('div', class_='jsx-1421767171 PageNotFound'):
            continue
        # Get the price and assign it to the car object (None when the page shows no price)
        price_tag = car_details_page.find('span', class_="jsx-139447011 Price")
        car['price'] = price_tag.text if price_tag is not None else None
        # Get the car characteristics groups and iterate over them
        car_characteristics_groups = car_details_page.find_all('div', class_="jsx-3587327592 Item")
        for characteristic_group in car_characteristics_groups:
            # The first span holds the category name, the divs hold its values
            category_tag = characteristic_group.find('span')
            characteristics = characteristic_group.find_all('div')
            # A group without a name or without values carries no usable data
            if category_tag is None or not characteristics:
                continue
            characteristic_category = category_tag.text
            # Several values are stored as a list, a single value as a plain string
            if len(characteristics) > 1:
                car[characteristic_category] = [characteristic.text for characteristic in characteristics]
            else:
                car[characteristic_category] = characteristics[0].text
        # Append the new car object to the car list
        car_list.append(car)
        

In [None]:
# Build a data frame out of the scraped cars and keep a tab-separated copy on disk
df = pd.DataFrame(data=car_list)
df.to_csv(path_or_buf="data.csv", sep="\t")

# Data Wrangling

In [None]:
# Load the saved tab-separated file back into a data frame, using its first
# column as the row index
df = pd.read_csv("data.csv", delimiter="\t", index_col=0)

In [None]:
# The 'description' column is not used further, so drop it
df = df.drop(columns='description')
# Turn price strings into numbers by removing every '.' and every '€ ' before
# converting. The cleaned column is assigned back instead of calling
# replace(..., inplace=True) on df['price']: that chained in-place form acts on
# a temporary Series and leaves df unchanged under pandas copy-on-write.
price = df['price'].replace(to_replace=r'\.', value='', regex=True)
price = price.replace(to_replace='€ ', value='', regex=True)
df['price'] = pd.to_numeric(price)