## Immobiliare.it web-scraping

In [1]:
# IMPORTING LIBRARIES

import pandas as pd
import numpy as np
import requests
from datetime import date
from bs4 import BeautifulSoup

In [2]:
# INITIALIZING ITALIAN REGIONS AND TYPE OF ASSET

# Region slugs exactly as they are interpolated into the search URL.
# The order below is the order in which regions are scraped.
regioni = [
    'abruzzo',
    'basilicata',
    'campania',
    'calabria',
    'emilia-romagna',
    'friuli-venezia-giulia',
    'lazio',
    'liguria',
    'lombardia',
    'marche',
    'molise',
    'piemonte',
    'puglia',
    'sardegna',
    'sicilia',
    'toscana',
    'trentino-alto-adige',
    'umbria',
    'valle-d-aosta',
    'veneto',
]

# Asset type segment of the search URL ("vendita-<tipo>").
tipo = 'case'

In [3]:
# INITIALIZE THE VARIABLE LISTS

# Accumulators filled by the scraping loop: each gets one entry per listing card.
price_list, adv_list, region = [], [], []
rooms, area, toilets = [], [], []

In [4]:
# FUNCTIONS TO GET ASSET DATA FROM TILES

def get_price(soup, price_list):
    """Append the price text of one listing card to ``price_list``.

    Parameters
    ----------
    soup : object exposing ``find(name, attrs)``
        The parsed listing card to search.
    price_list : list
        Mutated in place; exactly one value is appended per call.

    Notes
    -----
    The price is the ``.text`` of the first ``div`` with class
    ``in-realEstateListCard__priceOnTop``. When the card has no such element,
    ``np.nan`` is appended instead of raising ``AttributeError``, so the list
    keeps one entry per card.
    """
    price_tag = soup.find('div', {"class": "in-realEstateListCard__priceOnTop"})
    price_list.append(price_tag.text if price_tag is not None else np.nan)



def get_adv(soup, adv_list):
    """Append the advertisement title of one listing card to ``adv_list``.

    Parameters
    ----------
    soup : object exposing ``find(name, class_=...)``
        The parsed listing card to search.
    adv_list : list
        Mutated in place; exactly one value is appended per call.

    Notes
    -----
    The title is the ``title`` attribute of the first ``a`` element with class
    ``in-card__title``. ``None`` is appended when either the anchor or its
    ``title`` attribute is missing, so the list keeps one entry per card
    instead of the call raising ``AttributeError``.
    """
    title_link = soup.find('a', class_="in-card__title")
    title = title_link.attrs.get('title') if title_link is not None else None
    adv_list.append(title)



def get_rooms(soup, rooms_list):
    """Append the room-count text of one listing card to ``rooms_list``.

    Searches ``soup`` for the ``li`` feature item whose ``aria-label`` is
    ``locali`` and appends its ``.text``; appends ``np.nan`` when the card
    has no such item. ``rooms_list`` is mutated in place.
    """
    rooms_filter = {"class" : "nd-list__item in-feat__item", "aria-label": "locali"}
    rooms_tag = soup.find('li', rooms_filter)

    # Guard clause: no room feature on this card.
    if rooms_tag is None:
        rooms_list.append(np.nan)
        return

    rooms_list.append(rooms_tag.text)



def get_area(soup,area_list):
    """Append the surface-area text of one listing card to ``area_list``.

    Searches ``soup`` for the ``li`` feature item whose ``aria-label`` is
    ``superficie`` and appends its ``.text``; appends ``np.nan`` when the
    card has no such item. ``area_list`` is mutated in place.
    """
    area_tag = soup.find(
        'li',
        {"class" : "nd-list__item in-feat__item", "aria-label": "superficie"},
    )
    area_value = np.nan if area_tag is None else area_tag.text
    area_list.append(area_value)



def get_toilets(soup, toilets_list):
    """Append the bathroom-count text of one listing card to ``toilets_list``.

    Searches ``soup`` for the ``li`` feature item whose ``aria-label`` is
    ``bagni`` and appends its ``.text``. Unlike the room and area helpers,
    a missing item is recorded as the integer ``1`` rather than ``np.nan``.
    ``toilets_list`` is mutated in place.
    """
    toilets_filter = {"class" : "nd-list__item in-feat__item", "aria-label": "bagni"}
    toilets_tag = soup.find('li', toilets_filter)

    if toilets_tag is None:
        # Default used when the card does not list its bathrooms.
        toilets_list.append(1)
        return

    toilets_list.append(toilets_tag.text)

In [5]:
# SCRAPING TIME!!!!!!

for reg in regioni:

    # Upper bound on the number of result pages requested for each region.
    max_num_pages = 80

    root = f"https://www.immobiliare.it/vendita-{tipo}/{reg}/?criterio=rilevanza&pag="

    for page_number in range(1, max_num_pages + 1):

        url = f"{root}{page_number}"
        print(f"Scraping:\t{url}")

        # Without a timeout a stalled connection would block the whole run;
        # a failed page is reported and skipped instead of aborting the scrape
        # or being silently parsed as an empty result page.
        try:
            page = requests.get(url, timeout=30)
            page.raise_for_status()
        except requests.RequestException as error:
            print(f"Skipping:\t{url} ({error})")
            continue

        soup = BeautifulSoup(page.content, "html.parser")
        house_cards = soup.find_all('div', class_="nd-mediaObject__content in-card__content in-realEstateListCard__content")

        # Every helper appends exactly one value per card, so all the
        # accumulator lists grow in lockstep with `region`.
        for card in house_cards:

            #Get the price
            get_price(card, price_list)

            #Get the description
            get_adv(card, adv_list)

            #Append the region
            region.append(reg)

            #Get how many rooms
            get_rooms(card, rooms)

            #Get the area
            get_area(card, area)

            #Get how many toilets
            get_toilets(card, toilets)

Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=1
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=2
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=3
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=4
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=5
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=6
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=7
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=8
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=9
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=10
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=11
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rileva

In [6]:
# CREATING THE DATAFRAME

# Assemble the scraped lists into one table; keyword order fixes column order.
df = pd.DataFrame(
    data=dict(
        advertisement=adv_list,
        region=region,
        rooms=rooms,
        toilets=toilets,
        area=area,
        price=price_list,
    )
)

# Preview ten random rows.
df.sample(10)

Unnamed: 0,advertisement,region,rooms,toilets,area,price
37952,"Quadrilocale Località Senin, Chef Lieu, Saint-...",valle-d-aosta,4,1,85m²,€ 150.000
1209,"Terratetto plurifamiliare Str. Gaglierano 140,...",abruzzo,3,1,90m²,€ 50.000
9579,Quadrilocale Area verde Filanda - RISTRUTTURAT...,emilia-romagna,4,2,120m²,€ 448.000
38335,"Villa unifamiliare via Moratei 15, Vigo, Sovizzo",veneto,5+,3,487m²,€ 698.000
11759,"Villa unifamiliare via 4 Novembre 8, Basiliano...",friuli-venezia-giulia,5+,3+,450m²,€ 390.000
9066,"Trilocale via Martiri della Resistenza, Stadio...",emilia-romagna,3,2,145m²,€ 195.000
13261,"Bilocale via Attilio Benigni, 100, Talenti - M...",lazio,2,1,86m²,€ 228.500
38585,"Villa bifamiliare via Luigi Luzzatti, Borgo Ve...",veneto,5+,3+,309m²,€ 690.000
8928,"Appartamento piazza Natale Bruni, Centro Stori...",emilia-romagna,5+,2,182m²,€ 380.000
9490,"Villa bifamiliare viale Trento Trieste 18, Cen...",emilia-romagna,5+,3+,313m²,€ 2.390.000


In [7]:
# GET THE CITY OUT OF ADV

# The city is the last comma-separated field of the advertisement title.
# The vectorised .str accessor propagates missing titles as NaN instead of
# raising, and .str.strip() drops the space that follows the final comma.
df['city'] = df["advertisement"].str.split(',').str[-1].str.strip()


In [8]:
#SELECT COLUMNS I NEED

# Keep only the columns needed downstream and stamp every row with the full
# scrape date (not just year and month). .assign() returns a new frame, so no
# column is set on a selection of the original (avoids SettingWithCopyWarning).
df = df[['region','city','area','rooms','toilets','price']].assign(date=date.today())

df.sample(10)

Unnamed: 0,region,city,area,rooms,toilets,price,date
34449,umbria,Terni,130m²,4,2,€ 175.000,2023-07-31
31324,toscana,Empoli,103m²,3,1,€ 209.000,2023-07-31
29092,sicilia,Palermo,110m²,4,2,€ 450.000,2023-07-31
10605,friuli-venezia-giulia,Morsano al Tagliamento,70m²,2,1,€ 58.000,2023-07-31
9231,emilia-romagna,Ravenna,124m²,5+,2,€ 210.000,2023-07-31
27216,sardegna,Aglientu,88m²,4,1,€ 260.000,2023-07-31
3566,basilicata,Potenza,150m²,5+,1,€ 95.000,2023-07-31
9882,emilia-romagna,Reggio Emilia,240m²,4,3+,Prezzo su richiesta,2023-07-31
13782,lazio,Roma,90m²,3,2,€ 205.000,2023-07-31
24698,puglia,Molfetta,82m²,2,1,€ 209.000,2023-07-31


In [9]:
#SAVE INTO CSV FORMAT

# Read the clock once so the year and month in the file name always come from
# the same day, even if the cell runs across midnight.
export_day = date.today()
df.to_csv(f'house_prices_italy_{export_day.year}_{export_day.month}.csv')