## Immobiliare.it web-scraping

In [11]:
# IMPORTING LIBRARIES

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [12]:
# Search configuration: the Italian regions to scrape and the asset type.
# Both values are interpolated verbatim into the search URL, so they must be
# written exactly as the site spells them in its paths.

regioni = [
    'abruzzo',
    'basilicata',
    'campania',
    'calabria',
    'emilia-romagna',
    'friuli-venezia-giulia',
    'lazio',
    'liguria',
    'lombardia',
    'marche',
    'molise',
    'piemonte',
    'puglia',
    'sardegna',
    'sicilia',
    'toscana',
    'trentino-alto-adige',
    'umbria',
    'valle-d-aosta',
    'veneto',
]
tipo = 'case'

In [13]:
# Accumulators filled by the scraping loop. Each listing card contributes one
# entry to every list, so the six lists stay index-aligned and can be zipped
# into DataFrame columns afterwards.
# NOTE: these are module-level lists that the scrape appends to; re-run this
# cell before re-running the scrape, otherwise rows from both runs pile up.

price_list, adv_list, region, rooms, area, toilets = [], [], [], [], [], []

In [14]:
# FUNCTIONS TO GET ASSET DATA FROM TILES

def get_price(soup, price_list):
    """Append the price text of one listing card to ``price_list``.

    Parameters
    ----------
    soup : object exposing ``find``
        The element for a single listing card (a BeautifulSoup tag in this
        notebook).
    price_list : list
        Accumulator, mutated in place. Exactly one item is appended per call.

    Notes
    -----
    The appended value is the raw ``.text`` of the first ``div`` whose class is
    ``in-realEstateListCard__priceOnTop``. When the card has no such element,
    ``np.nan`` is appended instead of raising, so one odd card neither aborts
    the scrape nor breaks the alignment with the other accumulators.
    """
    price_tag = soup.find('div', {"class": "in-realEstateListCard__priceOnTop"})
    price_list.append(price_tag.text if price_tag is not None else np.nan)



def get_adv(soup, adv_list):
    """Append the advertisement title of one listing card to ``adv_list``.

    Parameters
    ----------
    soup : object exposing ``find``
        The element for a single listing card (a BeautifulSoup tag in this
        notebook).
    adv_list : list
        Accumulator, mutated in place. Exactly one item is appended per call.

    Notes
    -----
    The title is read from the ``title`` attribute of the first ``a`` element
    with class ``in-card__title``. ``None`` is appended when the attribute is
    missing, and also when the anchor itself is missing, so that one odd card
    neither aborts the scrape nor breaks the alignment with the other
    accumulators.
    """
    title_link = soup.find('a', class_="in-card__title")
    if title_link is None:
        # Same placeholder that a missing ``title`` attribute already produces.
        adv_list.append(None)
        return
    adv_list.append(title_link.attrs.get('title'))



def get_rooms(soup, rooms_list):
    """Append the room-count text of one listing card to ``rooms_list``.

    Looks up the first ``li`` feature item whose ``aria-label`` is ``locali``
    and appends its raw ``.text``. When the card has no such item, ``np.nan``
    is appended so the accumulator still grows by exactly one entry.
    """
    selector = {"class": "nd-list__item in-feat__item", "aria-label": "locali"}
    rooms_tag = soup.find('li', selector)

    if rooms_tag is None:
        rooms_list.append(np.nan)
        return

    rooms_list.append(rooms_tag.text)



def get_area(soup, area_list):
    """Append the surface-area text of one listing card to ``area_list``.

    Looks up the first ``li`` feature item whose ``aria-label`` is
    ``superficie`` and appends its raw ``.text``. ``np.nan`` is appended when
    the card has no such item, so one entry is added on every call.
    """
    area_tag = soup.find(
        'li',
        {"class": "nd-list__item in-feat__item", "aria-label": "superficie"},
    )
    area_list.append(np.nan if area_tag is None else area_tag.text)



def get_toilets(soup, toilets_list):
    """Append the bathroom-count text of one listing card to ``toilets_list``.

    Looks up the first ``li`` feature item whose ``aria-label`` is ``bagni``
    and appends its raw ``.text``. Unlike the room and area helpers, a missing
    item is recorded as the integer ``1`` rather than ``np.nan``, so the list
    can mix strings (scraped values) with that integer default.
    """
    bath_tag = soup.find(
        'li',
        {"class": "nd-list__item in-feat__item", "aria-label": "bagni"},
    )

    if bath_tag is None:
        value = 1
    else:
        value = bath_tag.text

    toilets_list.append(value)

In [15]:
# Scrape the search-result pages of every region and collect, for each listing
# card found, its price, title, region, rooms, area and bathrooms.

# Number of result pages requested per region (pages are numbered from 1).
max_num_pages = 80

# Seconds to wait for the server. Without a timeout requests can block forever
# on a stalled connection, which would freeze the whole scrape.
request_timeout = 30

for reg in regioni:

    root = f"https://www.immobiliare.it/vendita-{tipo}/{reg}/?criterio=rilevanza&pag="

    for i in range(1, max_num_pages + 1):

        url = f"{root}{i}"
        print(f"Scraping:\t{url}")
        page = requests.get(url, timeout=request_timeout)
        soup = BeautifulSoup(page.content, "html.parser")
        house_card = soup.find_all('div', class_="nd-mediaObject__content in-card__content in-realEstateListCard__content")

        # `card` rather than `_`: in IPython `_` holds the last cell output,
        # and assigning to it by hand shadows that behaviour.
        for card in house_card:

            # Each helper appends exactly one value, keeping the six
            # accumulator lists index-aligned.

            #Get the price
            get_price(card, price_list)

            #Get the description
            get_adv(card, adv_list)

            #Append the region
            region.append(reg)

            #Get how many rooms
            get_rooms(card, rooms)

            #Get the area
            get_area(card, area)

            #Get how many toilets
            get_toilets(card, toilets)

Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=1
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=2
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=3
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=4
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=5
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=6
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=7
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=8
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=9
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=10
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=11
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rileva

In [16]:
# Assemble the scraped accumulators into one DataFrame, one row per listing
# card. The keyword order below fixes the column order.

df = pd.DataFrame(dict(
    advertisement=adv_list,
    region=region,
    rooms=rooms,
    toilets=toilets,
    area=area,
    price=price_list,
))

# Preview ten random rows.
df.sample(10)

Unnamed: 0,advertisement,region,rooms,toilets,area,price
11687,"Villa a schiera via Cormor Alto 101, Viale Ven...",friuli-venezia-giulia,5+,2,375m²,€ 190.000
20722,"Trilocale viale Manzoni, 116, Vazzieri, Campob...",molise,3,2,98m²,€ 185.000
23114,"Trilocale vicolo San Martino 20, Centro, Vinovo",piemonte,3,2,85m²,€ 199.000
12454,"Quadrilocale via Francesco Mengotti 39, Vigna ...",lazio,4,3,209m²,€ 770.000
34781,"Appartamento all'asta via del Commercio 79, Pa...",umbria,5+,1,169m²,"da € 40.800,00"
5399,"Quadrilocale Nuovo Tempio, Capodichino, Napoli",campania,4,2,120m²,€ 230.000
36372,Appartamento all'asta via Strada della Villett...,valle-d-aosta,,1,50m²,"da € 118.800,00"
15347,"Bilocale Vico Tubino, Centro, Finale Ligure",liguria,2,1,56m²,€ 350.000
23410,"Bilocale via Vipacco 32, Pozzo Strada, Torino",piemonte,2,1,64m²,€ 99.000
10761,"Terratetto unifamiliare Case Presa, Faedis",friuli-venezia-giulia,5,2,100m²,€ 198.000


In [17]:
# The city is the last comma-separated part of the advertisement title, e.g.
# "Bilocale via Vipacco 32, Pozzo Strada, Torino" -> "Torino".
# The vectorised .str accessor is used instead of apply + lambda so that a
# missing title yields NaN rather than raising, and .str.strip() drops the
# blank that follows the last comma (otherwise the value would be " Torino").
df['city'] = df["advertisement"].str.split(',').str[-1].str.strip()


In [18]:
# Drop the free-text advertisement column and put the remaining columns in
# their final order.
# NOTE: `df` is rebound here, so the cell that derives `city` from
# `advertisement` cannot be re-run after this one without rebuilding `df`.
df = df.loc[:, ['region', 'city', 'area', 'rooms', 'toilets', 'price']]

# Preview ten random rows.
df.sample(10)

Unnamed: 0,region,city,area,rooms,toilets,price
6236,calabria,Melissa,198m²,4,1,€ 150.000
33054,trentino-alto-adige,San Candido,125m²,3,2,€ 774.000
5321,campania,Palma Campania,140m²,5,3,€ 295.000
38302,veneto,Mogliano Veneto,50m²,4,1,€ 93.000
31966,toscana,Bagno a Ripoli,50m²,2,1,€ 190.000
35580,umbria,Collazzone,120m²,4,2,"€ 115.000€ 125.000(-8,0%)"
10430,friuli-venezia-giulia,Udine,275m²,5+,2,€ 425.000
21329,molise,Venafro,95m²,3,2,€ 160.000
39119,veneto,Verona,188m²,5+,2,€ 1.300.000
21220,molise,Petacciato,129m²,5,2,Prezzo su richiesta


In [19]:
# Persist the scraped table as CSV in the current working directory.
# The DataFrame index is written as well (pandas default), so the file starts
# with an unnamed index column.

df.to_csv(path_or_buf='house_prices_italy.csv')