## Immobiliare.it web-scraping

In [10]:
# IMPORTING LIBRARIES

import pandas as pd
import numpy as np
import requests
from datetime import date
from bs4 import BeautifulSoup

In [11]:
# INITIALIZING ITALIAN REGIONS AND TYPE OF ASSET

# Region slugs interpolated into the search URL; they are scraped in this order.
regioni = [
    'abruzzo',
    'basilicata',
    'campania',
    'calabria',
    'emilia-romagna',
    'friuli-venezia-giulia',
    'lazio',
    'liguria',
    'lombardia',
    'marche',
    'molise',
    'piemonte',
    'puglia',
    'sardegna',
    'sicilia',
    'toscana',
    'trentino-alto-adige',
    'umbria',
    'valle-d-aosta',
    'veneto',
]

# Asset type slug interpolated into the search URL ("vendita-{tipo}").
tipo = 'case'

In [12]:
# INITIALIZE THE VARIABLE LISTS

# One accumulator per output column. The scraping loop appends exactly one
# value to each list for every listing card, so all six stay the same length.
price_list, adv_list = [], []
region, rooms = [], []
area, toilets = [], []

In [13]:
# FUNCTIONS TO GET ASSET DATA FROM TILES

def get_price(soup, price_list):
    """Append the price text of one listing card to ``price_list``.

    Parameters
    ----------
    soup : object exposing a BeautifulSoup-style ``find`` method
        The parsed listing card to search.
    price_list : list
        Accumulator that is mutated in place; exactly one value is appended.

    Notes
    -----
    The price is read from the ``div`` whose class is
    ``in-realEstateListCard__priceOnTop``. When the card has no such element,
    ``np.nan`` is appended instead of raising, so a single card without a
    price neither aborts the scrape nor leaves the column lists misaligned.
    """
    price_tag = soup.find('div', {"class": "in-realEstateListCard__priceOnTop"})
    price_list.append(price_tag.text if price_tag is not None else np.nan)



def get_adv(soup, adv_list):
    """Append the advertisement title of one listing card to ``adv_list``.

    Parameters
    ----------
    soup : object exposing a BeautifulSoup-style ``find`` method
        The parsed listing card to search.
    adv_list : list
        Accumulator that is mutated in place; exactly one value is appended.

    Notes
    -----
    The title is the ``title`` attribute of the ``a`` element with class
    ``in-card__title``. If that anchor exists but has no ``title`` attribute,
    ``None`` is appended (unchanged behaviour). If the anchor itself is
    missing, ``np.nan`` is appended instead of raising, mirroring how the
    sibling feature extractors record missing values.
    """
    title_link = soup.find('a', class_="in-card__title")
    if title_link is None:
        # No title anchor on this card: record a missing value, keep scraping.
        adv_list.append(np.nan)
        return
    adv_list.append(title_link.attrs.get('title'))



def get_rooms(soup, rooms_list):
    """Append the room-count text of one listing card to ``rooms_list``.

    Parameters
    ----------
    soup : object exposing a BeautifulSoup-style ``find`` method
        The parsed listing card to search.
    rooms_list : list
        Accumulator that is mutated in place; exactly one value is appended.

    Notes
    -----
    The value is the text of the ``li`` feature item whose ``aria-label`` is
    ``"locali"``. ``np.nan`` is appended when the card has no such item.
    The element is looked up once and reused rather than searched for twice.
    """
    rooms_item = soup.find('li', {"class": "nd-list__item in-feat__item", "aria-label": "locali"})
    rooms_list.append(rooms_item.text if rooms_item is not None else np.nan)



def get_area(soup, area_list):
    """Append the surface-area text of one listing card to ``area_list``.

    Parameters
    ----------
    soup : object exposing a BeautifulSoup-style ``find`` method
        The parsed listing card to search.
    area_list : list
        Accumulator that is mutated in place; exactly one value is appended.

    Notes
    -----
    The value is the text of the ``li`` feature item whose ``aria-label`` is
    ``"superficie"``; it is stored as scraped, without parsing. ``np.nan`` is
    appended when the card has no such item. The element is looked up once and
    reused rather than searched for twice.
    """
    area_item = soup.find('li', {"class": "nd-list__item in-feat__item", "aria-label": "superficie"})
    area_list.append(area_item.text if area_item is not None else np.nan)



def get_toilets(soup, toilets_list):
    """Append the bathroom-count text of one listing card to ``toilets_list``.

    Parameters
    ----------
    soup : object exposing a BeautifulSoup-style ``find`` method
        The parsed listing card to search.
    toilets_list : list
        Accumulator that is mutated in place; exactly one value is appended.

    Notes
    -----
    The value is the text of the ``li`` feature item whose ``aria-label`` is
    ``"bagni"``. Unlike the other feature extractors, a missing item is
    recorded as the integer ``1`` rather than ``np.nan``; that default is kept
    as-is. The element is looked up once and reused rather than searched for
    twice.
    """
    toilets_item = soup.find('li', {"class": "nd-list__item in-feat__item", "aria-label": "bagni"})
    # NOTE(review): the fallback is an int while scraped values are element
    # text, so the resulting column mixes types - confirm this is intended.
    toilets_list.append(toilets_item.text if toilets_item is not None else 1)

In [14]:
# SCRAPING TIME!!!!!!

# Upper bound on the number of result pages requested per region.
max_num_pages = 80

# Seconds to wait for the server before giving up on a single page; without a
# timeout one stalled connection would hang the whole run.
request_timeout = 30

for reg in regioni:

    # Search URL for this region; the page number is appended per request.
    root = f"https://www.immobiliare.it/vendita-{tipo}/{reg}/?criterio=rilevanza&pag="

    for page_num in range(1, max_num_pages + 1):

        url = f"{root}{page_num}"
        print(f"Scraping:\t{url}")

        try:
            page = requests.get(url, timeout=request_timeout)
        except requests.RequestException as exc:
            # A network failure on one page should not discard the rest of the run.
            print(f"Request failed ({exc}):\t{url}")
            continue

        if not page.ok:
            # Error responses carry no listing cards; report and move on.
            print(f"Skipping (HTTP {page.status_code}):\t{url}")
            continue

        soup = BeautifulSoup(page.content, "html.parser")
        house_cards = soup.find_all('div', class_="nd-mediaObject__content in-card__content in-realEstateListCard__content")

        # Every card appends exactly one value to each column list, keeping
        # the lists aligned row by row. The loop variable is named explicitly
        # because `_` is IPython's last-output variable.
        for card in house_cards:

            # Get the price
            get_price(card, price_list)

            # Get the description
            get_adv(card, adv_list)

            # Append the region
            region.append(reg)

            # Get how many rooms
            get_rooms(card, rooms)

            # Get the area
            get_area(card, area)

            # Get how many toilets
            get_toilets(card, toilets)

Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=1
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=2
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=3
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=4
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=5
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=6
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=7
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=8
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=9
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=10
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rilevanza&pag=11
Scraping:	https://www.immobiliare.it/vendita-case/abruzzo/?criterio=rileva

In [15]:
# CREATING THE DATAFRAME

# Map each output column name to the list filled during scraping; the key
# order here is the column order of the resulting frame.
scraped_columns = {
    'advertisement': adv_list,
    'region': region,
    'rooms': rooms,
    'toilets': toilets,
    'area': area,
    'price': price_list,
}
df = pd.DataFrame(scraped_columns)

# Preview ten randomly chosen rows.
df.sample(n=10)

Unnamed: 0,advertisement,region,rooms,toilets,area,price
25304,"Quadrilocale via Pettinesse, San Vito - Carell...",puglia,4,2,113m²,€ 138.000
10363,"Quadrilocale Borgata Bach 160, Centro, Sappada",friuli-venezia-giulia,4,1,57m²,€ 230.000
2120,"Appartamento all'asta via Trieste, 7, 75014 Gr...",basilicata,4,1,114m²,"da € 32.625,00"
28734,"Quadrilocale via Aldo Moro, Nunziata - Pianett...",sicilia,4,2,120m²,€ 62.000
8164,"Trilocale via Sant'Andrea, Centro, Ozzano dell...",emilia-romagna,3,2,111m²,€ 358.000
10625,"Villa unifamiliare via Paludo, Centro, Fagagna",friuli-venezia-giulia,5+,1,288m²,€ 235.000
21590,"Appartamento ottimo stato, primo piano, Centro...",molise,5,1,105m²,€ 39.000
20722,"Appartamento piazza Molise, Via XXIV Maggio, C...",molise,5,2,145m²,€ 215.000
30966,"Trilocale via di Peretola 599B, Peretola, Firenze",toscana,3,1,65m²,€ 228.000
22206,"Trilocale via Carlo Frasconi snc, Sacro Cuore,...",piemonte,3,1,105m²,€ 99.000


In [16]:
# The city is the last comma-separated segment of the advertisement title.
# The vectorised .str accessor propagates missing titles as NaN instead of
# raising, and .str.strip() removes the space that follows the final comma.
df['city'] = df["advertisement"].str.rsplit(',', n=1).str[-1].str.strip()


In [17]:
# Read the date once so the year and month stamped on the rows always agree.
scrape_date = date.today()

# Keep the listing attributes (dropping the raw advertisement title) and stamp
# every row with the scrape year and month. .assign builds a new frame, so no
# column is written onto a column-subset of the original frame, which can emit
# SettingWithCopyWarning.
df = (
    df[['region', 'city', 'area', 'rooms', 'toilets', 'price']]
    .assign(year=scrape_date.year, month=scrape_date.month)
)
df.sample(10)

Unnamed: 0,region,city,area,rooms,toilets,price,year,month
16662,lombardia,Milano,35m²,,1,€ 270.000,2023,7
16073,lombardia,Milano,98m²,3,2,€ 464.000,2023,7
1249,abruzzo,L'Aquila,60m²,3,1,€ 135.000,2023,7
25392,puglia,Turi,122m²,2,2,€ 29.000,2023,7
2935,basilicata,Matera,176m²,5+,3+,Prezzo su richiesta,2023,7
12592,lazio,Roma,144m²,4,2,€ 530.000,2023,7
9860,emilia-romagna,Modena,155m²,5,1,€ 248.000,2023,7
20443,molise,Termoli,95m²,4,2,€ 139.000,2023,7
34546,umbria,Montecchio,80m²,4,1,"da € 27.297,00",2023,7
31440,toscana,Firenze,230m²,5+,3,€ 1.150.000,2023,7


In [18]:
#SAVE INTO CSV FORMAT

# The file name carries the current year and month (month is not zero-padded).
# The frame's index is written as the first, unnamed CSV column.
output_path = f'house_prices_italy_{date.today().year}_{date.today().month}.csv'
df.to_csv(output_path)