<a href="https://colab.research.google.com/github/andreabazerla/real-estate/blob/main/Real_Estate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Housing Price Prediction in Milan (Italy) through Deep Learning via immobiliare.it

<img src="https://media.giphy.com/media/gTURHJs4e2Ies/source.gif" />

# Web Scraping: immobiliare.it

## IP Address

In [2]:
# Print a label, then let curl append the public IP address that ipecho.net
# reports for this runtime. end='' keeps the curl output on the same line.
print('Google Colab IP Address = ', end='')
!curl ipecho.net/plain

Google Colab IP Address = 35.196.2.69

## Import

In [3]:
import os
import logging
import math
from google.colab import files
import requests
from enum import Enum 
from random import uniform
import time
import datetime
import json
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.notebook import tqdm

## Environment variables

In [4]:
# Run-mode switches checked by the scraping cells further down.
PRODUCTION = True  # master switch: the scraping cells do nothing unless this is True
GET_ADS_LINKS = False  # with PRODUCTION: crawl the result pages to collect ad links
GET_ADS_LIST = True  # with PRODUCTION: download and parse the individual ads

## Pandas Options

In [5]:
# pd.option_context() only takes effect inside a `with` block, so calling it on
# its own line changes nothing. Set the display options globally instead.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## Enums

In [6]:
class Contract(Enum):
  """Listing contract types; the value is the slug used when building the search URL."""
  VENDITA = 'vendita'  # sale
  AFFITTO = 'affitto'  # rent
 
class Area(Enum):
  """Search areas; the value is the slug placed in the search URL path."""
  MILANO = 'milano'

## URL

In [7]:
# Pieces of the search-results URL.
https = 'https://'
website = 'www.immobiliare.it'
slash = '/'
contract = Contract.VENDITA.value + '-case'
area = Area.MILANO.value
sort = '?criterio=rilevanza'

# <scheme><host>/<contract>/<area>/<query string>
url = https + slash.join([website, contract, area, sort])

print('url = ' + url)

url = https://www.immobiliare.it/vendita-case/milano/?criterio=rilevanza


## Sleep

In [33]:
# Bounds, in seconds, of the random pause taken by sleep_default().
sleep_min = 2
sleep_max = 3

def sleep_default():
  """Pause for a random time drawn uniformly between sleep_min and sleep_max seconds."""
  pause_seconds = uniform(sleep_min, sleep_max)
  time.sleep(pause_seconds)

## Ads Link List

### Get Last Page

In [9]:
def get_last_page(url, timeout=30):
  """Return the number of the last result page listed on the search page at `url`.

  Waits a random delay (sleep_default), downloads the page and converts the text
  of the last <li> inside the `ul.pagination.pagination__number` element to an int.

  Args:
    url: search-results URL to inspect.
    timeout: seconds to wait for the server before giving up (defaults to 30).

  Returns:
    The last page number as an int.

  Raises:
    SystemExit: if the HTTP request fails (connection error, timeout, or a
      4xx/5xx status).
    ValueError: if the pagination element is missing or empty, or if its last
      entry is not a number.
  """
  sleep_default()

  try:
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Turn 4xx/5xx answers into a RequestException instead of parsing an error page.
    response.raise_for_status()
  except requests.exceptions.RequestException as e:
    raise SystemExit(e)

  soup = BeautifulSoup(response.content, "html.parser")

  ul_pagination = soup.find("ul", class_ = "pagination pagination__number")
  li_list = ul_pagination.find_all("li") if ul_pagination is not None else []
  if not li_list:
    # Fail with a readable message rather than an AttributeError/IndexError.
    raise ValueError('Pagination not found in page: ' + url)

  return int(li_list[-1].get_text().strip())

#last_page = get_last_page(url)
#print('Last page = ' + str(last_page))

### Get Ads Links

In [10]:
def get_ads_link_list(url, first_page, last_page, timeout=30):
  """Collect the ad links found on result pages `first_page`..`last_page` (inclusive).

  For every page the function downloads the listing, looks up the
  `ul.annunci-list` element and gathers the href of every <a> found inside its
  `div.listing-item_body--content` children. A page that fails (network error,
  HTTP error status, unexpected markup) is logged, its number is printed, and
  the crawl continues. sleep_default() is called after every page.

  Args:
    url: base search URL; pages after the first are requested as `url + '&pag=N'`.
    first_page: first page number to fetch.
    last_page: last page number to fetch (inclusive).
    timeout: seconds to wait for each response (defaults to 30).

  Returns:
    A list of href strings in crawl order; duplicates are not removed.
  """
  ads_link_list = []

  for pag in range(first_page, last_page + 1):
    # Always derive the page URL from the unchanged base URL. Appending to
    # `url` itself would pile up parameters ("&pag=2&pag=3&...") on every turn.
    page_url = url + '&pag=' + str(pag) if pag > 1 else url

    try:
      response = requests.get(page_url, timeout=timeout)
      response.raise_for_status()

      soup = BeautifulSoup(response.content, 'html.parser')

      ads_list = soup.find('ul', class_ = 'annunci-list')
      ad_item_list = ads_list.find_all('div', class_ = 'listing-item_body--content')
      for ad_item in ad_item_list:
        for a in ad_item.find_all("a"):
          href = a.get("href")
          # An anchor without href carries no link; skip it instead of letting
          # a KeyError throw away the remaining links of the page.
          if href:
            ads_link_list.append(href)

    except Exception as e:
      # Best effort: record the failure and the page number, then keep crawling.
      logging.exception(e)
      print(str(pag))

    sleep_default()

  return ads_link_list

In [None]:
# Crawl the result pages for ad links only when both switches are on.
if PRODUCTION and GET_ADS_LINKS:
  first_page = 1
  last_page = get_last_page(url)  # a fixed number (e.g. 631) can be used instead

  # dict.fromkeys removes repeated links while keeping first-seen order.
  ads_link_list = list(dict.fromkeys(get_ads_link_list(url, first_page, last_page)))

  print('Total number of ads = ' + str(len(ads_link_list)))

### Store Ads Links to CSV

In [None]:
# ads_link_list, first_page and last_page only exist when the link crawl ran.
# Without this guard the cell raises NameError on a fresh kernel whenever
# GET_ADS_LINKS (or PRODUCTION) is off.
if PRODUCTION and GET_ADS_LINKS:
  df_links = pd.DataFrame({'Links' : list(ads_link_list)})

  # File name pattern: Links_<unix time>_<first page>_<last page>.csv
  csv_links = 'Links_' + str(int(time.time())) + '_' + str(first_page) + '_' + str(last_page) + '.csv'
  df_links.to_csv(csv_links, index=False)

### Display Links CSV

In [None]:
# The links frame depends on the link crawl; skip the cell when the crawl is off
# so that Run All does not stop on a NameError.
if PRODUCTION and GET_ADS_LINKS:
  display(df_links)

### Download Link CSV

In [None]:
# The links CSV depends on the link crawl; skip the download when the crawl is
# off so that Run All does not stop on a NameError.
if PRODUCTION and GET_ADS_LINKS:
  files.download(csv_links)

## Ad

### Ad Title

In [11]:
def get_ad_title(soup):
  """Return the text of the first `span.im-titleBlock__title` element in `soup`.

  Raises AttributeError when the page has no such element.
  """
  return soup.find('span', class_ = 'im-titleBlock__title').get_text()

### Ad Price

In [12]:
def get_ad_price(soup):
  """Return the text of the first `li.im-mainFeatures__price` in `soup`.

  Newline characters are removed and surrounding whitespace is trimmed.
  Raises IndexError when no such element exists.
  """
  price_items = soup.find_all('li', class_ = 'im-mainFeatures__price')
  raw_price = price_items[0].get_text()
  return raw_price.replace('\n', '').strip()

### Ad Main Features

In [13]:
def get_ad_main_feature(soup):
  """Return a {label: value} dict read from the `div.im-mainFeatures` list items.

  Every <li> except the first contributes one entry: the text of its
  `span.im-mainFeatures__label` is the key and the text of its
  `span.im-mainFeatures__value` is the value, both with newlines removed and
  whitespace trimmed. The singular labels 'bagno' and 'locale' are stored under
  'bagni' and 'locali'.
  """
  # Singular label -> plural key, so both spellings land in the same column.
  plural_of = {'bagno': 'bagni', 'locale': 'locali'}

  def clean_text(node):
    return node.get_text().replace('\n', '').strip()

  main_features = {}
  container = soup.find('div', class_ = 'im-mainFeatures')

  # The first <li> is skipped (presumably the price entry - TODO confirm).
  for item in container.find_all('li')[1:]:
    value = clean_text(item.find('span', class_="im-mainFeatures__value"))
    label = clean_text(item.find('span', class_="im-mainFeatures__label"))
    main_features[plural_of.get(label, label)] = value

  return main_features

### Ad Description

In [14]:
def get_ad_description(soup):
  """Return the text of the first `div.im-description__text` element in `soup`.

  Raises AttributeError when the page has no such element.
  """
  return soup.find('div', class_ = 'im-description__text').get_text()

### Ad Locations

In [21]:
def get_ad_locations(soup):
  """Return `[area, district, address]` taken from the ad's title block.

  The `span.im-location` elements are read from `a.im-titleBlock__link` or,
  when that link is absent, from `h1.im-titleBlock__content`. The first three
  spans supply the three values in order (whitespace trimmed); any position
  without a span is returned as an empty string.
  """
  title_block = soup.find('a', class_ = 'im-titleBlock__link')
  if title_block is None:
    # Fallback container for pages where the title block has no link.
    title_block = soup.find('h1', class_ = 'im-titleBlock__content')

  spans = title_block.find_all('span', class_ = 'im-location')

  # Take at most three entries, then pad with '' up to exactly three slots.
  texts = [span.get_text().strip() for span in spans[:3]]
  return texts + [''] * (3 - len(texts))

### Ad Feature List

In [15]:
def get_ad_feature_list(soup):
  """Return the ad's detail features as a dict keyed by 'f_' + feature title.

  Every `dt.im-features__title` inside a `dl.im-features__list` is paired with
  the next <dd> in the document:
    * if that <dd> has the class `im-features__tagContainer`, the value is the
      comma-joined text of the `span.im-features__tag` elements inside it;
    * otherwise the value is the trimmed text of the <dd>.
  A title that has no following <dd> is skipped.
  """
  features = {}

  for feature_block in soup.find_all("dl", class_ = "im-features__list"):
    for feature__title in feature_block.find_all('dt', class_ = 'im-features__title'):
      feature__value = feature__title.find_next('dd')
      if feature__value is None:
        # No <dd> after this title: there is no value to record.
        continue

      # get('class') returns None when the element has no class attribute.
      value_classes = feature__value.get('class') or []

      if 'im-features__tagContainer' in value_classes:
        # Read only the tags inside this <dd>. Searching the whole page here
        # would attach every tag of the ad to every tag-style feature.
        tags = feature__value.find_all('span', class_ = 'im-features__tag')
        feature__value_2 = ','.join(tag.get_text().strip() for tag in tags)
      else:
        feature__value_2 = feature__value.get_text().strip()

      features['f_' + feature__title.get_text().strip()] = feature__value_2

  return features

### Ad Type

In [16]:
def get_ad(url):
  """Scrape the ad at `url` with the parser that matches its kind.

  URLs containing the substring 'p-' go to get_ad_multi; every other URL goes
  to get_ad_single. The chosen parser's result is returned unchanged.
  """
  scraper = get_ad_multi if 'p-' in url else get_ad_single
  return scraper(url)

### Ad Single

In [17]:
def get_ad_single(url, timeout=30):
  """Download and parse a single-property ad.

  Args:
    url: address of the ad page.
    timeout: seconds to wait for the response (defaults to 30).

  Returns:
    A list holding one dict with the keys 'url', 'titolo', 'prezzo', the main
    features, 'descrizione', 'area', 'quartiere', 'indirizzo', the 'f_*' detail
    features and 'hashcode'. The list is empty when the download or the parsing
    fails; in that case the error is logged and the URL is printed.
  """
  import hashlib  # local import: only needed for the record fingerprint

  ads_list = []
  ad_data = {'url': url}

  try:
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx so the log shows the HTTP error rather than a
    # misleading parsing error raised on an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    ad_data['titolo'] = get_ad_title(soup)
    ad_data['prezzo'] = get_ad_price(soup)

    main_features = get_ad_main_feature(soup)
    if main_features:
      ad_data.update(main_features)

    ad_data['descrizione'] = get_ad_description(soup)

    area, district, address = get_ad_locations(soup)
    ad_data['area'] = area
    ad_data['quartiere'] = district
    ad_data['indirizzo'] = address

    feature_list = get_ad_feature_list(soup)
    if feature_list:
      ad_data.update(feature_list)

    # The built-in hash() of strings is salted per interpreter process, so its
    # result cannot be compared between scraping sessions. A SHA-256 digest of
    # the key-sorted JSON form is the same in every run.
    fingerprint = json.dumps(ad_data, sort_keys=True, ensure_ascii=False)
    ad_data['hashcode'] = hashlib.sha256(fingerprint.encode('utf-8')).hexdigest()

    ads_list.append(ad_data)

  except Exception as e:
    # Best effort: report the failing ad and return an empty list.
    logging.exception(e)
    print(url)

  return ads_list

### Ad Multi

In [18]:
def get_ad_multi(url, timeout=30):
  """Download and parse a multi-property ad, returning one record per listed unit.

  Title, location, description and detail features are read once from the page
  and copied into every record. Price, main features, 'titolo_2' and
  'descrizione_2' are read from each `li.im-properties__item` of the
  `ul.im-properties__list` element. Page-level main features are not recorded.

  Args:
    url: address of the ad page.
    timeout: seconds to wait for the response (defaults to 30).

  Returns:
    A list of dicts, one per unit, each ending with a 'hashcode' entry. When the
    download or the parsing fails, the error is logged, the URL is printed and
    the records collected so far (possibly none) are returned.
  """
  import hashlib  # local import: only needed for the record fingerprint

  ads_list = []

  try:
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx so the log shows the HTTP error rather than a
    # misleading parsing error raised on an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Values shared by every unit of the ad.
    title = get_ad_title(soup)
    area, district, address = get_ad_locations(soup)
    description = get_ad_description(soup)
    feature_list = get_ad_feature_list(soup)

    properties__list = soup.find('ul', class_ = 'im-properties__list')
    properties__item_list = properties__list.find_all('li', class_ = 'im-properties__item')
    for properties__item in properties__item_list:
      ad_data = {}

      ad_data['url'] = url
      ad_data['titolo'] = title

      ad_data['area'] = area
      ad_data['quartiere'] = district
      ad_data['indirizzo'] = address

      ad_data['prezzo'] = get_ad_price(properties__item)
      ad_data['descrizione'] = description

      sub_features = get_ad_main_feature(properties__item)
      if sub_features:
        ad_data.update(sub_features)

      title_2 = properties__item.find('p', class_ = 'nd-mediaObject__title').get_text().strip()
      ad_data['titolo_2'] = title_2

      description_2 = properties__item.find('div', class_ = 'im-properties__content').get_text()
      ad_data['descrizione_2'] = description_2

      if feature_list:
        ad_data.update(feature_list)

      # The built-in hash() of strings is salted per interpreter process, so its
      # result cannot be compared between scraping sessions. A SHA-256 digest of
      # the key-sorted JSON form is the same in every run.
      fingerprint = json.dumps(ad_data, sort_keys=True, ensure_ascii=False)
      ad_data['hashcode'] = hashlib.sha256(fingerprint.encode('utf-8')).hexdigest()

      ads_list.append(ad_data)

  except Exception as e:
    # Best effort: report the failing ad and return what was collected.
    logging.exception(e)
    print(url)

  return ads_list

### Read Links CSV

In [19]:
# Load a previously saved links export (the file name is tied to one specific
# run) and expose its 'Links' column as a plain list.
df_links = pd.read_csv('Links_1616797839_1_630.csv')
ads_link_list = df_links['Links'].tolist()

### Ads Scraping

In [None]:
# Scrape one batch of ads and write it to its own CSV file.
if PRODUCTION and GET_ADS_LIST:
  timestamp = str(int(time.time()))

  # Half-open index range [first_ad, last_ad) of ads_link_list for this batch.
  first_ad = 13500
  last_ad = 14000
  #last_ad = len(ads_link_list)

  # Never run past the end of the list: every index beyond it would only
  # produce a logged IndexError followed by a pointless sleep.
  last_ad = min(last_ad, len(ads_link_list))

  # File name pattern: Ads_<unix time>_<first index>_<last index>.csv
  ads_csv = 'Ads_' + timestamp + '_' + str(first_ad) + '_' + str(last_ad - 1) + '.csv'

  ads_list = []
  for i in tqdm(range(first_ad, last_ad)):
    try:
      # get_ad returns a list of records (several for multi-property ads).
      ads_list.extend(get_ad(ads_link_list[i]))
    except Exception as e:
      # Best effort: report the failing index and move on to the next ad.
      logging.exception(e)
      print(i)

    sleep_default()

  # Ads have different feature sets; missing cells are stored as empty strings.
  df_ads = pd.DataFrame(ads_list).fillna('')
  df_ads.to_csv(ads_csv, index=False)

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))

ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/85165605/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/84392908/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/85336315/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/86694246/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/82808727/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/85392708/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/83747071/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/86694298/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/86694296/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/86106888/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/85446384/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/86694304/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/86694314/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/86694252/


ERROR:root:'NoneType' object has no attribute 'get_text'
Traceback (most recent call last):
  File "<ipython-input-17-3d084e144b63>", line 11, in get_ad_single
    title = get_ad_title(soup);
  File "<ipython-input-11-9352d4ae9fa6>", line 3, in get_ad_title
    return titleBlock__title.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'


https://www.immobiliare.it/annunci/86698288/


### Display Ads CSV

In [None]:
  # df_ads comes from the scraping cell, which only runs with both switches on;
  # skip the display otherwise so that Run All does not stop on a NameError.
  if PRODUCTION and GET_ADS_LIST:
    display(df_ads)

### Download Ads CSV

In [31]:
# ads_csv is set by the scraping cell, which only runs with both switches on;
# skip the download otherwise so that Run All does not stop on a NameError.
if PRODUCTION and GET_ADS_LIST:
  files.download(ads_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Concat Ads CSV

In [None]:
# Merge every batch CSV found in the ads folder into one de-duplicated file.
ads_folder = 'Ads'
file_list = os.listdir(ads_folder)

# Keep only the batch exports ('Ads*.csv'), in name order.
ads_files = sorted(
  file for file in file_list
  if file.startswith('Ads') and file.endswith('.csv')
)
if not ads_files:
  # pd.concat() on an empty list fails with a message that hides the real cause.
  raise FileNotFoundError("No 'Ads*.csv' files found in folder '" + ads_folder + "'")

df_files = [pd.read_csv(os.path.join(ads_folder, file)) for file in ads_files]

df_final = pd.concat(df_files).drop_duplicates().reset_index(drop=True)

# Take the timestamp here instead of reusing the scraping cell's variable,
# which does not exist when scraping was skipped in this session.
ads_csv_final = 'Ads' + '_' + str(int(time.time())) + '.csv'
df_final.to_csv(ads_csv_final, index=False)

### Download Ads CSV

In [None]:
# Hand the merged CSV (path stored in ads_csv_final) to the Colab `files` helper for download.
files.download(ads_csv_final)

# Clean Ads CSV

## Read Ads CSV

In [None]:
# Reload an ads export from disk; the file name is hard-coded to one specific run.
# Note that this rebinds df_ads, replacing any frame built by the scraping cell.
df_ads = pd.read_csv('Ads_1616966209.csv')

## Remove multiple properties

In [None]:
# Rows whose URL contains 'p-' are the ones get_ad routes to the multi-property
# parser; keep everything else.
is_multi_property = df_ads['url'].str.contains('p-')
df_ads_single = df_ads.loc[~is_multi_property]
#len(df_ads_single.index)
# 5244 - 4873 = 371

## Print all DataFrame Columns

In [None]:
# List the column names, one per line.
columns = df_ads_single.columns.values.tolist()
for column in columns:
  print(column)

## Remove useless Columns

In [None]:
# Columns that are not used further down.
columns_useless = [
  'area',
  'titolo_2',
  'descrizione_2',
  'f_immobile garantito',
  'f_contratto',
  'f_unità',
  'f_Data di inizio lavori e di consegna prevista',
  'f_Indice prest. energetica rinnovabile',
  'f_disponibilità',
  'f_certificazione energetica',
  'f_numero immobili',
  'f_aggiornato il',
  'data vendita',
  'f_Tipo vendita',
  'f_data vendita',
  'f_offerta minima',
  'f_rialzo minimo',
  'f_Spesa prenota debito',
  'f_Contributo non dovuto',
  'f_Tribunale',
  'f_termine presentazione',
  'f_lotto numero',
  'f_Deposito cauzionale',
  'f_luogo vendita',
  'f_Luogo presentazione',
  'f_categoria',
  'f_Procedura',
  'f_numero procedura'
]

# The 'f_*' columns are generated from whatever feature titles the scraped ads
# contained, so a given export may lack some of them. errors='ignore' drops the
# ones that exist instead of failing with KeyError on the first missing name.
df_ads_single_columns = df_ads_single.drop(columns=columns_useless, errors='ignore')

## Print DataFrame Columns without useless Columns

In [None]:
# List the remaining column names, one per line.
columns = df_ads_single_columns.columns.values.tolist()
for column in columns:
  print(column)

## Check duplicates absence

In [None]:
# Prints True for a column when at least one of its values occurs more than once.
for key_column in ('url', 'hashcode'):
  print(df_ads_single_columns[key_column].duplicated().any())

## Get Geographic Coordinates from Ads Addresses

### Install essential packages for Geopandas

In [None]:
# Important library for many geopython libraries
!apt install gdal-bin python-gdal python3-gdal 

# Install rtree - Geopandas requirment
!apt install python3-rtree

# Install Geopandas
!pip install git+git://github.com/geopandas/geopandas.git

# Install descartes - Geopandas requirment
!pip install descartes

# Install Folium for Geographic data visualization
!pip install folium

# Install plotlyExpress
!pip install plotly_express

### Import Geopandas and Geopy

In [None]:
import numpy as np
import geopandas as gpd
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from shapely.geometry import Point
import matplotlib
import matplotlib.pyplot as plt 
import folium
import plotly_express as px
import folium
from folium.plugins import FastMarkerCluster

In [None]:
# Geocode one sample address with Nominatim and print what it resolves to.
locator = Nominatim(user_agent='myGeocoder')
location = locator.geocode('Via Ferdinando Prampolini, 9, Ferrara')

# geocode() returns None when the service finds no match for the query.
if location is None:
  print('Address not found')
else:
  print(location.address)
  print("{}, {}".format(location.latitude, location.longitude))

# Artificial Neural Network