<a href="https://colab.research.google.com/github/andreabazerla/real-estate/blob/main/Real_Estate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Housing Price Prediction in Milan (Italy) through Deep Learning via immobiliare.it

<img src="https://media.giphy.com/media/gTURHJs4e2Ies/source.gif" />

# Web Scraping: immobiliare.it

## IP Address

In [None]:
# Print a label, then let curl append the response of ipecho.net/plain on the
# same line (presumably the public IP of this runtime — verify against the service).
print('Google Colab IP Address = ', end='')
# A leading `!` runs the rest of the line as a shell command.
!curl ipecho.net/plain

## Import

In [59]:
import os
import logging
import math
from google.colab import files
import requests
from enum import Enum 
from random import uniform
import time
import datetime
import json
from bs4 import BeautifulSoup
import pandas as pd

## Environment variables

In [26]:
# Master switch: the crawling/scraping cells guarded by `if PRODUCTION:` only run when True.
PRODUCTION = True

## Pandas Options

In [60]:
# Show every row and column and never truncate cell text when displaying DataFrames.
# pd.option_context() only takes effect inside a `with` block; as a bare statement
# it builds a context manager and discards it, so set_option() is used instead.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## Enums

In [27]:
class Contract(Enum):
  """Contract types; each value is the lowercase word used to build the search URL."""
  VENDITA = 'vendita'  # sale
  AFFITTO = 'affitto'  # rent
 
class Area(Enum):
  """Search areas; each value is the lowercase name used to build the search URL."""
  MILANO = 'milano'  # Milan

## URL

In [28]:
# Pieces of the search URL, kept under separate names so each one can be swapped.
slash = '/'
https = 'https://'
website = 'www.immobiliare.it'
contract = f'{Contract.VENDITA.value}-case'
area = Area.MILANO.value
sort = '?criterio=rilevanza'

# Layout: <scheme><host>/<contract>/<area>/<query string>
url = https + slash.join([website, contract, area, sort])

print(f'url = {url}')

url = https://www.immobiliare.it/vendita-case/milano/?criterio=rilevanza


## Sleep

In [29]:
# Lower/upper bound, in seconds, of the random delays drawn by the sleep helpers
# (the drawn values are eventually passed to time.sleep).
sleep_min = 1
sleep_max = 30
 
def get_timeout(min, max):
  """Draw a random float delay between ``min`` and ``max`` using ``random.uniform``."""
  delay = uniform(min, max)
  return delay
 
def sleep_random_range(min, max, verbose):
  """Pause for a random duration drawn by ``get_timeout(min, max)``.

  When ``verbose`` is truthy, the duration (truncated to whole seconds) is
  printed before sleeping.
  """
  delay = get_timeout(min, max)
  if verbose:
    print(f'Sleep {int(delay)}s...')
  time.sleep(delay)
 
def sleep():
  """Pause silently for a random time between ``sleep_min`` and ``sleep_max`` seconds."""
  sleep_random_range(min=sleep_min, max=sleep_max, verbose=False)
 
def get_sleep_list(number_pages):
  """Pre-compute one random delay per page (or per ad).

  Args:
    number_pages: how many delays to draw. Zero or a negative count yields an
      empty list; the previous ``while number_pages`` countdown never
      terminated for negative or fractional values.

  Returns:
    A ``(sleep_list, total)`` tuple: the delays in seconds and their sum.
  """
  count = max(0, int(number_pages))
  sleep_list = [get_timeout(sleep_min, sleep_max) for _ in range(count)]
  return sleep_list, sum(sleep_list)
 
def sleep_time(timeout):
  """Block for exactly ``timeout`` seconds (thin wrapper around ``time.sleep``)."""
  seconds = timeout
  time.sleep(seconds)
 
def sec_to_time(seconds):
  """Format a number of seconds using the string form of ``datetime.timedelta``."""
  duration = datetime.timedelta(seconds=seconds)
  return str(duration)

## Ads Link List

### Get Last Page

In [30]:
def get_last_page(url, timeout=30):
  """Return the number of the last result page for a search URL.

  Args:
    url: search results URL to inspect.
    timeout: seconds to wait for the server; without a timeout
      ``requests.get`` can block indefinitely.

  Returns:
    The last page number as an int. When the page has no pagination list a
    warning is logged and 1 is returned (previously this case crashed with
    AttributeError). NOTE(review): this assumes a missing pagination bar means
    a single page of results — confirm against the live site.

  Raises:
    SystemExit: if the HTTP request fails or returns an error status.
    ValueError: if the last pagination entry is not a number.
  """
  sleep()  # random pause before contacting the site

  try:
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
  except requests.exceptions.RequestException as e:
    raise SystemExit(e)

  soup = BeautifulSoup(response.content, "html.parser")

  ul_pagination = soup.find("ul", class_ = "pagination pagination__number")
  li_list = ul_pagination.find_all("li") if ul_pagination is not None else []
  if not li_list:
    logging.warning('No pagination found on %s; assuming a single page', url)
    return 1

  return int(li_list[-1].get_text().strip())

### Get Ads Links

In [31]:
def get_ads_link_list(url, first_page, last_page, timeout=30):
  """Collect the ad links found on result pages ``first_page``..``last_page``.

  Args:
    url: search URL of the first results page (already carrying a query string,
      since later pages are addressed by appending ``&pag=<n>``).
    first_page: first page number to read (inclusive).
    last_page: last page number to read (inclusive).
    timeout: seconds to wait for each response; without a timeout
      ``requests.get`` can block indefinitely.

  Returns:
    A list with every ``href`` found inside the listing items, in page order.
    Duplicates are not removed here.

  Raises:
    Exception: if the pre-computed sleep list does not match the page count.
  """
  ads_link_list = []

  # An inverted range means "no pages" rather than a negative count.
  number_pages = max(0, last_page - first_page + 1)

  sleep_list, sum_sleep_list = get_sleep_list(number_pages)
  print('Total sleep time = ' + sec_to_time(int(sum_sleep_list)))

  if (len(sleep_list) != number_pages):
    raise Exception("Sleep time list not equal to number of pages to analyse")

  for idx, pag in enumerate(range(first_page, first_page + number_pages)):
    # Build each page URL from the untouched base URL. Appending to `url`
    # itself stacked parameters ("&pag=2&pag=3...") from the third page on.
    page_url = url if pag <= 1 else url + '&pag=' + str(pag)

    try:
      response = requests.get(page_url, timeout=timeout)
      response.raise_for_status()
      soup = BeautifulSoup(response.content, 'html.parser')

      ads_list = soup.find('ul', class_ = 'annunci-list')
      if ads_list is None:
        logging.warning('No ads list found on %s', page_url)
      else:
        for ad_item in ads_list.find_all('div', class_ = 'listing-item_body--content'):
          # href=True keeps only anchors that actually carry a link.
          for a in ad_item.find_all('a', href=True):
            ads_link_list.append(a['href'])

    except Exception:
      # Log and move on to the next page. The old bare `except: continue`
      # skipped the page increment and the sleep, so one failing page was
      # requested again forever without any delay.
      logging.exception('Failed to read results page %s', page_url)

    sleep_time(sleep_list[idx])

  return ads_link_list

In [32]:
if PRODUCTION:
  # Page range to crawl; enable the commented line to crawl up to the real last page.
  first_page, last_page = 1, 3
  #last_page = get_last_page(url)
  # dict.fromkeys drops duplicate links while keeping first-seen order.
  ads_link_list = list(dict.fromkeys(get_ads_link_list(url, first_page, last_page)))
  print(f'Total number of ads = {len(ads_link_list)}')

Total sleep time = 0:00:50
Total number of ads = 75


### Print Ads Links

In [None]:
# List every collected link next to its 1-based position, separated by a tab.
for idx, ad_link in enumerate(ads_link_list, start=1):
    print('\t'.join((str(idx), ad_link)))

### Store Ads Links to CSV

In [48]:
# One-column table of links; the file name records the current Unix time and
# the page range that was crawled.
df_links = pd.DataFrame({'Links' : list(ads_link_list)})

csv_links = '_'.join(['Links', str(int(time.time())), str(first_page), str(last_page)]) + '.csv'
df_links.to_csv(csv_links, index=False)

### Read Ads Links from CSV

In [54]:
# Reload the links from the CSV file and rebuild the plain Python list from its column.
df_links = pd.read_csv(csv_links)
ads_link_list = df_links['Links'].tolist()

#### Print Ads Links

In [None]:
# List every reloaded link next to its 1-based position, separated by a tab.
for idx, item in enumerate(ads_link_list, start=1):
    print('\t'.join((str(idx), item)))

### Display Links CSV

In [61]:
# Render the links table with the notebook's rich DataFrame display.
display(df_links)

Unnamed: 0,Links
0,https://www.immobiliare.it/annunci/p-159766/
1,https://www.immobiliare.it/annunci/68088357/
2,https://www.immobiliare.it/annunci/84184532/
3,https://www.immobiliare.it/annunci/p-157721/
4,https://www.immobiliare.it/annunci/85213731/
...,...
70,https://www.immobiliare.it/annunci/84904534/
71,https://www.immobiliare.it/annunci/83378181/
72,https://www.immobiliare.it/annunci/82445950/
73,https://www.immobiliare.it/annunci/86790348/


### Download Links CSV

In [None]:
# Hand the links CSV to google.colab's `files.download` helper.
files.download(csv_links)

## Ad

### Ad Title

In [34]:
def get_ad_title(soup):
  """Return the text of the first ``span.im-titleBlock__title`` element."""
  return soup.find('span', class_ = 'im-titleBlock__title').get_text()

### Ad Price

In [35]:
def get_ad_price(soup):
  """Return the text of the first price item, with newlines removed and ends trimmed."""
  price_items = soup.find_all('li', class_ = 'im-mainFeatures__price')
  raw_text = price_items[0].get_text()
  return raw_text.replace('\n', '').strip()

### Ad Main Features

In [36]:
def get_ad_main_feature(soup):
  """Read the main-features box into a ``{label: value}`` dict.

  The first ``<li>`` of the box is skipped. Labels and values have newlines
  removed and are trimmed; the singular labels 'bagno' and 'locale' are stored
  under their plural forms 'bagni' and 'locali'.
  """
  plural_of = {'bagno': 'bagni', 'locale': 'locali'}

  def _clean(node):
    return node.get_text().replace('\n', '').strip()

  box = soup.find('div', class_ = 'im-mainFeatures')

  main_features = {}
  for entry in box.find_all('li')[1:]:
    value = _clean(entry.find('span', class_="im-mainFeatures__value"))
    label = _clean(entry.find('span', class_="im-mainFeatures__label"))
    main_features[plural_of.get(label, label)] = value

  return main_features

### Ad Description

In [37]:
def get_ad_description(soup):
  """Return the text of the first ``div.im-description__text`` element."""
  return soup.find('div', class_ = 'im-description__text').get_text()

### Ad Locations

In [38]:
def get_ad_locations(soup):
  """Return ``[area, district, address]`` from the title block's location spans.

  The first, second and third ``span.im-location`` inside the title link are
  used in that order; any that are missing come back as an empty string.
  """
  title_link = soup.find('a', class_ = 'im-titleBlock__link')
  spans = title_link.find_all('span', class_ = 'im-location')

  return [
    spans[level].get_text().strip() if level < len(spans) else ''
    for level in range(3)
  ]

### Ad Feature List

In [39]:
def get_ad_feature_list(soup):
  """Collect the ad's feature lists as a ``{'f_<title>': value}`` dict.

  Every ``dt.im-features__title`` inside a ``dl.im-features__list`` is paired
  with the ``<dd>`` that follows it. A ``<dd>`` marked as a tag container is
  stored as the comma-joined text of its own tags; any other ``<dd>`` is
  stored as its trimmed text.

  Args:
    soup: parsed ad page (or fragment) to search.

  Returns:
    Dict mapping ``'f_' + title`` to the value string. Titles without a
    following ``<dd>`` are skipped.
  """
  features = {}

  for feature_block in soup.find_all("dl", class_ = "im-features__list"):
    for feature__title in feature_block.find_all('dt', class_ = 'im-features__title'):
      feature__value = feature__title.find_next('dd')
      if feature__value is None:
        continue

      # get('class') returns None when the <dd> has no class attribute; the
      # old `in` test then raised TypeError and the whole ad was lost.
      value_classes = feature__value.get('class') or []

      if 'im-features__tagContainer' in value_classes:
        # Search only inside this <dd>. Searching the whole page merged the
        # tags of every tag container into each such feature.
        tags = feature__value.find_all('span', class_ = 'im-features__tag')
        feature__value_2 = ','.join(tag.get_text().strip() for tag in tags)
      else:
        feature__value_2 = feature__value.get_text().strip()

      features['f_' + feature__title.get_text().strip()] = feature__value_2

  return features

### Ad Type

In [40]:
def get_ad(url):
  """Scrape one ad URL, choosing the scraper by the ``'p-'`` marker.

  URLs containing ``'p-'`` are handled by ``get_ad_multi``; all others by
  ``get_ad_single``. The chosen scraper's result is returned unchanged.
  """
  scraper = get_ad_multi if 'p-' in url else get_ad_single
  return scraper(url)

### Ad Single

In [45]:
def get_ad_single(url, timeout=30):
  """Scrape a single-property ad page into a one-element list of dicts.

  Args:
    url: address of the ad page.
    timeout: seconds to wait for the server; without a timeout
      ``requests.get`` can block indefinitely.

  Returns:
    ``[ad_data]`` on success, or ``[]`` when the page could not be fetched or
    parsed (the failure is logged together with the URL and traceback).
  """
  ads_list = []
  ad_data = {'url': url}

  try:
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Insertion order is kept on purpose: it becomes the CSV column order.
    ad_data['titolo'] = get_ad_title(soup)
    ad_data['prezzo'] = get_ad_price(soup)

    main_features = get_ad_main_feature(soup)
    if main_features:
      ad_data.update(main_features)

    ad_data['descrizione'] = get_ad_description(soup)

    area, district, address = get_ad_locations(soup)
    ad_data['area'] = area
    ad_data['quartiere'] = district
    ad_data['indirizzo'] = address

    feature_list = get_ad_feature_list(soup)
    if feature_list:
      ad_data.update(feature_list)

    # NOTE: hash() of strings is salted per Python process, so this value is
    # only comparable between rows produced in the same kernel session.
    ad_data['hashcode'] = hash(frozenset(ad_data.items()))

    ads_list.append(ad_data)

  except Exception:
    # The old handler called ' '.join(url, e), which itself raises TypeError
    # (join takes a single iterable), so failures crashed instead of being logged.
    logging.exception('Failed to scrape ad %s', url)

  return ads_list

### Ad Multi

In [42]:
def get_ad_multi(url, timeout=30):
  """Scrape a multi-property ad page into one dict per listed property.

  Page-level fields (title, location, description, feature list) are read once
  and copied into every property's dict. Price, main features, secondary title
  and secondary description are read from each ``li.im-properties__item``.

  Args:
    url: address of the ad page.
    timeout: seconds to wait for the server; without a timeout
      ``requests.get`` can block indefinitely.

  Returns:
    A list of dicts, one per property. If fetching or parsing fails, the error
    is logged with the URL and the properties collected so far are returned
    (possibly an empty list).
  """
  ads_list = []

  try:
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    title = get_ad_title(soup)
    area, district, address = get_ad_locations(soup)
    description = get_ad_description(soup)
    feature_list = get_ad_feature_list(soup)
    # The page-level get_ad_main_feature(soup) call was dropped: its result was
    # never used, yet an exception from it discarded every property of the ad.

    properties__list = soup.find('ul', class_ = 'im-properties__list')
    properties__item_list = properties__list.find_all('li', class_ = 'im-properties__item')
    for properties__item in properties__item_list:
      # Insertion order is kept on purpose: it becomes the CSV column order.
      ad_data = {
        'url': url,
        'titolo': title,
        'area': area,
        'quartiere': district,
        'indirizzo': address,
        'prezzo': get_ad_price(properties__item),
        'descrizione': description,
      }

      sub_features = get_ad_main_feature(properties__item)
      if sub_features:
        ad_data.update(sub_features)

      title_2 = properties__item.find('p', class_ = 'nd-mediaObject__title').get_text().strip()
      ad_data['titolo_2'] = title_2

      description_2 = properties__item.find('div', class_ = 'im-properties__content').get_text()
      ad_data['descrizione_2'] = description_2

      if feature_list:
        ad_data.update(feature_list)

      # NOTE: hash() of strings is salted per Python process, so this value is
      # only comparable between rows produced in the same kernel session.
      ad_data['hashcode'] = hash(frozenset(ad_data.items()))

      ads_list.append(ad_data)

  except Exception:
    logging.exception('Failed to scrape multi-property ad %s', url)

  return ads_list

### Ads Scraping

In [47]:
if PRODUCTION:
  df = pd.DataFrame()

  # Slice of ads_link_list to scrape: first_ad inclusive, last_ad exclusive.
  first_ad = 0
  last_ad = 15
  #last_ad = len(ads_link_list)
  # Never run past the links actually collected (would raise IndexError below).
  last_ad = min(last_ad, len(ads_link_list))

  number_ads = max(0, last_ad - first_ad)
  block_size = 3
  number_files = math.ceil(number_ads / block_size)
  ads_rest = number_ads % block_size

  timestamp = str(int(time.time()))
  ads_folder = 'Ads_' + timestamp + '_' + str(first_ad) + '_' + str(last_ad)

  os.makedirs(ads_folder, exist_ok=True)

  sleep_list, sum_sleep_list = get_sleep_list(number_ads)
  print('Total sleep time = ' + sec_to_time(int(sum_sleep_list)))
  if (len(sleep_list) != number_ads):
    raise Exception("Sleep time list not equal to number of ads to scrape")

  ads_list = []
  ads_list_block = []
  column_list = []  # not used in this cell; kept in case a later cell reads it
  index_file = 0
  for i in range(number_ads):
    # Offset by first_ad so a non-zero start scrapes the intended slice
    # (the loop used to index from 0 regardless of first_ad).
    ad_data = get_ad(ads_link_list[first_ad + i])
    ads_list.extend(ad_data)
    ads_list_block.extend(ad_data)

    # Write a partial CSV after every block_size ads and after the final ad.
    # The old test (i % block_size == 0, i == last_ad - 1) put block_size + 1
    # ads in the first file and missed the final flush when first_ad != 0.
    is_block_full = (i + 1) % block_size == 0
    is_last_ad = i == number_ads - 1
    if is_block_full or is_last_ad:
      df_block = pd.DataFrame(ads_list_block).fillna('')
      csv_ads = 'Ads_' + timestamp
      df_block.to_csv(ads_folder + '/' + csv_ads + '_' + str(index_file) + '.csv', index=False)
      index_file += 1
      ads_list_block = []

    sleep_time(sleep_list[i])

  # Full table of every scraped ad, written next to the per-block folder.
  df = pd.DataFrame(ads_list).fillna('')
  df.to_csv('Ads_' + timestamp + '_' + str(first_ad) + '_' + str(last_ad - 1) + '.csv', index=False)

Total sleep time = 0:03:03


### Display Ads CSV

In [None]:
  # Render the table of scraped ads built by the scraping cell.
  # NOTE(review): this cell is indented; IPython appears to tolerate that, but
  # plain Python would reject it — consider dedenting.
  display(df)

### Download Ads .zip

In [24]:
# Zip the folder of per-block CSVs with the shell `zip` tool ({ads_folder} is
# filled in from the Python variable), then pass the archive to Colab's download helper.
!zip -r {ads_folder}.zip {ads_folder}
files.download(ads_folder + '.zip')

  adding: Ads_1616767025_0_7/ (stored 0%)
  adding: Ads_1616767025_0_7/Ads_1616767025_6_9_1.csv (deflated 88%)
  adding: Ads_1616767025_0_7/Ads_1616767025_3_6_0.csv (deflated 76%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Artificial Neural Network