# **Virtual-Appraiser**


La idea del algoritmo Virtual-Appraiser es emular la capacidad de búsqueda y análisis de información de los avaluadores inmobiliarios utilizando datos históricos de ventas, datos abiertos y técnicas de web scraping basadas en el análisis de imágenes mediante reconocimiento óptico de caracteres y aprendizaje profundo (Deep Learning).

**Algoritmo**: *Virtual-Appraiser*

**Entrada**: dirección del inmueble (DI), Valor Compra M2 Inmueble (VCI), área (A), número de cuartos (NC), número de baños (NB), número de parqueaderos (NG), número de pisos (NP), estrato (E), tipo de inmueble (TI), polígono de estaciones Transmilenio (PET), polígono de centros comerciales (PCC), polígono de parques (PP), polígono de CAIs (PC), dataset ventas de inmuebles (DVI), Página Web de Oferta de Inmuebles (PWOI).

**Salida**: Estimación de Precio M2 para Venta, PM2, ($/m2).


# Configuring environment

In [None]:
%tensorflow_version 1.x

!pip install keras==2.1.0
!pip show keras
!pip install keras==2.1.0
!pip show keras

!pip3 install geocoder

In [None]:
!sudo apt install tesseract-ocr libtesseract-dev libmagickwand-dev

In [None]:
!pip install pytesseract

# Importing Libs

In [4]:
import pandas as pd
import os
import geocoder
from zipfile import ZipFile
import cv2
from scipy import spatial

import sys
import random
import math
import re
import time

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import skimage
import glob

from random import sample
import numpy as np
from sklearn.utils import resample
%matplotlib inline 

import shutil

try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract

import tensorflow

import warnings
warnings.filterwarnings('ignore')

# Mounting G-Drive and Setting Data and Models path

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [6]:
#Change this according to your path
dataPath = '/content/drive/MyDrive/INTERPRETABLE_MACHINE_LEARNING_PROJECTS/'

# Functions

In [7]:
# Geocoding helper (address -> coordinates)
def geodecoder(address):
  """Geocode a street address in Bogotá into a (latitude, longitude) pair.

  The address is suffixed with ', Bogotá, Colombia' and sent to the
  OpenStreetMap provider of the `geocoder` package.

  Args:
    address: Street address without city/country (e.g. 'Cra. 4 # 22-61').

  Returns:
    A (lat, lon) tuple taken from the provider's `latlng` result.

  Raises:
    ValueError: if the provider returns no coordinates for the address.
  """
  query = address+', Bogotá, Colombia'
  loc = geocoder.osm(query)
  coords = loc.latlng
  # A failed lookup yields an empty/None latlng; fail with a clear message
  # instead of an opaque unpacking error.
  if not coords:
    raise ValueError('Could not geocode address: ' + query)
  lat, lon = coords
  return lat,lon

In [8]:
# Confidence intervals estimation
def bootstrap(dataset, confidence=0.95, iterations=10000,
              sample_size=1.0, statistic=np.mean):
    """
    Bootstrap the confidence interval for a given sample of a population
    and a statistic.
    Args:
        dataset: A list of values, each a sample from an unknown population
        confidence: The confidence value (a float between 0 and 1.0)
        iterations: The number of iterations of resampling to perform
        sample_size: The sample size for each of the resampled (0 to 1.0
                     for 0 to 100% of the original data size)
        statistic: The statistic to use. This must be a function that accepts
                   a list of values and returns a single value.
    Returns:
        A (lower, upper) tuple with the bounds of the confidence interval.
    Raises:
        ValueError: if confidence is outside [0, 1], iterations is below 1,
                    or dataset/sample_size produce an empty resample.
    """
    if not 0.0 <= confidence <= 1.0:
        raise ValueError('confidence must be between 0 and 1.0')
    if iterations < 1:
        raise ValueError('iterations must be at least 1')

    n_size = int(len(dataset) * sample_size)
    if n_size < 1:
        raise ValueError('dataset and sample_size must yield at least one sample')

    # Resample (with replacement) and evaluate the statistic once per iteration
    stats = [statistic(resample(dataset, n_samples=n_size))
             for _ in range(iterations)]

    # Cut off both tails of the per-resample statistics; np.percentile does
    # its own ordering, so no explicit sort is required.
    lval = np.percentile(stats, ((1 - confidence) / 2) * 100)
    uval = np.percentile(stats, (confidence + ((1 - confidence) / 2)) * 100)

    return (lval, uval)

In [9]:
#Preprocessing Database: Naive proposals
def preprocessData(dfDVI):
  """Return a cleaned copy of the historical sales table.

  Works on a copy of dfDVI: removes the 'id' column, fills missing values
  column by column (printing the unique values before and after each fix),
  normalises 'tiempodeconstruido' into range labels, adds 'PriceM2'
  (valorventa / area, NaN when area is not positive), drops rows without
  'area', renumbers the rows and replaces every remaining NaN with 0.0.
  All of these imputations are naive placeholders.
  """
  def _showUnique(label, frame, column):
    # Print a label followed by the distinct values of one column
    print(label)
    print(frame[column].unique())

  dfTrain = dfDVI.copy()
  # Drop unnecessary columns
  dfTrain = dfTrain.drop(columns=['id'], errors='ignore')

  # Business type: missing -> 'Venta', then everything lower-cased.
  # This field should be fixed by asking the data owners/generators.
  _showUnique('Before', dfTrain, 'tiponegocio')
  dfTrain['tiponegocio'] = [text.lower() for text in dfTrain['tiponegocio'].fillna('Venta')]
  _showUnique('After', dfTrain, 'tiponegocio')

  # Constant imputations. The numeric ones should be confirmed with the data
  # owners/generators; for 'vista' N.A. is taken to mean "no information".
  constantFills = (
      ('piso', 1.0),
      ('banos', 1.0),
      ('habitaciones', 1.0),
      ('vista', 'N.A.'),
  )
  for column, filler in constantFills:
    _showUnique('Before', dfTrain, column)
    dfTrain[column] = dfTrain[column].fillna(filler)
    _showUnique('After', dfTrain, column)

  # Construction age: collapse the free-text categories into range labels.
  # Naive mapping; should be refined with the house owners.
  builtTimeLabels = {
      float('nan'): '0-5',
      'Entre 10 y 20 años': '10-15',
      'Entre 0 y 5 años': '0-5',
      'Entre 5 y 10 años': '5-10',
      'Más de 20 años': '20-25',
      '1 a 8 años': '0-5',
      '16 a 30 años': '15-20',
      '9 a 15 años': '10-15',
      'Más de 30 años': '30-35',
      'Menos de 1 año': '0-5',
      'Remodelado': 'Remodelado',
      'ntre 0 y 5 años': '0-5',
  }
  _showUnique('Before', dfTrain, 'tiempodeconstruido')
  dfTrain['tiempodeconstruido'] = dfTrain['tiempodeconstruido'].replace(builtTimeLabels)
  _showUnique('After', dfTrain, 'tiempodeconstruido')

  # Stratum: could be derived automatically from Bogotá neighbourhood
  # polygons with a point-in-polygon test; for now missing -> 0.0
  _showUnique('Before', dfTrain, 'estrato')
  print(dfTrain['estrato'].value_counts())
  dfTrain['estrato'] = dfTrain['estrato'].fillna(0.0)
  _showUnique('After', dfTrain, 'estrato')

  # Target required by the challenge: sale price per square metre
  unitPrices = []
  for saleValue, surface in zip(dfTrain['valorventa'], dfTrain['area']):
    unitPrices.append(saleValue/surface if surface > 0.0 else float('nan'))
  dfTrain['PriceM2'] = unitPrices

  # Drop observations without area, renumber rows and zero-fill what is
  # left. This is a very naive filtering :)
  dfTrain = dfTrain.dropna(subset=['area'])
  dfTrain = dfTrain.reset_index()
  dfTrain = dfTrain.drop(columns=['index'], errors='ignore')
  dfTrain.fillna(0.0, inplace=True)
  return dfTrain.copy()

In [None]:
%cd
fileName = os.path.join(dataPath,'model/VirtualAppraiser.zip')
ds = ZipFile(fileName)
ds.extractall()
print('Extracted zip file ' + fileName)

%cd ~/VirtualAppraiser
fileName = os.path.join(dataPath,'data/visualwebscrapping.zip')
os.makedirs('dataset')
os.chdir('dataset')
ds = ZipFile(fileName)
ds.extractall()
print('Extracted zip file ' + fileName)

%cd ~/VirtualAppraiser
!python setup.py install

##### Training Mask R-CNN (Deep Learning) for detecting offer cards in www.fincaraiz.com

In [None]:
#Activate only for training
'''
%cd ~/VirtualAppraiser
!python oferta.py train --dataset=dataset/ --weights=coco

#Change path of logs
shutil.copy('/logs/oferta20210413T2208/mask_rcnn_oferta_0030.h5',os.path.join(dataPath,'OfertaDetector.h5'))
'''

##### Inference Mask R-CNN (Deep Learning) for detecting offer cards in www.fincaraiz.com

In [None]:
# Build the Mask R-CNN detector in inference mode and load the trained
# offer-card weights. The resulting `model` global is used by the functions
# defined in later cells.
%cd ~/VirtualAppraiser

from mrcnn import utils
from mrcnn import visualize
from mrcnn.visualize import display_images
import mrcnn.model as modellib
from mrcnn.model import log
import oferta

# Root directory of the project (the directory selected by %cd above)
ROOT_DIR = os.getcwd()

# NOTE(review): this append runs after the mrcnn/oferta imports above, so it
# cannot influence how those imports were resolved.
sys.path.append(ROOT_DIR)  # To find local version of the library
# Trained detector weights stored under <dataPath>/model/
OfertaDetector = 'OfertaDetector.h5'
custom_WEIGHTS_PATH = os.path.join(dataPath, 'model/'+OfertaDetector)


# Directory to save logs and trained model
MODEL_DIR = os.path.join(ROOT_DIR, "logs")

# Start from the project's configuration and specialise it for inference below
config = oferta.OfertaConfig()
custom_DIR = os.path.join(ROOT_DIR, "dataset")

class InferenceConfig(config.__class__):
    # Run detection on one image at a time
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1

config = InferenceConfig()
config.display()

# Device to load the neural network on.
# Useful if you're training a model on the same
# machine, in which case use CPU and leave the
# GPU for training.
DEVICE = "/gpu:0"  # /cpu:0 or /gpu:0

# Inspect the model in training or inference modes
# values: 'inference' or 'training'
# TODO: code for 'training' test mode not ready yet
# NOTE(review): TEST_MODE is not read anywhere in this cell; the mode passed
# to MaskRCNN below is the literal "inference".
TEST_MODE = "inference"
# Create model in inference mode
with tensorflow.device(DEVICE):
    model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR,
                              config=config)

# Load weights
print("Loading weights ", custom_WEIGHTS_PATH)

model.load_weights(custom_WEIGHTS_PATH, by_name=True)

from importlib import reload # the visualization module was being edited frequently, so it is reloaded here instead of restarting the notebook
reload(visualize)

In [13]:
def _areaToken(text):
  # Return the token right before the first ' m2' marker (comma -> dot), or None if absent.
  startPos = text.find(' m2')
  if startPos < 0:
    return None
  # Last blank/newline before the marker; -1 means the token starts the text
  lastPos = max(text.rfind(' ', 0, startPos), text.rfind('\n', 0, startPos))
  return text[lastPos+1:startPos].replace(',','.')


def _priceToken(text):
  # Return the token right after the first '$ ' marker (dots removed), or None if absent.
  startPos = text.find('$ ')
  if startPos < 0:
    return None
  begin = startPos + 2
  # First blank/newline after the amount; none means it runs to the end
  ends = [pos for pos in (text.find(' ', begin), text.find('\n', begin)) if pos >= 0]
  lastPos = min(ends) if ends else len(text)
  return text[begin:lastPos].replace('.','')


def ofertaM2Image(frame, model):
  """Estimate price per square metre for every offer card found in an image.

  Each region of interest returned by the detector is cropped from `frame`
  and passed to Tesseract OCR. The area is read from the token preceding
  ' m2' and the price from the token following '$ '.

  Cards whose text lacks either marker, or whose tokens are not numeric (or
  have zero area), are reported with 'Invalid values' and skipped instead of
  aborting the whole image.

  Args:
    frame: Image indexed as frame[y0:y1, x0:x1, :] and accepted by
           model.detect and pytesseract.image_to_string.
    model: Detector whose detect([frame], verbose=0) returns a list whose
           first element is a dict with a 'rois' entry; each roi is read as
           (ymin, xmin, ymax, xmax).

  Returns:
    List of floats (price / area), one per successfully parsed card, in
    detection order.
  """
  # Get ROIs using Mask R-CNN
  potentialcards = model.detect([frame], verbose=0)[0]

  priceM2 = []
  for roi in potentialcards['rois']:
    ymin, xmin, ymax, xmax = (int(round(coord)) for coord in roi[:4])
    # OCR only the detected card
    text = pytesseract.image_to_string(frame[ymin:ymax,xmin:xmax,:])

    areaText = _areaToken(text)
    priceText = _priceToken(text)
    if areaText is None or priceText is None:
      print('Invalid values')
      continue

    # Build $M2
    try:
      area = float(areaText)
      price = float(priceText)
      priceM2.append(price/area)
    except (ValueError, ZeroDivisionError):
      print('Invalid values')
  return priceM2

In [14]:
def identificarInmSimilaresHist(lat, lon, dfDVI, A, NC,NB,NG,NP,E,TI, PET,PCC,PP,PC, top_n=5):
  """Return the PriceM2 of the historical sales most similar to a property.

  Similarity is the cosine similarity between the reference vector
  [lat, lon, A, NC, NB, NG, E] and each row's
  [latitud, longitud, area, habitaciones, banos, garajes, estrato].

  Args:
    lat, lon: Coordinates of the reference property.
    dfDVI: DataFrame holding the seven feature columns above plus 'PriceM2'.
           It is not modified; any index is accepted.
    A, NC, NB, NG, E: Area, rooms, bathrooms, garages and stratum of the
                      reference property.
    NP, TI, PET, PCC, PP, PC: Accepted for interface compatibility; not used
                              by the current similarity measure.
    top_n: How many of the most similar rows to return (default 5).

  Returns:
    List with the 'PriceM2' values of the top_n most similar rows, ordered
    from most to least similar.
  """
  featureColumns = ['latitud','longitud','area','habitaciones','banos','garajes','estrato']
  patternReference = [lat,lon,A,NC,NB,NG,E]

  # Explicit copy: the Similarity column is added to this working table only
  patternsMatch = dfDVI[featureColumns + ['PriceM2']].copy()

  # Walk the rows positionally so the result does not depend on the
  # DataFrame having a 0..n-1 index.
  featureRows = patternsMatch[featureColumns].to_numpy(dtype=float)
  patternsMatch['Similarity'] = [1 - spatial.distance.cosine(patternReference, row)
                                 for row in featureRows]

  patternsMatch = patternsMatch.sort_values(by=['Similarity'],ascending=False)
  return patternsMatch['PriceM2'].values.tolist()[:top_n]

In [15]:
def identificarInmSimilaresOfertados(lat, lon, DVI, A, NC,NB,NG,NP,E,TI, PET,PCC,PP,PC):
  """Return the $/m2 of currently offered properties read from a screenshot.

  Reads <dataPath>/data/Test.png and delegates to ofertaM2Image using the
  module-level `model`. None of the function's arguments are used by the
  current implementation; they are kept for interface compatibility.

  Returns:
    List of price-per-m2 values produced by ofertaM2Image.

  Raises:
    FileNotFoundError: if the screenshot cannot be read.
  """
  #This would be captured by automatic screenshot of webpage using firefox python web driver
  imagePath = os.path.join(dataPath,'data/Test.png')
  frame = cv2.imread(imagePath)
  # cv2.imread signals a missing/unreadable file by returning None rather
  # than raising, so check before handing the frame to the detector.
  if frame is None:
    raise FileNotFoundError('Could not read offer screenshot: ' + imagePath)
  # NOTE(review): cv2.imread yields BGR channel order - confirm this matches
  # what the detector was trained on.
  #We need to improve our AI for getting more features
  return ofertaM2Image(frame, model)

In [16]:
def obtnerIntervaloConfianzaM2(PreciosM2InmSimilaresHistoricos, PreciosM2InmSimilaresOferta, NivelConfianza=0.95):
  """Bootstrap a confidence interval for the mean $/m2 of similar properties.

  Historical and currently offered prices are pooled into a single sample,
  which is resampled 1000 times at full size with np.mean as the statistic.

  Returns:
    [lower, upper] bounds of the interval at level NivelConfianza.
  """
  pooledPrices = PreciosM2InmSimilaresHistoricos + PreciosM2InmSimilaresOferta
  lowBound, highBound = bootstrap(
      pooledPrices,
      confidence=NivelConfianza,
      iterations=1000,
      sample_size=1.0,
      statistic=np.mean,
  )
  return [lowBound, highBound]

# Input

In [17]:
DI = 'Cra. 4 # 22-61'         # property address (DI)
VCI = 4e6                     # purchase value per m2 of the property (VCI)
A = 70                        # area (A)
NC = 3                        # number of rooms (NC)
NB = 2                        # number of bathrooms (NB)
NG = 1                        # number of parking spaces (NG)
NP = 1                        # number of floors (NP)
E = 4                         # socio-economic stratum (E)
TI = 'casa'                   # property type (TI)
PET = None                    # polygon of Transmilenio stations (PET)
PCC = None                    # polygon of shopping centres (PCC)
PP = None                     # polygon of parks (PP)
PC = None                     # polygon of CAIs (PC)
DVI = 'data/train_data.csv'   # property sales dataset (DVI), path relative to dataPath
PWOI = 'www.fincaraiz.com'    # property listings web page (PWOI)

In [18]:
ValorizacionAnual = 1.04

## 1. Geodecoding address

In [19]:
lat,lon = geodecoder(DI)
print(lat,lon)

4.5799733 -74.0826426


## 2. Historical $M2 (Demand) of houses

In [None]:
dfDVI = pd.read_csv(os.path.join(dataPath,DVI),sep=',',decimal='.')
dfDVI = preprocessData(dfDVI)

In [21]:
PreciosM2InmSimilaresHistoricos = identificarInmSimilaresHist(lat, lon, dfDVI, A, NC,NB,NG,NP,E,TI, PET,PCC,PP,PC)
print(PreciosM2InmSimilaresHistoricos)

[3714285.714285714, 4285714.285714285, 4071428.5714285714, 4928571.428571428, 4204285.714285715]


## 3. Current $M2 (Supply) of houses

In [22]:
PreciosM2InmSimilaresOferta = identificarInmSimilaresOfertados(lat, lon, PWOI, A, NC,NB,NG,NP,E,TI, PET,PCC,PP,PC)
print(PreciosM2InmSimilaresOferta)

[1833333.3333333333, 3600000.0, 3700000.0, 3525000.0]


## 4. Confidence Interval

In [23]:
IntervaloConfianzaM2 = obtnerIntervaloConfianzaM2(PreciosM2InmSimilaresHistoricos, PreciosM2InmSimilaresOferta, NivelConfianza=0.95)
print(IntervaloConfianzaM2)

[3186061.5079365075, 4241684.523809523]


## 5. Decision

In [24]:
# Decision rule: when the purchase value per m2 lies inside the bootstrap
# interval, the estimate is the interval's upper bound; otherwise the
# purchase value is scaled by 1.03 and by the annual appreciation factor.
if IntervaloConfianzaM2[0] <= VCI <= IntervaloConfianzaM2[-1]:
  PM2 = IntervaloConfianzaM2[-1]
  print('VCI in confidence interval')
else:
  PM2 = VCI * 1.03 * ValorizacionAnual
  print('VCI not in confidence interval')
print('Then PM2 = %d $/m2'%(round(PM2)))

VCI in confidence interval
Then PM2 = 4241685 $/m2


# The END

### Notebook written by: 
* Anamaría Torres Orduz
* Alejandra Torres Orduz
* Carlos Diego Ferrin Bolaños

April, 2021
