In [9]:
%load_ext autoreload
%autoreload 2
import requests
import re
from datetime import datetime
from src.utils import set_driver, extract_data_from_url
import requests
from bs4 import BeautifulSoup

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
driver = set_driver()

In [2]:
base_url = "https://www.centris.ca/fr/"
url_bien = base_url + "triplex~a-vendre~montreal-rosemont-la-petite-patrie/26999986?view=Summary&uc=1"

In [3]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

response = requests.get(url_bien, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# inspect html
html_content = soup.prettify()
print(html_content)

In [11]:
centris_url_data = extract_data_from_url(url_bien)
print(centris_url_data)

{'ville': 'Montreal', 'quartier': 'Rosemont La Petite Patrie', 'centris_id': '26999986'}


In [None]:
# Data model to store the extracted information
bien_centrics = {
    'centris_id': None,
    'adresse': None,
    'ville': None,
    'quartier': None,
    'annee_construction': None,
    'superficie_habitable': None,
    'superficie_terrain': None,
    'prix': None,
    'eval_municipale': None,
    'taxes': None,
    'frais': 0,
    'garage': 0,
    'date_scrape': datetime.now().strftime('%Y-%m-%d'),
    'nombre_unites': None,
    'unites': [],
    'misc': None,
    'revenus_brut': None,
    'description': None
}

In [None]:
# data from URL
centris_url_data = extract_data_from_url(url_bien)
bien_centrics['centris_id'] = centris_url_data['centris_id']
bien_centrics['ville']= centris_url_data['ville']
bien_centrics['quartier'] = centris_url_data['quartier']

# data from HTML
price_span = soup.find('span', {'id': 'BuyPrice'})
if price_span:
    bien_centrics['prix'] = int(re.sub(r'[^0-9]', '', price_span.string))

# Extracting description
description_div = soup.find('div', {'itemprop': 'description'})
if description_div:
    bien_centrics['description'] = description_div.string.strip()

# Extracting address
address_h2 = soup.find('h2', {'itemprop': 'address'})
if address_h2:
    bien_centrics['adresse'] = address_h2.string.strip()

# Extracting construction year
construction_div = soup.find('div', string='Année de construction')
if construction_div and construction_div.find_next_sibling('div', class_='carac-value'):
    bien_centrics['annee_construction'] = int(construction_div.find_next_sibling('div', class_='carac-value').text.strip())

# Extracting land area
land_area_div = soup.find('div', string='Superficie du terrain')
if land_area_div and land_area_div.find_next_sibling('div', class_='carac-value'):
    land_area_text = land_area_div.find_next_sibling('div', class_='carac-value').string.strip()
    bien_centrics['superficie_terrain'] = int(re.sub(r'[^0-9]', '', land_area_text))

# Extracting garage
garage_div = soup.find('div', string='Stationnement total')
if garage_div and garage_div.find_next_sibling('div', class_='carac-value'):
    garage_text = garage_div.find_next_sibling('div', class_='carac-value').string.strip()
    match = re.search(r'Garage \((\d+)\)', garage_text)
    if match:
        bien_centrics['garage'] = int(match.group(1))

# Extracting number of units
units_div = soup.find('div', string='Nombre d’unités')
if units_div and units_div.find_next_sibling('div', class_='carac-value'):
    units_text = units_div.find_next_sibling('div', class_='carac-value').string.strip()
    match = re.search(r'\((\d+)\)', units_text)
    if match:
        bien_centrics['nombre_unites'] = int(match.group(1))

# Extracting residential units
residential_units_div = soup.find('div', class_='carac-title', string=lambda s: s and 'Unités résidentielles' in s)
if residential_units_div and residential_units_div.find_next_sibling('div', class_='carac-value'):
    units_text = residential_units_div.find_next_sibling('div', class_='carac-value').find('span', {'data-id': 'NbUniteFormatted'}).text.strip()
    bien_centrics['unites'] = [unit.strip() for unit in units_text.split(',')]

# Extracting municipal evaluation
eval_total_div = soup.find('tr', class_='financial-details-table-total')
if eval_total_div and eval_total_div.find('td', class_='font-weight-bold text-right'):
    bien_centrics['eval_municipale'] = int(re.sub(r'[^0-9]', '', eval_total_div.find('td', class_='font-weight-bold text-right').text.strip()))

# Extracting taxes
yearly_taxes_div = soup.find('div', class_='financial-details-table-yearly')
taxes_total_div = yearly_taxes_div.find_next('tfoot').find('tr', class_='col pl-0 financial-details-table-total')
if taxes_total_div and taxes_total_div.find('td', class_='font-weight-bold text-right'):
    bien_centrics['taxes'] = int(re.sub(r'[^0-9]', '', taxes_total_div.find('td', class_='font-weight-bold text-right').string.strip()))

# Extracting potential gross revenue
revenus_div = soup.find('div', string='Revenus bruts potentiels')
if revenus_div and revenus_div.find_next_sibling('div', class_='carac-value'):
    revenus_text = revenus_div.find_next_sibling('div', class_='carac-value').string.strip()
    bien_centrics['revenus_brut'] = int(re.sub(r'[^0-9]', '', revenus_text))

# Extracting additional characteristics
misc_div = soup.find('div', string='Caractéristiques additionnelles')
if misc_div and misc_div.find_next_sibling('div', class_='carac-value'):
    bien_centrics['misc'] = misc_div.find_next_sibling('div', class_='carac-value').string.strip()

# Output the extracted data
for key, value in bien_centrics.items():
    print(f"{key}: {value}")

centris_id: 26999986
adresse: 4007 - 4011, boulevard Rosemont, Montréal (Rosemont/La Petite-Patrie), Quartier Rosemont Nord
ville: Montreal
quartier: Rosemont-La Petite-Patrie
annee_construction: 1959
superficie_habitable: None
superficie_terrain: 2400
prix: 899000
eval_municipale: 759100
taxes: 5450
frais: 0
garage: 1
date_scrape: 2024-12-02
nombre_unites: 4
unites: ['3 x 3 ½', '1 x 5 ½']
misc: À proximité du REM
revenus_brut: 51240
description: Emplacement de choix pour ce magnifique triplex au coeur de Rosemont ! Idéal pour propriétaires occupants ou investisseurs, il est bien entretenu au fil des ans et offre des revenus intéressants. Profitez d'une grande cour, d'excellents locataires et de tous les services à proximité. Le sous-sol est entièrement aménagé en bachelor. Situé près du métro, du parc botanique, de la clinique médicale, des écoles, et d'attraits comme le Jardin Botanique, le Parc Olympique, et le Biodôme. Revenus locatifs bruts de 51 840 $. Ne manquez pas cette opport

In [None]:
units_residential_div = soup.find('div', class_='carac-title', string=lambda s: s and 'Unités résidentielles' in s)
if units_residential_div and units_residential_div.find_next_sibling('div', class_='carac-value'):
    units_text = units_residential_div.find_next_sibling('div', class_='carac-value').find('span', {'data-id': 'NbUniteFormatted'}).text.strip()
    print([unit.strip() for unit in units_text.split(',')])



4
['3 x 3 ½', '1 x 5 ½']


In [44]:
units_residential_div = soup.find('div', class_='carac-title', string=lambda t: t and 'Unités résidentielles' in t)
units_residential_div

<div class="carac-title">
Unités résidentielles                </div>

In [35]:
units_div = soup.find('div', string='Nombre d’unités')
if units_div and units_div.find_next_sibling('div', class_='carac-value'):
    units_text = units_div.find_next_sibling('div', class_='carac-value').string.strip()
    match = re.search(r'\((\d+)\)', units_text)
    if match:
        print(int(match.group(1)))



4


In [43]:
units_residential_div = soup.find('div', class_='carac-title', text=lambda t: t and 'Unités résidentielles' in t)
units_residential_div

  units_residential_div = soup.find('div', class_='carac-title', text=lambda t: t and 'Unités résidentielles' in t)


<div class="carac-title">
Unités résidentielles                </div>