In [1]:
import requests
import pandas as pd
import sqlalchemy
from bs4 import BeautifulSoup
import re

### Web Scrapping

In [2]:
url = 'https://www.otodom.pl/sprzedaz/mieszkanie/warszawa/'
params = {'search[filter_float_price:to]':'400000',
          'search[filter_float_m:from]': '25', 
          'search[description]': '1', 
          'search[order]': 'created_at_first:desc',
          'search[dist]': '0',
          'search[subregion_id]': '197', 
          'search[city_id]': '26',
          'nrAdsPerPage': '72',
          'page': '1'}

In [3]:
r = requests.get(url, params=params)
r_html = r.text
soup = BeautifulSoup(r_html, 'html.parser')
r.url

'https://www.otodom.pl/sprzedaz/mieszkanie/warszawa/?search%5Bfilter_float_price%3Ato%5D=400000&search%5Bfilter_float_m%3Afrom%5D=25&search%5Bdescription%5D=1&search%5Border%5D=created_at_first%3Adesc&search%5Bdist%5D=0&search%5Bsubregion_id%5D=197&search%5Bcity_id%5D=26&nrAdsPerPage=72'

In [4]:
offer_items = soup.find_all('div','offer-item-details')

In [5]:
re_district = re.compile(r'(Mieszkanie na sprzedaż: Warszawa,) (.*)')
re_digits = re.compile(r'\s*(\d*\s*\d*)\s*(.*)')

In [6]:
re_digits.search('383 546,46 zł').group(1)

'383 546'

In [7]:
offers = dict()
for num, offer in enumerate(offer_items, start=1):
    title = offer.find('span', 'offer-item-title').text.strip()
    district = re_district.search(offer.find('p', 'text-nowrap hidden-xs').text).group(2).strip()
    room = re_digits.search(offer.find('li', 'offer-item-rooms hidden-xs').text).group(1).strip()
    price = re.sub(' ', '', re_digits.search(offer.find('li', 'offer-item-price').text).group(1)).strip()
    area = re.sub(' ', '', re_digits.search(offer.find('li', 'hidden-xs offer-item-area').text).group(1)).strip()
    price_per_m = re.sub(' ', '', re_digits.search(offer.find('li', 'hidden-xs offer-item-price-per-m').text).group(1)).strip()
    offers[num] = {'title': title, 'district': district, 'rooms': room, 'price': price, 'area': area,
                  'price_per_m': price_per_m}

In [8]:
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
import psycopg2
from sqlalchemy import Column, Integer, String, Date

### Data Base

In [23]:
engine = create_engine('postgresql+psycopg2://postgres:[password]@localhost:5432/postgres')

In [10]:
Base = declarative_base()

class Flat(Base):
    '''Create flat table structure'''
    __tablename__ = 'flats_warsaw'
    
    id = Column(Integer, primary_key=True)
    title = Column(String)
    district = Column(String)
    room = Column(Integer)
    price = Column(Integer)
    area = Column(Integer)
    price_per_m = Column(Integer)
    
    def __repr__(self):
        return f"<DataProcessSQL(id={self.id}, title={self.title}, district={self.room}, price={self.price}, " \
            "area={self.area}, price_per_m={self.price_per_m}>"

In [11]:
Base.metadata.create_all(engine)

In [20]:
from sqlalchemy.orm import sessionmaker
Session = sessionmaker(bind=engine)
session = Session()

In [21]:
for key, value in offers.items():
    flat = Flat(title=offers[key]['title'],
               district=offers[key]['district'],
               room=offers[key]['rooms'],
               price=offers[key]['price'],
               area=offers[key]['area'],
               price_per_m=offers[key]['price_per_m'])
    session.add(flat)
session.commit()
session.close()

In [22]:
engine.execute('''DELETE FROM flats_warsaw a
                USING flats_warsaw b
                WHERE a.id > b.id and a.title = b.title''')

<sqlalchemy.engine.result.ResultProxy at 0x7fe27c267e48>