In [4]:
#import of all used libraries

import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import json
import sys
import simplejson
import asyncio
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import urllib
import re

#bezreaWebScrape.py file contains class bezrea_Web from which individual functions are used to scrape content from web 
from bezreaWebScrape import bezrea_Web as bz

In [5]:
#connection set up to the url with links to all offers
from requests.exceptions import HTTPError

# Download the marker listing (one record per offer) from the bezrealitky API.
# The timeout stops the cell from hanging forever when the server never answers.
url = 'https://www.bezrealitky.cz/api/record/markers?offerType=prodej&estateType=byt&locationInput=Praha'
try:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
except HTTPError as http_err:
    # `response` still holds the failed reply here; the following cells will try to parse it.
    print(f'HTTP error occurred: {http_err}')
except Exception as err:
    # Connection errors and timeouts land here; `response` is not assigned by this cell then.
    print(f'Other error occurred: {err}')
else:
    print('Success!')

Success!


In [6]:
#saving response in json to a "urls" variable
urls = response.json()

In [7]:
#pretty-printed JSON text of the API response
#the default separators are kept so the text stays valid JSON (a " = " between key and value is not)
wjdata = json.dumps(response.json(), indent=4)

In [8]:
#saves this data in the api-output file
#the context manager closes the file even when the write raises
with open("api-output.json", "w", encoding="utf-8") as f:
    f.write(wjdata)

In [9]:
#collects the uri of every record whose type is "iDeveloper" - links to offers offered by developer accounts
uriDeveloper = [record['uri'] for record in urls if record['type'] == 'iDeveloper']

In [10]:
#this is a nicer list just for demonstration
developerLinks = list(uriDeveloper)

In [11]:
#collects the uri of every record whose type is an empty string - links to private offers
uriPrivate = [record['uri'] for record in urls if record['type'] == '']

In [12]:
#prefixes every private uri with the listing base address to get a full offer url
#assumes every uri is a str - TODO confirm against the API payload
privateLinks = ['https://www.bezrealitky.cz/nemovitosti-byty-domy/' + uri for uri in uriPrivate]

In [13]:
#take a small sample from privateLinks because of original sample being too large to process fast
test = privateLinks[:100]

In [15]:
#district of every sampled offer, kept as a plain list and as a numpy array
districts = [bz.parseDistrict(link) for link in test]
numDistricts = np.array(districts)

In [None]:
#region of every sampled offer, kept as a plain list and as a numpy array
regions = [bz.parseRegion(link) for link in test]
numRegions = np.array(regions)

In [None]:
#type of every sampled offer, kept as a plain list and as a numpy array
types = [bz.parseType(link) for link in test]
numTypes = np.array(types)

In [None]:
#class valuesPrcSize is defined in this notebook instead of bezreaWebScrape because of difficulties caused 
#by the use of the "re" library in a separate file
class valuesPrcSize:
    '''
    Scrapers for the price, size and coordinates of a single offer page.

    Every parse* method accepts both call styles:
      * on the class, passing the offer link:  valuesPrcSize.parsePrice(link)
      * on an instance built from the link:    valuesPrcSize(link).parsePrice()
    '''
    # Row labels read from column 1 of the first table with class "table".
    # The page layout itself is not visible from this notebook, so these are
    # the positions the scraper has always used - adjust if the site changes.
    SIZE_ROW = 3
    PRICE_ROW = 4

    def __init__(self,link,allowLog=True):
        '''
        Download the offer page behind "link" and keep its parsed HTML in self.soup.

        link     -- url of the offer
        allowLog -- when True, print 'Success!' once the page has been parsed
        '''
        self.allowLog = allowLog
        self.link = link
        r = requests.get(link)
        r.encoding='UTF-8'
        self.soup = BeautifulSoup(r.text,'lxml')
        if self.allowLog:
            print('Success!')

    @staticmethod
    def _linkOf(target):
        '''Return the offer link for either a valuesPrcSize instance or a plain link.'''
        if isinstance(target, valuesPrcSize):
            return target.link
        return target

    @staticmethod
    def _digitGroups(text):
        '''Return every run of digits found in "text", in order of appearance.'''
        return re.findall(r"\d+", text)

    @staticmethod
    def _coordinatesFromSrc(src):
        '''
        Turn an iframe "src" value into [lat, lng] strings.

        Only the part between the first and second "=" is inspected; its first two
        digit runs form the latitude and the next two the longitude.
        '''
        groups = valuesPrcSize._digitGroups(src.split("=")[1])
        return ['.'.join(groups[0:2]), '.'.join(groups[2:4])]

    @staticmethod
    def _detailCell(target, row):
        '''Fetch the offer page and return the cell at column 1, row "row" of its first "table".'''
        pdTbl = pd.read_html(valuesPrcSize._linkOf(target),attrs= {"class":"table"})
        return pdTbl[0][1][row]

    def parsePrice(self):
        '''
        extraction of price of an offer as an integer
        "self" is either an instance or the link to an offer
        raises ValueError when the price cell contains no digits
        '''
        cell = valuesPrcSize._detailCell(self, valuesPrcSize.PRICE_ROW)
        # the digit groups are glued back together, e.g. "5 490 000" -> 5490000
        return int(''.join(valuesPrcSize._digitGroups(cell)))

    def parseSize(self):
        '''
        extraction of size of an offer as an integer
        "self" is either an instance or the link to an offer
        raises IndexError when the size cell contains no digits
        '''
        cell = valuesPrcSize._detailCell(self, valuesPrcSize.SIZE_ROW)
        # only the first digit group is the size
        return int(valuesPrcSize._digitGroups(cell)[0])

    def parseCoordinates(self):
        '''
        extraction of coordinates of an offer as a [lat, lng] list of strings
        "self" is either an instance or the link to an offer
        the values are read from the "src" of the second iframe on the page
        '''
        # "import urllib" alone does not guarantee that the request submodule is loaded
        import urllib.request
        if isinstance(self, valuesPrcSize):
            # the page was already downloaded and parsed by __init__
            soup = self.soup
        else:
            # close the connection as soon as the page has been parsed
            with urllib.request.urlopen(self) as web:
                soup = BeautifulSoup(web,'lxml')
        src = soup.findAll("iframe")[1]['src']
        return valuesPrcSize._coordinatesFromSrc(src)

In [None]:
#[lat, lng] pair of every sampled offer
coordinates = [valuesPrcSize.parseCoordinates(link) for link in test]

In [None]:
#price of every sampled offer, kept as a plain list and as a numpy array
prices = [valuesPrcSize.parsePrice(link) for link in test]
numPrices = np.array(prices)

In [None]:
#size of every sampled offer, kept as a plain list and as a numpy array
sizes = [valuesPrcSize.parseSize(link) for link in test]
numSizes = np.array(sizes)

SIZE STATISTICS

In [None]:
#average size of offerings
np.average(numSizes)

In [None]:
#median size of offerings
np.median(numSizes)

In [None]:
#biggest size from offerings
np.max(numSizes)

In [None]:
#smallest size of an offer
np.min(numSizes)

In [None]:
#average price of offers
np.average(numPrices)

In [None]:
#median price of offers
str(np.median(numPrices)) + ' Kč'

In [None]:
#largest price from all offerings
str(np.max(numPrices)) + ' Kč'

In [None]:
#lowest price from all offerings
np.min(numPrices)

In [None]:
#standard deviation of offer prices
np.std(numPrices)

In [None]:
#PRICE PER METER SQUARED STATISTICS

In [None]:
#calculation of price per meter squared
pricePerMtr = np.divide(numPrices,numSizes)

In [None]:
#average price per meter
np.average(pricePerMtr)

In [None]:
#median of prices per meter
np.median(pricePerMtr)

In [None]:
#largest price per meter
np.max(pricePerMtr)

In [None]:
#Lowest price per meter
np.min(pricePerMtr)

In [None]:
#standard deviation of price per meter
np.std(pricePerMtr)

VISUALIZATIONS

visualizations of prices and sizes

In [None]:
#scatter plot showing the relation between price and size of an offer
plt.scatter(numPrices,numSizes)

In [None]:
#bar graph of distribution of sizes of offers
sns.distplot(numSizes)
plt.show()

In [None]:
#bar graph of distribution of price per meter of offers
sns.distplot(pricePerMtr)

In [None]:
#combination of scatter plot and distributional bar graphs of prices and sizes of offers
#x and y are passed by keyword and the figure size through "height" (the current name of the former "size" argument)
sns.jointplot(x=numPrices, y=numSizes, height=10)
plt.show()

VISUALIZATIONS of types and districts

In [None]:
#bar graph of count of different types of flats being offered
#the vector is passed as x= because current seaborn treats the first positional argument as "data"
sns.countplot(x=types)

In [None]:
#bar graph of count of offers in different regions
#the vector is passed as x= because current seaborn treats the first positional argument as "data"
plt.figure(figsize=(28,14))
sns.countplot(x=regions)
plt.show()

In [None]:
#bar graph of count of offers in different districts
#the vector is passed as x= because current seaborn treats the first positional argument as "data"
plt.figure(figsize=(100,34))
sns.countplot(x=districts)
plt.show()
#unfortunately, the image cannot be seen well in Jupyter Notebook. We recommend opening the image in a new tab

In [None]:
#boxplot for prices by types of offers
#x and y are passed by keyword; current seaborn no longer accepts them positionally
plt.figure(figsize=(15,10))
ax = sns.boxplot(x=types, y=prices)
ax.tick_params(labelsize=13)
plt.show()

In [None]:
#boxplot of prices per meter squared by districts
#again, we recommend opening the image in a new tab
#x and y are passed by keyword; current seaborn no longer accepts them positionally
plt.figure(figsize=(35,10))
ax = sns.boxplot(x=districts, y=pricePerMtr)
# rotated, right-aligned labels keep the district names from overlapping
ax.set_xticklabels(ax.get_xticklabels(),rotation=50,ha="right")
ax.tick_params(labelsize=13)
plt.show()