Import needed packages

In [8]:
import time
import unicodedata
import urllib.request
from pathlib import Path
from urllib.parse import parse_qs, urlparse

import docx
import folium
import geopy
import pandas as pd
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.enum.style import WD_STYLE_TYPE
from docx.shared import Inches
from docx.shared import Pt
from geopy.geocoders import Nominatim

Import some data for testing

In [2]:
# Load the test sample: one IČO (company identification number) per line.
# The `with` block closes the file even if reading fails, and blank lines
# are skipped so that no empty IČO ends up in the list.
with open('examples/top100_icos.txt', 'r') as text_icos:
    icos = [line.strip() for line in text_icos if line.strip()]

print(icos)

['00177041', '28356250', '45274649', '26185610', '25938002', '61672190', '27773035', '28477090', '63474808', '45788235', '60193531', '43872247', '49450301', '00014915', '70994226', '63080737', '26513528', '18050646', '64945880', '26463318', '49903209', '60193336', '27082440', '26450691', '49241214', '64949681', '29259428', '00268577', '45359326', '25654012', '25702556', '61672599', '48171131', '63473291', '00000931', '46995129', '26919389', '27386732', '27636801', '46678735', '15890554', '60193913', '27214265', '02176475', '00005886', '04084063', '14707420', '03592880', '01759299', '25663135', '48365289', '46711953', '05735025', '45357366', '25029673', '14803534', '47114983', '48038687', '29060770', '14706725', '14803534', '25077830', '49240030', '60193492', '27295567', '28987322', '60838744', '00534111', '26271303', '41189671', '14888742', '26128209', '26455137', '25133152', '28196678', '45193410', '28234642', '45274924', '26440181', '64833054', '28244532', '43005012', '26161516', '45

## Functions to obtain the HTML

In [15]:
def get_subjektID(ico):
    """Look up the ``subjektId`` that or.justice.cz uses for a company.

    The register's search page is queried for ``ico`` and the ``subjektId``
    query parameter is read from the 'Výpis platných' link of the result.

    Parameters
    ----------
    ico : str
        Company identification number (IČO) to search for.

    Returns
    -------
    str
        The ``subjektId`` value found in the link.

    Raises
    ------
    requests.HTTPError
        If the register answers with an HTTP error status.
    IndexError
        If the result page contains no 'Výpis platných' link with a
        ``subjektId`` parameter. The exception type is the one the previous
        implementation raised in this situation; only the message is new.
    """
    url_0 = f"https://or.justice.cz/ias/ui/rejstrik-$firma?ico={ico}&jenPlatne=PLATNE&polozek=1&typHledani=STARTS_WITH"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url_0, timeout=30)
    response.raise_for_status()
    soupID = BeautifulSoup(response.content, 'lxml', from_encoding="utf-8")

    vypis = soupID.find(href=True, string='Výpis platných')
    if vypis is None:
        raise IndexError(f"no 'Výpis platných' link found for IČO {ico!r}")

    # Parse the link target instead of splitting the serialised tag, so the
    # result does not depend on the order of the query parameters.
    subjekt_ids = parse_qs(urlparse(vypis['href']).query).get('subjektId')
    if not subjekt_ids:
        raise IndexError(f"'Výpis platných' link for IČO {ico!r} carries no subjektId")
    return subjekt_ids[0]
   
def get_url(subjektID):
    """Return the address of the register page listing the valid entries of ``subjektID``."""
    page = 'https://or.justice.cz/ias/ui/rejstrik-firma.vysledky'
    return '{}?subjektId={}&typ=PLATNY'.format(page, subjektID)

def get_soup(subjektID):
    """Download and parse the register extract page of one company.

    Parameters
    ----------
    subjektID : str
        The register's internal subject identifier (see ``get_subjektID``);
        it is passed to ``get_url`` to build the page address.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the ``lxml`` parser, decoded as UTF-8.

    Raises
    ------
    requests.HTTPError
        If the register answers with an HTTP error status, so that an error
        page is not parsed as if it were a company extract.
    """
    url = get_url(subjektID)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'lxml', from_encoding="utf-8")

## The function getBasicInfo obtains the five basic data points that are available for every company on justice.cz

In [10]:
def getBasicInfo(icos):
    """Scrape the five basic register fields for every company in ``icos``.

    Parameters
    ----------
    icos : list of str
        Company identification numbers (IČO). They become the index of the
        returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per IČO with the columns 'Název společnosti', 'Datum vzniku',
        'Spisová značka', 'Sídlo' and 'Právní forma'.

    Notes
    -----
    ``get_soup`` expects the register's internal subjektId, not the IČO, so
    each IČO is first resolved with ``get_subjektID``. The function sleeps one
    second after every company.
    """
    columns = ['Název společnosti', 'Datum vzniku', 'Spisová značka', 'Sídlo', 'Právní forma']
    df = pd.DataFrame(index=icos, columns=columns)
    for ico in icos:
        soup = get_soup(get_subjektID(ico))

        date = soup.find(string="Datum vzniku a zápisu:").find_next('div').find_next('div').text
        df.loc[ico, 'Datum vzniku'] = date

        spis = soup.find(string="Spisová značka: ").find_next('span').text
        df.loc[ico, 'Spisová značka'] = spis

        # The name is located by position rather than by its label, because the
        # label is sometimes 'Obchodní firma' and sometimes something else.
        nazev = soup.find(class_="nounderline").find_next(class_="nounderline").find_next('span').text
        df.loc[ico, 'Název společnosti'] = nazev

        # The address is wrapped in two nested spans.
        sidlo = soup.find(string="Sídlo: ").find_next('span').find_next('span').text
        df.loc[ico, 'Sídlo'] = sidlo

        pravni_forma = soup.find(string="Právní forma: ").find_next('span').text
        df.loc[ico, 'Právní forma'] = pravni_forma

        # Pause between companies: the register temporarily banned the
        # author's IP address when requests were sent without a delay.
        time.sleep(1)
    return df

# Scrape the basic fields for every IČO in `icos`; the bare `df` on the last
# line displays the resulting table as the cell output.
df = getBasicInfo(icos)
df

NameError: name 'icos' is not defined

In [11]:
def getCoordinates(df):
    """Add GPS coordinates derived from the 'Sídlo' (address) column.

    Each address is geocoded with Nominatim, first from its first two
    comma-separated parts (usually street with number, then city). If that
    finds nothing, only the first part is tried, because some addresses have
    the postal code in second position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Sídlo' column of address strings.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place, with new 'latitude' and
        'longitude' columns. Addresses that cannot be geocoded get NaN.
    """
    locator = Nominatim(user_agent='justice_scraper')
    found = {}

    def lookup(query):
        # Send every distinct query only once and reuse the answer. The pause
        # keeps the request rate at about one per second.
        if query not in found:
            found[query] = locator.geocode(query)
            time.sleep(1)
        return found[query]

    latitudes = []
    longitudes = []
    for sidlo in df.loc[:, 'Sídlo'].tolist():
        parts = [part.strip() for part in sidlo.split(',')]
        location = lookup(', '.join(parts[:2]))
        if not location:
            # For an address without a comma this repeats the same query,
            # which is answered from `found` without another request.
            location = lookup(parts[0])
        latitudes.append(location.latitude if location else float('nan'))
        longitudes.append(location.longitude if location else float('nan'))
    df['latitude'] = latitudes
    df['longitude'] = longitudes
    return df

# Add the 'latitude' and 'longitude' columns. getCoordinates writes them into
# `df` itself and returns that same frame; the bare `df` displays it.
df = getCoordinates(df)
df

NameError: name 'df' is not defined

In [30]:
def getMap(df, outfile="examples/map.html"):
    """Plot every located company as a marker and save the map as HTML.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'latitude', 'longitude' and 'Název společnosti' columns.
    outfile : str, optional
        Path of the HTML file to write. Defaults to the previously hard-coded
        ``"examples/map.html"``.

    Returns
    -------
    None

    Notes
    -----
    Rows without coordinates are skipped, so a company that could not be
    geocoded does not prevent the map from being drawn.
    """
    # Initial view; presumably centred on the Czech Republic (not verified here).
    map1 = folium.Map(
        location=[49.861464, 15.496766],
        tiles='cartodbpositron',
        zoom_start=8,
    )
    located = df.dropna(subset=['latitude', 'longitude'])
    rows = zip(located['latitude'], located['longitude'], located['Název společnosti'])
    for latitude, longitude, nazev in rows:
        folium.Marker(location=[latitude, longitude], popup=nazev).add_to(map1)
    map1.save(outfile=outfile)

# Draw one marker per company in `df` and write the map to an HTML file.
getMap(df)

## Obtaining more complex info that differs across companies

In [16]:
# `ico` was never assigned at notebook level, so this cell failed with a
# NameError on a fresh kernel. Use the first IČO of the test list as the
# example company; get_info below takes the 'IČO:' field from this variable.
ico = icos[0]
subjektID = get_subjektID(ico)

def get_info(subjektID, ico=None):
    """Scrape the complete register extract of one company into a dict.

    Parameters
    ----------
    subjektID : str
        The register's internal subject identifier (see ``get_subjektID``).
    ico : str, optional
        Company identification number stored under the 'IČO:' key. When it is
        omitted, the notebook-level variable ``ico`` is used, which is what
        this function always did before the parameter existed.

    Returns
    -------
    dict
        The first six keys are always 'Název společnosti:', 'Datum vzniku:',
        'Spisová značka:', 'Sídlo:', 'IČO:' and 'Právní forma:', each mapped to
        a string. Every further key is a section heading from the page mapped
        to a list of text lines. Headings of nested sections are indented by
        four spaces per nesting level, and a heading that occurs again gets
        '+' characters appended until it is unique.

    Raises
    ------
    ValueError
        If no IČO is passed and no notebook-level ``ico`` exists.
    """
    if ico is None:
        # Legacy behaviour: fall back to the notebook-level variable.
        ico = globals().get('ico')
    if ico is None:
        raise ValueError("get_info needs the company IČO: pass ico=... or define a notebook-level `ico`")

    soup = get_soup(subjektID)
    data = {}
    # First the basic information, which is formatted differently from the rest.
    # The name is located by position rather than by its label, because the
    # label is sometimes 'Obchodní firma' and sometimes something else.
    nazev = soup.find(class_="nounderline").find_next(class_="nounderline").find_next('span').text
    data['Název společnosti:'] = nazev

    date = soup.find(string="Datum vzniku a zápisu:").find_next('div').find_next('div').text
    data['Datum vzniku:'] = date

    spis = soup.find(string="Spisová značka: ").find_next('span').text
    data['Spisová značka:'] = spis

    # The address is wrapped in two nested spans.
    sidlo = soup.find(string="Sídlo: ").find_next('span').find_next('span').text
    data['Sídlo:'] = sidlo

    data['IČO:'] = str(ico)

    pravni_forma = soup.find(string="Právní forma: ").find_next('span').text
    data['Právní forma:'] = pravni_forma

    # Everything after 'Právní forma' is collected generically.
    vr_childs = soup.find(string="Právní forma: ").find_all_next(class_='vr-child')
    # Some companies (e.g. 45359326) have additional heading-less entries right
    # below 'Právní forma'; skip them so the loop starts at a real heading.
    while vr_childs and vr_childs[0].find(class_='nounderline').text == '':
        vr_childs = vr_childs[1:]
    keys = []
    space = '    '    # one indentation step per nesting level of a subsection
    for child in vr_childs:
        panel = child.find(class_='aunp-udajPanel')
        if panel is None:
            # Empty entries carry no information.
            continue
        parents_vrchild = child.find_parents(class_='vr-child')
        # Keep only the text-bearing spans that contain no other tag.
        spans = [span for span in panel.find_all('span')
                 if not span.find_all() and span.text.strip()]
        if not spans:
            continue
        first_span = spans[0]
        classes = first_span.get('class') or []
        if classes and classes[0] == 'nounderline':
            # A heading starts a new key.
            key = len(parents_vrchild)*space + first_span.text.strip()
            while key in keys:
                key = key + '+'
            spans = spans[1:]
        else:
            # No heading: the entry continues the previous key. This raises
            # IndexError if there is no previous key, as it did before.
            key = keys[-1]
        i = 0  # index of the span currently being processed
        info = []
        # Join spans that share a parent, so the text stays on the same line
        # as it is shown on justice.cz.
        while i < len(spans):
            span = spans[i]
            text = span.text
            try:
                span_next = spans[i+1]
            except IndexError:
                info.append(unicodedata.normalize("NFKD", text))
                break
            if span.parent == span_next.parent:
                while span.parent == span_next.parent:
                    if str(span.next.next) == '<br/>':   # a line break ends the current line
                        break
                    text = text + span_next.text
                    i += 1
                    span = spans[i]
                    try:
                        span_next = spans[i+1]
                    except IndexError:
                        break
                info.append(unicodedata.normalize("NFKD", text))
                i += 1
            else:
                info.append(unicodedata.normalize("NFKD", text))
                i += 1
        if key in keys:
            data[key].extend(info)
        else:
            data[key] = info
        keys.append(key)
    return data

In [None]:
def test_info(icos):
    for ico in icos:
        print(ico)
        get_info(ico)
        time.sleep(1)

# test_info(icos)

## Getting docx output

In [19]:
def get_vypis_doc(subjektID):
    """Write a Word extract for one company and download the official PDF.

    Parameters
    ----------
    subjektID : str
        Despite its name, this is the company IČO: that is what ``test_docs``
        passes, and the register's internal subjektId is resolved from it with
        ``get_subjektID``. The parameter name is kept so that existing calls
        keep working. Previously the argument was ignored altogether and a
        notebook-level ``ico`` was used instead.

    Returns
    -------
    None

    Side effects
    ------------
    Writes ``výpis_<company name>_.docx`` and ``pdf_výpis_<company name>_.pdf``
    to the current directory and sets the notebook-level variable ``ico``.
    """
    # get_info takes the 'IČO:' field from the notebook-level variable `ico`,
    # so the IČO is published there before get_info is called.
    global ico
    ico = subjektID
    subjektID = get_subjektID(ico)
    data = get_info(subjektID)
    keys = list(data.keys())
    # NOTE(review): the company name is used in file names unchanged; names
    # containing path separators would need sanitising.
    nazev = data[keys[0]]

    # Document and style settings.
    document = Document()

    normal = document.styles['Normal']
    normal.font.name = 'Calibri'
    normal.font.size = Pt(11)
    # Hanging indent: the label starts 1.4" left of the tab-aligned value.
    paragraph_format = normal.paragraph_format
    paragraph_format.first_line_indent = Inches(-1.4)
    paragraph_format.left_indent = Inches(1.4)
    paragraph_format.space_after = Inches(0)
    paragraph_format.space_before = Inches(0)

    light = document.styles.add_style('Light', WD_STYLE_TYPE.CHARACTER)
    light.font.name = 'Calibri Light'
    light.font.size = Pt(11)

    # The six basic fields (name, date of origin, file number, address, IČO,
    # legal form) are shown as "label <tab> value" on one line each.
    for position, key in enumerate(keys[:6]):
        p = document.add_paragraph(style=normal)
        p.add_run(key).bold = True
        if position == 0:
            # The company name is bold as well.
            p.add_run('\t' + data[key]).bold = True
        else:
            p.add_run('\t' + data[key], style=light)

    # 'Ostatní skutečnosti' and everything after it is not of interest.
    if 'Ostatní skutečnosti:' in keys:
        keys = keys[:keys.index('Ostatní skutečnosti:')]
    for key in keys[6:]:
        p = document.add_paragraph(style=normal)
        # '+' only makes repeated headings unique inside `data`.
        p.add_run(key.replace('+', '')).bold = True
        for value in data[key]:
            p.add_run('\n' + value, style=light)

    document.save(f'výpis_{nazev}_.docx')
    pdf_url = f'https://or.justice.cz/ias/ui/print-pdf?subjektId={subjektID}&typVypisu=PLATNY&full=false'
    pdf_filename = f'pdf_výpis_{nazev}_.pdf'
    urllib.request.urlretrieve(pdf_url, pdf_filename)

In [None]:
def test_docs(icos):
    i=1
    for ico in icos:
        print(f'{ico}+{i}')
        i += 1
        get_vypis_doc(ico)
        time.sleep(1)

# test_docs(icos)

## PDF file download


In [6]:
# One-off example: download the PDF extract for subjektId 215087 and store it
# as "filename.pdf". urlretrieve returns a (local filename, headers) tuple,
# which the notebook shows as the cell output.
import urllib.request
urllib.request.urlretrieve('https://or.justice.cz/ias/ui/print-pdf?subjektId=215087&typVypisu=PLATNY&full=false', "filename.pdf")

def getPDF(subjektID, pdf_filename=None):
    """Download the official PDF extract of one company.

    Parameters
    ----------
    subjektID : str
        The register's internal subject identifier used in the download URL.
    pdf_filename : str, optional
        Where to store the file. Defaults to ``pdf_výpis_<subjektID>.pdf``.
        The previous code took the name from an undefined variable
        ``nazev_spol`` and therefore always failed with a NameError.

    Returns
    -------
    str
        The path the PDF was written to.
    """
    url = f'https://or.justice.cz/ias/ui/print-pdf?subjektId={subjektID}&typVypisu=PLATNY&full=false'
    if pdf_filename is None:
        pdf_filename = f'pdf_výpis_{subjektID}.pdf'
    urllib.request.urlretrieve(url, pdf_filename)
    return pdf_filename

('filename.pdf', <http.client.HTTPMessage at 0x14940335f10>)

## Unused code below


In [None]:
def getObjects(soup):
    """Return the elements of class ``nounderline`` in ``soup`` whose text is not empty."""
    return [tag for tag in soup.findAll(class_="nounderline") if tag.text]

def getTextObjects(list):
    """Return the whitespace-stripped ``.text`` of every element of ``list``, in order."""
    return [element.text.strip() for element in list]

In [None]:
# older approach to get the additional info
def getInfo(ico):
    """Legacy scraper: collect the register extract of one company into a dict.

    The six basic fields are stored as strings under fixed keys. Every further
    heading found by ``getObjects`` (except the first five and the last one)
    becomes a key mapped to a list of text lines.

    NOTE(review): ``ico`` is passed straight to ``get_soup``, which forwards
    its argument to ``get_url`` as the subjektId - confirm whether an IČO is
    valid there or whether ``get_subjektID`` has to be called first.
    """
    soup = get_soup(ico)
    data = {}
    # first getting the basic information which will be formatted differently
    nazev = soup.find(class_="nounderline").findNext(class_="nounderline").findNext('span').text    # located by position, as the label is sometimes 'Obchodní firma' and sometimes different
    data['Název společnosti:'] = nazev

    date = soup.find(text="Datum vzniku a zápisu:").findNext('div').findNext('div').text
    data['Datum vzniku:'] = date

    spis = soup.find(text="Spisová značka: ").findNext('span').text
    data['Spisová značka:'] = spis

    sidlo = soup.find(text="Sídlo: ").findNext('span').findNext('span').text                      # address is in double span
    data['Sídlo:'] = sidlo

    data['IČO:'] = str(ico)

    pravni_forma = soup.find(text="Právní forma: ").findNext('span').text
    data['Právní forma:'] = pravni_forma

    # next get the rest of the information available, except the last one, which is ostatní skutečnosti
    list_objects = getObjects(soup)
    # the slice skips the first five headings and the last one
    for o in list_objects[5:-1]:
        span = o.findNext('span')
        info = []
        # walk forward span by span until the next heading is reached
        while span not in list_objects:
            if span: #checks if span exists
                if span.parent.name == 'p': #break the loop after the last span of our interest
                    break
                else:
                    span_child = span.findChildren()
                    while span_child: #iterates until it hits a span without child tags
                        span = span.findNext('span')
                        span_child = span.findChildren()
                    if span.text.strip(): #if the span includes some text continues
                        span_parent = span.parent
                        span_next = span.findNext('span')
                        text = span.text
                        if span_next: #concatenating spans that fall under the same parent so they are shown on the same line
                            # NOTE(review): span_next is only checked once before the loop;
                            # presumably findNext returns None after the last span, which
                            # would make `span_next.parent` fail - confirm.
                            while span_parent == span_next.parent:
                                text = text + span_next.text
                                span_next = span_next.findNext('span')
                                span = span.findNext('span')
                        info.append(text)
                        span = span.findNext('span')
                    else: span = span.findNext('span')
            else: break
        key = o.text.strip()
        # a single '+' is appended when the key exists already; a third
        # occurrence of the same heading would overwrite the second one
        if key in data:  #checks if key already exists in dictionary, so it wont replace it
            key = key + '+'
        data[key] = info
    return data

# Try the legacy scraper on one company; the bare `data` displays the
# resulting dict as the cell output.
data = getInfo(27082440) #45316872 svetozor
data