In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# URL template of a single job posting; the `{}` placeholder is filled with
# the posting's job id (see read_job).
kozig_uri = u'https://kozigallas.gov.hu/pages/jobviewer.aspx?ID={}'
# Browser-like User-agent string sent as the `User-agent` header by
# init_session and get_page.
chrome_header = ('Mozilla/5.0 (Windows NT 6.2; Win64; x64) '
                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                 'Chrome/32.0.1667.0 Safari/537.36')

In [None]:
def init_session():
    """Create a requests session and warm it up on the public search page.

    The initial GET is issued only for its effect on the session (whatever
    cookies the server sets are kept on the session object); the response
    itself is not needed, and its status is deliberately not checked.

    Returns:
        requests.Session: the open session. The caller owns it and should
        close it (for example by using it as a context manager).

    Raises:
        requests.RequestException: if the warm-up request fails. The session
        is closed before the exception propagates so it does not leak.
    """
    session = requests.Session()
    try:
        session.get('https://kozigallas.gov.hu/publicsearch.aspx',
                    headers={'User-agent': chrome_header})
    except Exception:
        # Do not leave a half-initialised session open on failure.
        session.close()
        raise
    return session

In [None]:
def get_page(session, url):
    """GET `url` through `session`, sending the browser-like User-agent.

    Args:
        session: object exposing a requests-style `get(url, headers=...)`.
        url: address to fetch.

    Returns:
        Whatever `session.get` returns (the HTTP response).
    """
    request_headers = {'User-agent': chrome_header}
    return session.get(url, headers=request_headers)

In [None]:
def get_joblists(session):
    """Build the list of search-result page URLs.

    Downloads the public search page, reads the page count from the
    page-count label (the number after the last ':' in its text) and
    returns one URL per result page, numbered from 1.

    Args:
        session: session object passed on to get_page.

    Returns:
        list: result-page URLs for pages 1..page count, in order.

    Raises:
        AttributeError: if the page-count label is not found in the page.
        ValueError: if the label text does not end in an integer.
    """
    search_page = get_page(session,
                           'https://kozigallas.gov.hu/publicsearch.aspx')
    soup = BeautifulSoup(search_page.content, 'html.parser')

    counter_id = ('ctl00_ContentPlaceHolder1_'
                  'JobSearchForm1_JobList1_lblPageCount2')
    counter_text = soup.find('span', {'id': counter_id}).getText()
    max_page = int(counter_text.split(':')[-1].strip())
    print('Found {} page.'.format(max_page))

    search_uri = u'https://kozigallas.gov.hu/publicsearch.aspx?p={}'
    joblists = []
    for page in range(1, max_page + 1):
        joblists.append(search_uri.format(page))
    return joblists

In [None]:
def extract_jobs(response):
    """Parse one search-result page into a DataFrame of job postings.

    The first table inside the `div.joblist` element is read with
    `pd.read_html` (first row as header), its last two columns are dropped,
    and a `job_id` column is added from the `href` of every
    `a.jobapplication` anchor (the text between the first pair of single
    quotes in the href).

    Args:
        response: HTTP response whose `.content` holds the page HTML.

    Returns:
        pandas.DataFrame: one row per posting, plus the `job_id` column.

    Raises:
        AttributeError: if the page has no `div.joblist` element.
        ValueError: if the number of anchors differs from the number of
            table rows (the column assignment length check fails).
    """
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('div', {'class': 'joblist'}).find('table')
    df = pd.read_html(table.prettify(), header=0)[0]
    # Take an explicit copy of the column subset: assigning a new column to
    # a slice of another frame triggers pandas' SettingWithCopyWarning.
    df = df.iloc[:, :-2].copy()
    df['job_id'] = [anchor.get('href').split("'")[1]
                    for anchor in soup.find_all('a', {'class': 'jobapplication'})]
    return df

In [None]:
def read_job(session, job_id):
    """Download one job posting and return its text.

    Writes a '. ' progress marker to stdout, fetches the posting page for
    `job_id` through get_page (so the request carries the same User-agent
    as every other request in this notebook), and joins the stripped text
    of all `p.MsoNormal` paragraphs with single spaces.

    Args:
        session: session object passed on to get_page.
        job_id: identifier substituted into the `kozig_uri` template.

    Returns:
        str: the concatenated paragraph text; empty if no paragraph matches.
    """
    import sys

    # sys.stdout.write works on both Python 2 and 3, unlike the
    # `print '.',` statement, which is a syntax error on Python 3.
    sys.stdout.write('. ')
    sys.stdout.flush()

    response = get_page(session, kozig_uri.format(job_id))
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.find_all('p', {'class': 'MsoNormal'})
    return ' '.join([paragraph.getText().strip() for paragraph in paragraphs])

In [None]:
def scrape():
    """Download every job posting listed on the site into one DataFrame.

    Steps, all inside a single session that is closed on exit:
    1. fetch every search-result page and parse it with extract_jobs;
    2. concatenate the per-page frames;
    3. fetch the full text of each posting with read_job and store it in
       a `text` column.

    Returns:
        pandas.DataFrame: all postings with their `job_id` and `text`.
    """
    with init_session() as session:
        print('joblist download')
        page_frames = []
        for joblist in get_joblists(session):
            page_frames.append(extract_jobs(get_page(session, joblist)))

        print('df concat')
        jobs = pd.concat(page_frames)

        print('downloading {} jobs'.format(len(jobs)))

        def fetch_text(row):
            # One HTTP request per posting.
            return read_job(session, row['job_id'])

        jobs['text'] = jobs.apply(fetch_text, axis=1)
        print('done.')
    return jobs

In [None]:
# Run the full scrape. This is the slow cell: scrape() issues one HTTP
# request per search-result page and one per job posting.
df = scrape()

In [None]:
# Persist the scraped postings as CSV and as an Excel workbook.
# `to_excel` gets no `encoding` argument: pandas never used it for .xlsx
# output and removed the keyword in pandas 2.0.
df.to_csv('data/kozigallas20170727.csv', encoding='utf-8')
df.to_excel('data/kozigallas20170727.xlsx')

---

# Processing data

In [None]:
import re
import pandas as pd
from collections import Counter

In [None]:
def clear(text):
    """Normalise the whitespace of a posting text.

    The value is converted to a text string, carriage returns and newlines
    are deleted (not replaced by a space), and every remaining run of
    whitespace is collapsed to one space. Non-breaking spaces count as
    ordinary whitespace here because `split()` treats them as such.

    Args:
        text: any value; it is converted with the interpreter's text type
            (`unicode` on Python 2, `str` on Python 3).

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str      # Python 3
    text = text_type(text)

    # The original also listed the byte literal '\xa0' as a character to
    # delete, but on Python 2 that never compares equal to a unicode
    # character, so non-breaking spaces were in fact handled by split().
    # That effective behaviour is kept here.
    cleared = text.replace(u'\r', u'').replace(u'\n', u'')
    return u' '.join(cleared.split())


def strip(text):
    """Lower-case `text` and trim whitespace and ',', '.', '-' from both ends.

    Trimming is repeated until nothing more can be removed, so the result
    does not depend on the order in which punctuation and whitespace appear
    at the ends (a single fixed-order pass turned 'foo,.' into 'foo,').
    Characters inside the text are left alone.

    Args:
        text: string to clean.

    Returns:
        The lower-cased string with clean ends; may be empty.
    """
    text = text.lower()
    while True:
        trimmed = text.strip().strip(',.-')
        if trimmed == text:
            return text
        text = trimmed


def process(itemlist):
    """Split a bulleted section into a list of cleaned items.

    The section is split on the bullet character u'•' and each fragment is
    cleaned with strip(). Fragments that are empty after cleaning (for
    example the text before the first bullet, or whitespace between two
    consecutive bullets) are dropped, so the result holds no '' entries.

    Args:
        itemlist: section text.

    Returns:
        list: the non-empty cleaned items, in their original order.
    """
    cleaned_items = (strip(item) for item in itemlist.strip().split(u'•'))
    return [item for item in cleaned_items if item]


# Maps a short section key to the Hungarian heading that introduces that
# section in a posting's text. generate_order() searches the text for these
# headings and extract() cuts the text at them.
# Approximate English meaning of the headings:
#   feladat      - main duties of the position / managerial appointment
#   illetmeny    - salary and benefits
#   feltetel     - application requirements
#   alt_palyazat - general application requirements
#   tul_palyazat - further requirements beyond the general ones
#   elony        - what counts as an advantage when applications are judged
#   igazolas     - documents and certificates to be submitted
#   kompetencia  - advantageous competencies
#   idopont      - date from which the position can be filled
splitters = {
    u'feladat': u'A munkakörbe tartozó, illetve a vezetői megbízással járó lényeges feladatok:',
    u'illetmeny': u'Illetmény és juttatások:', 
    u'feltetel': u'Pályázati feltételek:',
    u'alt_palyazat': u'Általános pályázati feltételek:',
    u'tul_palyazat': u'Az általános pályázati feltételeken túl a pályázóval szembeni további követelmény:',
    u'elony':    u'A pályázat elbírálásánál előnyt jelent:',
    u'igazolas': u'A pályázat részeként benyújtandó iratok, igazolások:', 
    u'kompetencia': u'Előnyt jelentő kompetenciák:',
    u'idopont': u'A munkakör betölthetőségének időpontja:'
}


def generate_order(text):
    """Return the keys of the section headings found in `text`, in text order.

    Each heading in `splitters` is located by its first occurrence; headings
    that do not occur are left out.

    Args:
        text: posting text to search.

    Returns:
        list: section keys sorted by the position of their heading.
    """
    found = []
    for key, heading in splitters.items():
        position = text.find(heading)
        if position != -1:
            found.append((key, position))
    # Sort on the position only; the sort is stable for equal positions.
    found.sort(key=lambda pair: pair[1])
    return [key for key, _ in found]
    
    
def extract(text):
    """Cut a posting text into its known sections.

    The text is normalised with clear(), and the headings present are
    ordered with generate_order(). Each section is the text between its
    heading and the next detected heading, turned into a list of items by
    process(). The last detected section runs to the end of the text.
    Previously that section was dropped and its key was missing from the
    result, so a text with a single heading produced no content at all.

    Args:
        text: raw posting text.

    Returns:
        dict: one entry per key of `splitters`; a list of items for sections
        found in the text, '' for sections that are absent.
    """
    text = clear(text)
    order = generate_order(text)
    data = {key: '' for key in splitters if key not in order}

    # Pair every found section with the one following it; the final section
    # has no successor (None) and therefore extends to the end of the text.
    for start, end in zip(order, order[1:] + [None]):
        section = text.split(splitters[start])[-1]
        if end is not None:
            section = section.split(splitters[end])[0]
        data[start] = process(section)

    return data


def unpack(df, column, fillna=None):
    """Expand a column of dicts into separate columns appended to `df`.

    Args:
        df: source DataFrame.
        column: name of the column whose values are dicts (records).
        fillna: if not None, value used to fill missing entries of the
            expanded columns only.

    Returns:
        pandas.DataFrame: `df` plus one column per key found in the dicts.
        The original `column` is kept.
    """
    # Build the expanded frame on df's own index so the column-wise concat
    # lines rows up even when df does not have a default 0..n-1 index.
    # Iterating the Series directly also avoids Series.iteritems(), which
    # was removed in pandas 2.0.
    ret = pd.DataFrame(list(df[column]), index=df.index)
    if fillna is not None:
        ret = ret.fillna(fillna)
    return pd.concat([df, ret], axis=1)

In [None]:
# Raw string: the Windows path contains backslash sequences (\w, \d, \k)
# that must be kept literally and are invalid escapes in a normal string.
data = pd.read_excel(r'C:\workspace\data\kozigallas20170525.xlsx')

In [None]:
# Split every posting text into its sections; each cell of `extracted`
# holds the dict returned by extract().
data['extracted'] = data.text.apply(extract)

## Top values

In [None]:
def flatten(items):
    """Split every item on commas and return all stripped parts as one list.

    Args:
        items: iterable of strings, or an empty/None value.

    Returns:
        list: the comma-separated parts of all items, in order, each with
        surrounding whitespace removed. An empty list for empty input, so
        the return type is always a list (the previous '' could not be
        concatenated with the lists returned for non-empty input).
    """
    if not items:
        return []
    return [part.strip() for item in items for part in item.split(',')]


def get_top_values(df, column, n=100):
    """Count the most frequent values in a column of item lists.

    Every non-empty cell of `df[column]` is flattened with flatten() and
    all resulting values are counted together.

    Args:
        df: DataFrame to read from.
        column: name of the column holding lists of strings.
        n: number of top entries to return.

    Returns:
        list: up to `n` `(value, count)` pairs, most frequent first.
    """
    # Feed the values to Counter one by one rather than building a single
    # list with sum(..., []), which copies the growing list for every row.
    occurrences = Counter(
        value
        for items in df[column].values if items
        for value in flatten(items)
    )
    return occurrences.most_common(n)


def isonkormanyzat(text):
    """Tell whether `text` names a local-government body.

    The check is a case-insensitive substring test for u'önkormányzat' or
    u'polgármesteri hivatal'.

    Args:
        text: publisher name to test.

    Returns:
        bool: True if either marker occurs in the lower-cased text.
    """
    markers = (u'önkormányzat', u'polgármesteri hivatal')
    return any(marker in text.lower() for marker in markers)


def clear_job(text):
    """Normalise a job title for counting.

    Removes every run of two or more digits together with the single
    character that follows it, removes the literal substring u'1 fő',
    then lower-cases the result and strips surrounding whitespace.

    Args:
        text: job title.

    Returns:
        The cleaned, lower-cased title.
    """
    # NOTE(review): the trailing '.' in the pattern is unescaped, so it
    # matches any character (and a number at the very end of the title is
    # not removed) - confirm whether a literal dot was intended.
    without_numbers = re.sub(r'[0-9][0-9]+.', u'', text)
    without_headcount = without_numbers.replace(u'1 fő', u'')
    return without_headcount.lower().strip()

In [None]:
# Expand the extracted sections into columns (missing sections become '').
preprocessed = unpack(data.reset_index(), 'extracted', '')
# Keep only postings whose publisher is a local-government body. The
# selection is copied because a later cell adds a `cleaned_job` column to
# it; writing to an un-copied selection raises SettingWithCopyWarning.
onkormanyzat = preprocessed.loc[
    preprocessed[u'Közzétevő'].fillna('').map(isonkormanyzat)
].copy()
top100 = get_top_values(onkormanyzat, 'kompetencia')

In [None]:
# Raw string: keeps the backslashes of the Windows path literal instead of
# relying on them being invalid (and therefore untouched) escape sequences.
(pd.DataFrame(top100, columns=['kompetencia', 'szamossag'])
   .to_excel(r'C:\workspace\data\onkormanyzat_kompetencia_20170525.xlsx'))

### Job title cleaning

In [None]:
# Add a normalised job title (see clear_job) next to the original one.
onkormanyzat['cleaned_job'] = onkormanyzat[u'Állás megnevezése'].map(clear_job)

In [None]:
# Count the cleaned job titles and export them, most frequent first.
# Raw string: keeps the backslashes of the Windows path literal instead of
# relying on them being invalid (and therefore untouched) escape sequences.
(pd.DataFrame(Counter(onkormanyzat.cleaned_job.values).most_common(), columns=['munkakor', 'szamossag'])
   .to_excel(r'C:\workspace\data\onkormanyzat_munkakor_20170525.xlsx'))