In [218]:
import requests
from lxml import html
import pandas as pd
import re
from datetime import datetime
import timeit

In [229]:
def clean_issue(x,y,z):
    """Regex-substitute inside ``z``, tolerating non-string input.

    Parameters
    ----------
    x : pattern handed to ``re.sub``.
    y : replacement handed to ``re.sub``.
    z : value to clean.

    Returns
    -------
    The result of ``re.sub(x, y, z)``, or the empty string when ``re.sub``
    raises ``TypeError`` (for example when ``z`` is ``None`` or a float
    rather than a string).
    """
    try:
        return re.sub(x, y, z)
    except TypeError:
        # Best-effort: a value that cannot be searched is treated as empty.
        return ''

def remove_text(x,y):
    """Delete every literal occurrence of ``x`` from ``y``.

    ``x`` is passed through ``re.escape`` first, so regex metacharacters in
    it are matched verbatim rather than interpreted.

    Parameters
    ----------
    x : text to remove.
    y : text to remove it from.

    Returns
    -------
    ``y`` with all occurrences of ``x`` removed, or the empty string when
    escaping or substituting raises ``TypeError`` (for example when either
    argument is ``None`` or a float NaN instead of a string).
    """
    try:
        literal_pattern = re.escape(x)
        return re.sub(literal_pattern, '', y)
    except TypeError:
        # Best-effort: non-string input on either side yields an empty result.
        return ''

In [220]:
# Fetch the first page of the resolution search and read the total number of
# result pages from the pager paragraph.
url = 'http://www.minhafp.gob.es/es-ES/Areas%20Tematicas/Contratacion/TACRC/Paginas/BuscadordeResoluciones.aspx?pagina=1'
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops us from parsing an HTTP error page as if it were results.
page = requests.get(url, timeout=30)
page.raise_for_status()
tree = html.fromstring(page.content)

# NOTE(review): the XPath is keyed on what looks like an auto-generated web-part
# id, so it will presumably break if the site is redeployed -- confirm.
pager_text = tree.xpath('//*[@id="ctl00_SPWebPartManager1_g_93bc4c3a_0f69_4097_bed1_978c8b545335"]/div/div/div/p[1]/text()')
if not pager_text:
    # Without this guard the failure would be an opaque IndexError below.
    raise ValueError('Pager text not found on %s; the page layout may have changed' % url)

# Drop everything up to and including the last "/" and parse what remains
# as the page count.
no_pages = int(re.sub(".*\\/","",pager_text[0]))
no_pages

592

In [224]:
# Scrape every results page of the resolution search, collect one entry per
# listed resolution, clean the extracted strings and write them to
# "tribunals.csv".
#
# Relies on names defined in earlier cells: `no_pages` (number of result
# pages) and the helpers `clean_issue` / `remove_text`.

# Listing URL without the page number; the 1-based page index is appended per request.
LISTING_URL = 'http://www.minhafp.gob.es/es-ES/Areas%20Tematicas/Contratacion/TACRC/Paginas/BuscadordeResoluciones.aspx?pagina='
# Seconds to wait on the server before giving up instead of hanging the cell.
REQUEST_TIMEOUT = 30
# Common XPath prefix of the per-result <li> elements.
RESULT_ITEM = "//ul[contains(@class, 'listaResultadoBusqueda')]/li"


def _apply_subs(items, rules):
    """Return a new list with each (pattern, replacement) rule applied, in order, to every string."""
    for pattern, replacement in rules:
        items = [re.sub(pattern, replacement, item) for item in items]
    return items


resolution_number_all = []
size_all = []
date_all = []
issue_all = []
legislation_all = []
text_all = []

start = timeit.default_timer()

for i in range(1,no_pages+1):

    url = LISTING_URL + str(i)
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    page.raise_for_status()
    tree = html.fromstring(page.content)

    # Resolution identifiers: link text with all whitespace removed.
    resolution_number = _apply_subs(
        tree.xpath("//div[contains(@class, 'breaker_resultados')]/a/text()"),
        [("\\s+", "")])

    # File size: cut from "Kb" onwards, cut everything up to "pdf"/"doc",
    # drop whitespace and dashes, turn the decimal comma into a dot, then parse.
    size = _apply_subs(
        tree.xpath(RESULT_ITEM + "/div[2]/text()"),
        [("Kb.*", ""), (".*pdf", ""), (".*doc", ""), ("\\s+", ""), ("-", ""), (",", ".")])
    size = [float(item) for item in size]

    # Date: strip whitespace and colons, then parse as day/month/year.
    date = _apply_subs(
        tree.xpath(RESULT_ITEM + "/div[4]/text()[2]"),
        [("\\s+", ""), (":", "")])
    date = [datetime.strptime(item, '%d/%m/%Y') for item in date]

    # The sixth <div> feeds three columns; query it once and derive issue and
    # legislation from the same raw strings (_apply_subs never mutates its input).
    text = tree.xpath(RESULT_ITEM + "/div[6]/text()[2]")
    # Issue: what precedes the first comma on a line, minus whitespace runs and colons.
    issue = _apply_subs(text, [(",.*", ""), ("\\s\\s+", ""), (":", "")])
    # Legislation: what sits between the last comma and the first full stop.
    legislation = _apply_subs(text, [("\\..*", ""), (".*,", ""), ("\\s+", ""), (":", "")])

    # Every result must contribute exactly one value to each column; otherwise
    # rows would be misaligned (or DataFrame construction would fail only after
    # the whole crawl has finished).
    counts = [len(column) for column in (resolution_number, size, date, issue, legislation, text)]
    if len(set(counts)) != 1:
        raise ValueError('Page %d: extracted field counts differ: %r' % (i, counts))

    # extend() appends in place; rebuilding the lists with `+` on every page is quadratic.
    resolution_number_all.extend(resolution_number)
    size_all.extend(size)
    date_all.extend(date)
    issue_all.extend(issue)
    legislation_all.extend(legislation)
    text_all.extend(text)

    # Progress: page index, elapsed minutes, projected total minutes.
    # The division must happen inside print(): on Python 3 print() returns
    # None, so `print(x)/60` raises TypeError. This form works on 2 and 3.
    print(i)

    stop = timeit.default_timer()
    print((stop - start)/60)
    print((stop - start)/i * no_pages/60)

tribunals_dict = {'resolution_number':resolution_number_all,
                 'size': size_all,
                 'date': date_all,
                 'issue': issue_all,
                 'legislation': legislation_all,
                 'text': text_all}

tribunals_df = pd.DataFrame(tribunals_dict)

# Explicit copy so the column assignments below operate on an independent
# frame rather than on a slice of the one built above.
tribunals_df = tribunals_df[['date', 'issue', 'legislation','resolution_number', 'size', 'text']].copy()
tribunals_df['issue'] = tribunals_df['issue'].map(lambda x: clean_issue('\r','',x))
tribunals_df['issue'] = tribunals_df['issue'].map(lambda x: clean_issue('\n','',x))

# Normalise the raw text: leading ":" + line break + indentation, carriage
# returns, whitespace runs and remaining newlines are removed, in this order.
for pattern in (':\r\n\\s+', '\r', '\\s\\s+', '\n'):
    tribunals_df['text'] = tribunals_df['text'].map(lambda x: re.sub(pattern,'',x))

# Remove the already-extracted issue from the text.
tribunals_df['text'] = tribunals_df.apply(lambda df: remove_text(df['issue'],df['text']),axis=1)

# Codes not in this list become NaN in the categorical column; remove_text
# returns '' for a NaN first argument, so outcome and text of such rows end
# up as the "NaN" placeholder assigned further down.
tribunals_df['legislation'] = pd.Categorical(tribunals_df['legislation'], 
                                             categories=['TRLCSP','LCSE','LCSPDS','UTE','TRCLSP','TRLCS','TRLCPS',
                                                        'TRLSCP','LCSP','SA','LCSEP','PN','RLCSP','SA(SEGITTUR)',
                                                        'TRLCSPE','TRLKCSP','SA(TRAGSA)','TRLCSPDS','LCSPD',
                                                        'LCSSPDS'], ordered=False)

# Outcome: text without the legislation code and the leading ", . " separator,
# truncated at the first full stop.
tribunals_df['outcome'] = tribunals_df.apply(lambda df: remove_text(df['legislation'],df['text']),axis=1)
tribunals_df['outcome'] = tribunals_df['outcome'].map(lambda x: re.sub('^,\\s\\.\\s','',x))
tribunals_df['outcome'] = tribunals_df['outcome'].map(lambda x: re.sub('\\..*','',x))

# Remaining text: strip legislation, separator and outcome, then the leading ". ".
tribunals_df['text'] = tribunals_df.apply(lambda df: remove_text(df['legislation'],df['text']),axis=1)
tribunals_df['text'] = tribunals_df['text'].map(lambda x: re.sub('^,\\s\\.\\s','',x))
tribunals_df['text'] = tribunals_df.apply(lambda df: remove_text(df['outcome'],df['text']),axis=1)
tribunals_df['text'] = tribunals_df['text'].map(lambda x: re.sub('^\\.\\s','',x))

# Empty strings are replaced by the literal string "NaN" (not a float NaN).
tribunals_df.loc[tribunals_df['text'].str.len() == 0, 'text'] = "NaN"
tribunals_df.loc[tribunals_df['outcome'].str.len() == 0, 'outcome'] = "NaN"

# Final column order; resolution_number is not part of the exported frame.
tribunals_df = tribunals_df[['date','size','issue','legislation','outcome','text']]

tribunals_df.to_csv("tribunals.csv", encoding="utf-8")

1
0.00815328359604
4.82674388885
2
0.017172118028
5.08294693629
3
0.0275522669156
5.43698067135
4
0.0376391688983
5.57059699694
5
0.0478644688924
5.66715311686
6
0.0582440336545
5.74674465391
7
0.0683151841164
5.77751271384
8
0.0755019664764
5.58714551926
9
0.0837866346041
5.51129863174
10
0.0940491159757
5.56770766576
11
0.104316198826
5.61410815499
12
0.110737733046
5.46306149695
13
0.119062832991
5.42193824083
14
0.129863631725
5.49137642724
15
0.139223384857
5.49468292236
16
0.150236952305
5.55876723528
17
0.160464966297
5.58795647341
18
0.170855915546
5.61926122242
19
0.177798648675
5.53983157978
20
0.184068067869
5.44841480891
21
0.193968232473
5.46805683923
22
0.201426184177
5.4201955015
23
0.208788398902
5.3740318326
24
0.215849848588
5.32429626518
25
0.222266133626
5.26326204427
26
0.228798766931
5.20957192397
27
0.235821282864
5.17059997982
28
0.241924317678
5.11497128805
29
0.247944800059
5.06149384948
30
0.254063117504
5.01351218541
31
0.260935767492
4.98303143081
32
0.2681

259
2.7336218675
6.24827855428
260
2.75704205036
6.27757266852
261
2.78296088378
6.31230974404
262
2.79727191528
6.32055333528
263
2.80550820033
6.31506028363
264
2.81871818304
6.32076198621
265
2.83491323392
6.33308918671
266
2.85878551801
6.36240987467
267
2.88392136892
6.39431254831
268
2.90899281899
6.42583488374
269
2.92202488184
6.43062724925
270
2.93162060181
6.42784961583
271
2.93907083273
6.42040565674
272
2.94762866894
6.415427103
273
2.95786911647
6.41413376171
274
2.96890645027
6.41457160059
275
2.97886793216
6.41269023942
276
2.98741613229
6.40779112433
277
2.99842663209
6.40818976966
278
3.00976968209
6.4092937115
279
3.02044323285
6.40896915358
280
3.0324121515
6.41138569173
281
3.04263283412
6.41010191387
282
3.05278691848
6.40868743171
283
3.06251868407
6.40639950872
284
3.07092599869
6.4013668705
285
3.07798169851
6.39356198428
286
3.08526218335
6.38627696693
287
3.09455253283
6.38318849978
288
3.10487133265
6.3822355171
289
3.11496409973
6.38082611432
290
3.123712384

517
5.78994366725
6.62987746811
518
5.79985383352
6.62840438116
519
5.81009480158
6.62731430162
520
5.82039703528
6.62629816324
521
5.82876686653
6.62309018231
522
5.84021416505
6.62338464696
523
5.85231448412
6.62441715984
524
5.86954811811
6.63124520214
525
5.88174285094
6.63236527192
526
5.89459040165
6.63421581326
527
5.91068241596
6.63970396632
528
5.92728336652
6.6457419564
529
5.94453436534
6.65248458276
530
5.96027195056
6.65751131082
531
5.97733500004
6.6639968362
532
5.99462776581
6.67071360406
533
6.01384961605
6.67954779119
534
6.03611211777
6.69171980097
535
6.05628233353
6.70153110552
536
6.06782058477
6.70177198915
537
6.07595738173
6.69826214149
538
6.08477894862
6.6955188431
539
6.09273581902
6.6918360016
540
6.1017291824
6.68930310367
541
6.11042143504
6.68645007309
542
6.12111335198
6.68579170548
543
6.13033455213
6.68353232939
544
6.14015270074
6.68193088022
545
6.14963481824
6.67997029797
546
6.15928396781
6.67819800173
547
6.16946803331
6.67701110735
548
6.1821835