In [2]:
import sys, os
import urllib.request
import pandas as pd
import zipfile
import logging
import time

%run ./common/file_utility
%run ./common/treaty_state

skip_columns = [ 'extra_entry', 'dbflag', 'french', 'other', 'regis', 'regisant', 'force', 'group1', 'group2' ]

state = TreatyState(skip_columns=skip_columns).process()

# TODO Lägg nedladdade filer i en ZIP-fil


Imported: Treaties_Master_List_Treaties.csv
Imported: country_continent.csv
Imported: parties_curated_parties.csv
Imported: parties_curated_continent.csv
Imported: parties_curated_group.csv


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


Number of treaties loaded: 61365
Number of cultural treaties: 2063 (total), 1127 within periods


In [34]:


def filter_treaties(wti_treaties):
    #global processed_gof_treaties
    data = wti_treaties
    # processed_treaty_id = [ x[0] for x in processed_gof_treaties ]

    data = data.loc[
        (data.is_cultural==True)&
        (data.signed_year.between(1945,1972))&
        (data.source.isin(['UNTS', 'UNXX']))&
        (data.english!='en')&
        #(~data.index.isin(processed_treaty_id))&
        (data.ispartyof4!='Yes')
    ]
    return data


data = filter_treaties(state.treaties)

FileUtility.save_excel([(data, 'Data')], 'unt-log-page.xls')


In [25]:

def fetch_pdf(url, pdf_path):
    try:
        response = urllib.request.urlopen(url)
        http_code = response.getcode()
        if http_code in [ 200 ]:
            pdf = response.read()
            with open(pdf_path,'wb') as output:
                output.write(pdf)
        return http_code
    except Exception as ex:
        #print(str(ex))
        #print(url)
        return 0

def get_candidate_names(treaty_id, volume, language):
    template = 'volume-{0}-{3}-{1}-{2}.pdf'
    return [
        template.format(volume, int(str(treaty_id)[-5:]), language, 'I'),
        template.format(volume, int(str(treaty_id)[-5:]), language, 'II')
    ]
        
def ts_data_path(data_dir, filename):
    return os.path.join(data_dir, '{}_{}'.format(time.strftime("%Y%m%d%H%M"), filename))

def scrape_treaty(treaty_id, volume, language, pdf_filename):
    url_template = 'https://treaties.un.org/doc/Publication/UNTS/Volume%20{0}/{1}'
    for candidate_name in get_candidate_names(treaty_id, volume, language):
        url = url_template.format(volume, candidate_name)
        http_code = fetch_pdf(url, pdf_filename)
        if http_code in [ 200 ]:
            return http_code, url
    return http_code, None

def scrape_treaties(data, languages):
    
    scrape_log = pd.DataFrame(columns=['treaty_id', 'volume', 'page', 'party1', 'party2', 'signed', 'language', 'http_code', 'url'])

    for treaty_id, treaty in data.iterrows():

        volume = treaty['vol']

        for (language, lang) in languages:

            pdf_filename = os.path.join('./data', "{}-{}.pdf".format(treaty_id, lang))

            if (os.path.isfile(pdf_filename)):
                print('Already scraped: {}'.format(pdf_filename))
                continue
                
            http_code, url = scrape_treaty(treaty_id, volume, language, pdf_filename)   

            print('{}/{};{};{};{}'.format(volume, treaty_id, language,  http_code, url or ''))

            scrape_log = scrape_log.append({
                    'treaty_id' : treaty_id,
                    'volume' : treaty['vol'],
                    'page' : treaty['page'],
                    'party1': treaty['party1'],
                    'party2': treaty['party2'],
                    'signed': treaty['signed'],
                    'language': language,
                    'http_code': http_code,
                    'url': url or ''
            }, ignore_index=True)

    return scrape_log


In [111]:

languages = [ ('English', 'en') ] #, ('French', 'fr'), ('Other', 'other') ]

data = filter_treaties(state.treaties)

scrape_log = scrape_treaties(data, languages)

filename = ts_data_path('../data', 'unts-en-download-log.xlsx')
FileUtility.save_excel([(scrape_log, 'Data')], filename)


2082/15407;English;0;
10/100139;English;0;
15/100230;English;0;
Already scraped: ./data/100266-en.pdf
25/100366;English;0;
25/100368;English;0;
32/100487;English;0;
33/100514;English;0;
33/100515;English;0;
33/100516;English;0;
34/100528;English;0;
41/100641;English;0;
46/100698;English;0;
46/100699;English;0;
46/100707;English;0;
76/100982;English;0;
77/101003;English;0;
116/101568;English;0;
162/102135;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20162/volume-162-I-2135-English.pdf
178/102333;English;0;
178/102334;English;0;
178/102343;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20178/volume-178-I-2343-English.pdf
182/102425;English;0;
188/102533;English;0;
191/102583;English;0;
201/102708;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20201/volume-201-I-2708-English.pdf
211/102854;English;0;
230/103187;English;0;
241/103439;English;0;
250/103525;English;0;
252/103569;English;0;
257/103660;English;0;
259/103687;English;0;
259/10

502/107324;English;0;
514/107448;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20514/volume-514-I-7448-English.pdf
515/107458;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20515/volume-515-I-7458-English.pdf
515/107464;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20515/volume-515-I-7464-English.pdf
519/107505;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20519/volume-519-I-7505-English.pdf
520/107513;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20520/volume-520-I-7513-English.pdf
521/107531;English;0;
522/107534;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20522/volume-522-I-7534-English.pdf
522/107551;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20522/volume-522-I-7551-English.pdf
528/107638;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20528/volume-528-I-7638-English.pdf
528/107639;English;200;https://treaties.un.org/doc/Publication/UNTS/

671/109558;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20671/volume-671-I-9558-English.pdf
671/109560;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20671/volume-671-I-9560-English.pdf
672/109568;English;0;
672/109573;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20672/volume-672-I-9573-English.pdf
688/109847;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20688/volume-688-I-9847-English.pdf
688/109849;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20688/volume-688-I-9849-English.pdf
689/109873;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20689/volume-689-I-9873-English.pdf
695/109954;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20695/volume-695-I-9954-English.pdf
708/110173;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20708/volume-708-I-10173-English.pdf
708/110179;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20708/volume-7

956/113702;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20956/volume-956-I-13702-English.pdf
956/113704;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20956/volume-956-I-13704-English.pdf
956/113705;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20956/volume-956-I-13705-English.pdf
957/113715;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20957/volume-957-I-13715-English.pdf
957/113718;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20957/volume-957-I-13718-English.pdf
975/114138;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20975/volume-975-I-14138-English.pdf
975/114140;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%20975/volume-975-I-14140-English.pdf
1025/115053;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%201025/volume-1025-I-15053-English.pdf
1035/115407;English;200;https://treaties.un.org/doc/Publication/UNTS/Volume%201035/volume-1035-I-1540

In [114]:
%%bash
#sudo apt-get install poppler-utils
cd ./data
find -name '*.pdf' -exec pdftotext -eol dos "{}" \;