# Using the OSF API to find download URLs

This Jupyter notebook is based on https://osf.io/rs986/.

Data are extracted from my project at https://osf.io/befgz.

In [1]:
import json
import requests
import os
import time
import re
import pandas as pd

## Authentication

The OSF API uses a personal access token for authentication. To get a token:
* create an account on the OSF
* log in to that account
* create an API token on your account settings page

The best way to use an OSF API token is to add it to your environment variables, so that it never has to appear in the notebook itself. In Ubuntu 16.04:
* Open a terminal (press Ctrl+Alt+T).
* Run `sudo -H gedit /etc/environment`.
* Type your password when prompted.
* In the text file that opens, add a line of the form `OSF_TOKEN="<your token>"`.
* Save the file.
* Log out and log in again.
* The new variable is now available to your programs.

My **/etc/environment** file looks like this:
```
PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games"
OSF_PASSWORD="<my password>"
OSF_TOKEN="<my token>"
```  

In [2]:
# Base URL of version 2 of the OSF API; request paths are appended to it.
OSF_API_URL = 'https://api.osf.io/v2/'
# Read the API token from the environment. This raises KeyError if the
# OSF_TOKEN environment variable is not set.
OSF_TOKEN = os.environ['OSF_TOKEN']
# Alternative: simply supply OSF_TOKEN as a string
# (do not share or commit the notebook with a real token filled in)
# OSF_TOKEN = 'your_token_goes_here'

In [3]:
# helper function which adds authentication to our API requests

def get_request(url, timeout=60):
    """Send a GET request to ``url`` with the OSF bearer token attached.

    Parameters
    ----------
    url : str
        Full URL to request.
    timeout : float or None, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        exception. Pass ``None`` to wait indefinitely (the previous
        behaviour). Without a timeout a stalled connection would hang the
        notebook forever.

    Returns
    -------
    requests.Response
        The response object returned by ``requests.get``.
    """
    headers = {'Authorization': 'Bearer {}'.format(OSF_TOKEN)}
    return requests.get(url, headers=headers, timeout=timeout)

In [4]:
# helper function to print out our API responses

def pretty_print(json_data):
    """Print ``json_data`` serialized as JSON with four-space indentation."""
    formatted = json.dumps(json_data, indent=4)
    print(formatted)

## Extracting URLs for Images

Images for my project are stored in this structure:

* OSF Storage
    * Images
        * all_sheets
        * labels
        
**all_sheets** contains 5,590 images of herbarium sheets

**labels** contains images of labels extracted from the herbarium sheets

In [5]:
# Get paths for folders

def find_folder_path(parent_path, folder_name):
    """Return the OSF storage path of a folder listed under ``parent_path``.

    Parameters
    ----------
    parent_path : str
        Path of the parent folder in osfstorage, with leading and trailing
        slash ('/' for the storage root).
    folder_name : str
        Value passed to the API's ``filter[name]`` query parameter.

    Returns
    -------
    str
        The ``path`` attribute of the first entry the API returns.

    Raises
    ------
    LookupError
        If the response contains no matching entry (for example because the
        folder does not exist or the request was rejected).
    """
    url = '{}nodes/befgz/files/osfstorage{}?format=jsonapi&filter[name]={}'.format(
        OSF_API_URL, parent_path, folder_name)
    matches = get_request(url).json().get('data') or []
    if not matches:
        raise LookupError('No entry matching {!r} found under {!r}'.format(
            folder_name, parent_path))
    # As before, the first match is used.
    return matches[0]['attributes']['path']


images_folder_path = find_folder_path('/', 'images')
print('images_folder_path: ' + images_folder_path)

all_sheets_folder_path = find_folder_path(images_folder_path, 'all_sheets')
print('all_sheets_folder_path: ' + all_sheets_folder_path)

labels_folder_path = find_folder_path(images_folder_path, 'labels')
print('labels_folder_path: ' + labels_folder_path)

images_folder_path: /5a1bb4d2594d90026ef23992/
all_sheets_folder_path: /5a1c936a594d90026ef2bb32/
labels_folder_path: /5a2b60f6e08e9b000f9b196c/


In [6]:
# Get materialized paths and download URLs for all files in osfstorage/images/labels
# and write these to labels.txt, one "path, url" pair per line.

# The with-block closes the file even if a request fails part-way through.
with open('labels.txt', 'w') as f:
    for page in range(1, 1000):
        print('Requesting page {}'.format(page))
        # Change number of files per page from 10 (default) to 100 (max allowed)
        # to speed up requests a bit. The URL is built from a single-line
        # template so that no newline or padding ends up in the page parameter.
        url = '{}nodes/befgz/files/osfstorage{}?format=jsonapi&page[size]=100&page={}'.format(
            OSF_API_URL, labels_folder_path, page)
        try:
            j = get_request(url).json()
        except (requests.exceptions.RequestException, ValueError):
            # Network failure or a body that is not valid JSON: wait and retry
            # once. A second failure propagates to the caller.
            print('bad response; retrying in 10s')
            time.sleep(10)
            j = get_request(url).json()
        for a in j['data']:
            materialized_path = a['attributes']['materialized_path']
            download_url = a['links']['download']
            f.write('{}, {}\n'.format(materialized_path, download_url))
        # The last page has no "next" link.
        if j['links']['next'] is None:
            print('Finished list of images stored in osfstorage/images/labels')
            print('See labels.txt')
            break

Requesting page 1
Requesting page 2
Requesting page 3
Requesting page 4
Requesting page 5
Requesting page 6
Requesting page 7
Requesting page 8
Requesting page 9
Requesting page 10
Requesting page 11
Requesting page 12
Requesting page 13
Requesting page 14
Requesting page 15
Requesting page 16
Requesting page 17
Requesting page 18
Requesting page 19
Requesting page 20
Requesting page 21
Requesting page 22
Requesting page 23
Requesting page 24
Requesting page 25
Requesting page 26
Requesting page 27
Requesting page 28
Requesting page 29
Requesting page 30
Requesting page 31
Requesting page 32
Requesting page 33
Requesting page 34
Requesting page 35
Requesting page 36
Requesting page 37
Requesting page 38
Requesting page 39
Requesting page 40
Requesting page 41
Requesting page 42
Requesting page 43
Requesting page 44
Requesting page 45
Requesting page 46
Requesting page 47
Requesting page 48
Requesting page 49
Requesting page 50
Requesting page 51
Requesting page 52
Requesting page 53
Re

In [7]:
# Get materialized paths and download URLs for all files in osfstorage/images/all_sheets
# and write these to all_sheets.txt, one "path, url" pair per line.

# The with-block closes the file even if a request fails part-way through.
with open('all_sheets.txt', 'w') as f:
    for page in range(1, 1000):
        print('Requesting page {}'.format(page))
        # Change number of files per page from 10 (default) to 100 (max allowed)
        # to speed up requests a bit. The URL is built from a single-line
        # template so that no newline or padding ends up in the page parameter.
        url = '{}nodes/befgz/files/osfstorage{}?format=jsonapi&page[size]=100&page={}'.format(
            OSF_API_URL, all_sheets_folder_path, page)
        try:
            j = get_request(url).json()
        except (requests.exceptions.RequestException, ValueError):
            # Network failure or a body that is not valid JSON: wait and retry
            # once. A second failure propagates to the caller.
            print('bad response; retrying in 10s')
            time.sleep(10)
            j = get_request(url).json()
        for a in j['data']:
            materialized_path = a['attributes']['materialized_path']
            download_url = a['links']['download']
            f.write('{}, {}\n'.format(materialized_path, download_url))
        # The last page has no "next" link.
        if j['links']['next'] is None:
            print('Finished list of images stored in osfstorage/images/all_sheets')
            print('See all_sheets.txt')
            break

Requesting page 1
Requesting page 2
Requesting page 3
Requesting page 4
Requesting page 5
Requesting page 6
Requesting page 7
Requesting page 8
Requesting page 9
Requesting page 10
Requesting page 11
Requesting page 12
Requesting page 13
Requesting page 14
Requesting page 15
Requesting page 16
Requesting page 17
Requesting page 18
Requesting page 19
Requesting page 20
Requesting page 21
Requesting page 22
Requesting page 23
Requesting page 24
Requesting page 25
Requesting page 26
Requesting page 27
Requesting page 28
Requesting page 29
Requesting page 30
Requesting page 31
Requesting page 32
Requesting page 33
Requesting page 34
Requesting page 35
Requesting page 36
Requesting page 37
Requesting page 38
Requesting page 39
Requesting page 40
Requesting page 41
Requesting page 42
Requesting page 43
Requesting page 44
Requesting page 45
Requesting page 46
Requesting page 47
Requesting page 48
Requesting page 49
Requesting page 50
Requesting page 51
Requesting page 52
Requesting page 53
Re

In [14]:
# Keep displayed tables short (4 rows) while leaving room for long URLs.
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_colwidth', 100)

In [15]:
# labels.txt has no header row and separates its two fields with ", ".
# skipinitialspace drops the space after the comma so that the values in
# the second column do not start with a blank.
df_labels = pd.read_csv('labels.txt', header=None, names=['url1', 'url2'],
                        skipinitialspace=True)
df_labels

Unnamed: 0,url1,url2
0,/images/labels/39800.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2b618fe08e9b000e9b3690
1,/images/labels/37947.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2b6190e08e9b000f9b19dd
...,...,...
5588,/images/labels/41532.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2b8456e08e9b000d9bd45b
5589,/images/labels/41986.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2b8a1ce08e9b000d9bdfeb


In [16]:
# all_sheets.txt has no header row and separates its two fields with ", ".
# skipinitialspace drops the space after the comma so that the values in
# the second column do not start with a blank.
df_all_sheets = pd.read_csv('all_sheets.txt', header=None, names=['url1', 'url2'],
                            skipinitialspace=True)
df_all_sheets

Unnamed: 0,url1,url2
0,/images/all_sheets/37669.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a1c9433594d900270f25280
1,/images/all_sheets/37649.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a1c9434b83f69026f97741f
...,...,...
5588,/images/all_sheets/41230.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2a8e73e08e9b000f9ab426
5589,/images/all_sheets/39137.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2a8f46e08e9b000e9ac40e


In [17]:
# Store the first run of digits found in each url1 value as integer column n.
# Vectorized replacement for a row-by-row iterrows() loop; a value without
# any digits makes the integer conversion raise instead of being skipped.
df_labels['n'] = df_labels['url1'].str.extract(r'(\d+)', expand=False).astype('int64')
df_labels

Unnamed: 0,url1,url2,n
0,/images/labels/39800.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2b618fe08e9b000e9b3690,39800
1,/images/labels/37947.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2b6190e08e9b000f9b19dd,37947
...,...,...,...
5588,/images/labels/41532.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2b8456e08e9b000d9bd45b,41532
5589,/images/labels/41986.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2b8a1ce08e9b000d9bdfeb,41986


In [18]:
# Store the first run of digits found in each url1 value as integer column n.
# Vectorized replacement for a row-by-row iterrows() loop; a value without
# any digits makes the integer conversion raise instead of being skipped.
df_all_sheets['n'] = df_all_sheets['url1'].str.extract(r'(\d+)', expand=False).astype('int64')
df_all_sheets

Unnamed: 0,url1,url2,n
0,/images/all_sheets/37669.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a1c9433594d900270f25280,37669
1,/images/all_sheets/37649.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a1c9434b83f69026f97741f,37649
...,...,...,...
5588,/images/all_sheets/41230.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2a8e73e08e9b000f9ab426,41230
5589,/images/all_sheets/39137.JPG,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2a8f46e08e9b000e9ac40e,39137


In [19]:
# Pair the rows of df_labels and df_all_sheets that share the same value of n.
# Explicit suffixes make the merged column names self-describing, so the
# result is selected by name instead of relying on column positions.
merged = df_labels.merge(df_all_sheets, on='n', suffixes=('_label', '_sheet'))

# Build the output frame with a fixed column order. The URLs are stripped
# because the source text files put a space after the separating comma.
df_urls = pd.DataFrame(
    {
        'image_number': merged['n'],
        'sheet_url': merged['url2_sheet'].str.strip(),
        'label_url': merged['url2_label'].str.strip(),
    },
    columns=['image_number', 'sheet_url', 'label_url'],
)
df_urls.to_csv('urls.csv', index=False)
df_urls

Unnamed: 0,image_number,sheet_url,label_url
0,39800,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2240f06c613b027a254e29,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2b618fe08e9b000e9b3690
1,37947,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a223eb1b83f690266bf3117,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2b6190e08e9b000f9b19dd
...,...,...,...
5588,41532,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a1cf5a4b83f690271972d7f,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2b8456e08e9b000d9bd45b
5589,41986,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a1ff2749ad5a1026710ec34,https://files.osf.io/v1/resources/befgz/providers/osfstorage/5a2b8a1ce08e9b000d9bdfeb
