In [1]:
import gzip
import json
import os

from __future__ import print_function, unicode_literals

In [2]:
# The catalogs live in a sibling 'catalogs' folder, one level above the notebook's directory.
catalog_dir = os.path.join(os.pardir, 'catalogs')

In [3]:
# os.listdir() returns entries in arbitrary, platform-dependent order.  Sort the listing so
# that it (and any positional indexing into it) is reproducible from one machine to the next.
files = sorted(os.listdir(catalog_dir))

In [4]:
# Display the catalog file names found in catalog_dir.
files

[u'ecoinvent_3.2_apos_xlsx.json.gz',
 u'ecoinvent_3.2_consequential_xlsx.json.gz',
 u'ecoinvent_3.2_cut-off_xlsx.json.gz',
 u'ecoinvent_3.2_undefined_xlsx.json.gz',
 u'elcd_3.2.json.gz',
 u'gabi_2016_all-extensions.json.gz',
 u'gabi_2016_professional-database-2016.json.gz',
 u'uslci_ecospold.json.gz']

## Select databases of interest

In [5]:
# Pick the catalogs by file name instead of by position in `files`: positional indices
# silently point at the wrong database as soon as the directory listing order or its
# contents change.  The order here defines the order of every per-database result below.
selected_catalogs = [
    'ecoinvent_3.2_undefined_xlsx.json.gz',
    'elcd_3.2.json.gz',
    'gabi_2016_professional-database-2016.json.gz',
    'gabi_2016_all-extensions.json.gz',
    'uslci_ecospold.json.gz',
]
my_files = [os.path.join(catalog_dir, fname) for fname in selected_catalogs]  # we love list comprehensions!

In [6]:
# Display the full paths of the selected catalogs.
my_files

[u'..\\catalogs\\ecoinvent_3.2_undefined_xlsx.json.gz',
 u'..\\catalogs\\elcd_3.2.json.gz',
 u'..\\catalogs\\gabi_2016_professional-database-2016.json.gz',
 u'..\\catalogs\\gabi_2016_all-extensions.json.gz',
 u'..\\catalogs\\uslci_ecospold.json.gz']

Give the databases short names for reference. The names must be listed in the same order as `my_files`.

In [7]:
# Short display labels, one per selected catalog and in the same order as `my_files`.
names = [
    'EI (u)',
    'ELCD',
    'GaBi-Pro',
    'GaBi-Ext',
    'US LCI',
]

In [8]:
def load_archive(filename):
    """
    Read a gzip-compressed JSON catalog file and return the archive it contains.

    If the decoded JSON has an 'archives' entry, the first element of that entry is
    returned; otherwise the decoded JSON object itself is returned.
    """
    with gzip.open(filename, 'r') as stream:
        catalog = json.load(stream)

    # GaBi datasets are collections of archives (only the first one is used here),
    # whereas the other catalogs are plain archives.
    return catalog['archives'][0] if 'archives' in catalog else catalog


Load all the archives into a list called `C` (one entry per selected catalog, in the same order as `my_files`).

In [9]:
# One loaded archive per selected catalog file, in the same order as `my_files`.
C = list(map(load_archive, my_files))

(should take about 2-5 seconds)

## Create geography grid (Table 4 in manuscript)

In [10]:
# Collect one record per process: the database it belongs to, its name, and its spatial scope.
geog = []
for i, archive in enumerate(C):
    for p in archive['processes']:
        tags = p['tags']
        record = {'db': names[i], 'process': tags['Name'], 'geog': tags['SpatialScope']}
        geog.append(record)

There should be one entry in `geog` for each process listed in a database, for a total of around 25,000 processes.

In [12]:
# Number of per-process records gathered across all loaded archives.
len(geog)

25287

Use `pandas` to build the pivot tables.

In [13]:
import pandas as pd

In [14]:
# Count the processes in every (geography, database) cell.  Empty cells are filled with ''
# so they display as blanks, and margins=True adds the 'All' totals row and column.
P = pd.pivot_table(
    pd.DataFrame(geog),
    index='geog',
    columns='db',
    aggfunc=len,
    fill_value='',
    margins=True,
)

In [15]:
# Rank geographies by their total process count across all databases.
P.sort_values(by=('process', 'All'), ascending=False).head(20)  # only show the top 20 rows

Unnamed: 0_level_0,process,process,process,process,process,process
db,EI (u),ELCD,GaBi-Ext,GaBi-Pro,US LCI,All
geog,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
All,13307.0,503.0,7457.0,3319.0,701.0,25287.0
GLO,6218.0,25.0,446.0,338.0,15.0,7042.0
DE,168.0,19.0,2131.0,314.0,,2632.0
US,92.0,,1179.0,137.0,16.0,1424.0
RNA,13.0,,649.0,19.0,667.0,1348.0
CH,1260.0,10.0,44.0,33.0,,1347.0
RER,1136.0,75.0,14.0,84.0,3.0,1312.0
EU-27,,96.0,296.0,869.0,,1261.0
CA-QC,346.0,,,,,346.0
IN,60.0,,187.0,55.0,,302.0


## Count Reference Flow Frequency (Table 3 in manuscript)

In [16]:
from collections import Counter

In [18]:
def create_flow_map(archive):
    """
    Build a dictionary that maps each flow's entityId to that flow's tags.

    A dict gives fast lookups when flows are later resolved by id.  If two flows share an
    entityId, the one appearing later in archive['flows'] wins.
    """
    return {flow['entityId']: flow['tags'] for flow in archive['flows']}


In [19]:
def count_ref_flows(archive):
    """
    Count how often each reference flow occurs among the processes of an archive.

    Every entry of archive['processes'] carries a 'referenceExchange' value that is either
    the string "None" or a string of the form "<direction>: <flow id>".  The flow id is
    resolved to the flow's 'Name' tag through the map built by create_flow_map().

    Returns a Counter keyed by (direction, flow name).  Processes without a reference
    exchange are counted under (None, None); exchanges whose flow cannot be resolved are
    counted under (direction, 'Flow Not Found!').
    """
    rfs = Counter()
    flow_map = create_flow_map(archive)
    for proc in archive['processes']:
        ref_exchange = proc['referenceExchange']
        # The catalogs store a missing reference as the string "None"; a real None is
        # accepted as well so that it does not crash on .split() below.
        if ref_exchange is None or ref_exchange == "None":
            count_key = (None, None)
        else:
            # Split on the first separator only, so an id containing ': ' stays intact.
            direc, flowref = ref_exchange.split(': ', 1)
            try:
                flowname = flow_map[flowref]['Name']
            except KeyError:
                # The flow map may be keyed by integer ids, so retry with an int key.  This
                # must be a nested try: a second `except KeyError` on the outer try can never
                # run, and int() raises ValueError for non-numeric ids.
                try:
                    flowname = flow_map[int(flowref)]['Name']
                except (KeyError, ValueError):
                    flowname = 'Flow Not Found!'
            count_key = (direc, flowname)
        rfs[count_key] += 1
    return rfs


In [20]:
# Flatten the per-database reference-flow counters into one list of records
# (database, "direction: flow name" label, number of processes).
rf_count = []
for i, archive in enumerate(C):
    db_name = names[i]
    print('Parsing archive %s' % db_name)
    for rf, count in count_ref_flows(archive).items():
        try:
            exchange_label = '%s: %s' % (rf[0], rf[1])
        except TypeError:
            # The key could not be indexed like a (direction, name) pair: report it and move on.
            print('rf: %s (type %s) count: %d' %(rf, type(rf), count))
            continue
        rf_count.append({'db': db_name, 'exchange': exchange_label, 'count': count})

Parsing archive EI (u)
Parsing archive ELCD
Parsing archive GaBi-Pro
Parsing archive GaBi-Ext
Parsing archive US LCI


(should take << 1 second)

In [21]:
# Total the per-database counts for every reference exchange.  The aggregation is given by
# name ('sum') because recent pandas releases deprecate passing the builtin `sum`; the
# result is the same.  Empty cells are filled with '' so they display as blanks, and
# margins=True adds the 'All' totals row and column.
RF = pd.DataFrame(rf_count).pivot_table(
    index='exchange',
    columns='db',
    aggfunc='sum',
    fill_value='',
    margins=True,
)

In [22]:
# Rank reference exchanges by their total count across all databases; show the top 20.
RF.sort_values(by=('count', 'All'), ascending=False).head(20)

Unnamed: 0_level_0,count,count,count,count,count,count
db,EI (u),ELCD,GaBi-Ext,GaBi-Pro,US LCI,All
exchange,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
All,13307.0,503.0,7457.0,3319.0,701.0,25287.0
"Output: electricity, high voltage",1974.0,,,,,1974.0
Output: Thermal energy (MJ),,,944.0,236.0,,1180.0
Output: Electricity,,64.0,522.0,471.0,,1057.0
Output: Steam (MJ),,,340.0,622.0,,962.0
"Output: electricity, low voltage",730.0,,,,,730.0
Input: Housing technology,,,340.0,191.0,,531.0
"Output: electricity, medium voltage",423.0,,,,,423.0
"Output: heat, district or industrial, other than natural gas",300.0,,,,,300.0
Output: Cargo,,,80.0,127.0,,207.0


## Text Co-occurrence (Table 5 in manuscript)