# Analysis of the Moles DB fixture

## Save load

In [1]:
import json
import gzip

# loads json file to dictionary
def json_to_dict(filename):
    """Read a gzip-compressed, UTF-8 encoded JSON file and return its parsed content."""
    with gzip.open(filename, 'r') as compressed:
        raw_bytes = compressed.read()

    text = raw_bytes.decode('utf-8')
    return json.loads(text)

# saves dictionary to json file
def dict_to_json(data, filename):
    """Serialise ``data`` to JSON and write it, UTF-8 encoded, into a gzip file."""
    with gzip.open(filename, 'w') as compressed:
        serialised = json.dumps(data)
        compressed.write(serialised.encode('utf-8'))

## Flattening fixture

### Referenceable

In [3]:
# returns model's name
def get_model_name(obj):
    """Return the second dot-separated part of the record's ``'model'`` label."""
    label_parts = obj['model'].split('.')
    return label_parts[1]

In [4]:
# maps shortcode to model name

# Lookup table from referenceable short code to model name.
# It used to be called ``map``, which shadowed the built-in ``map`` function
# for every cell executed afterwards.
SHORTCODE_TO_MODEL_NAME = {
    'acq': 'acquisition',
    'cmppr': 'compositeprocess',
    'coll': 'observationcollection',
    'comp': 'computation',
    'instr': 'instrument',
    'mpop': 'mobileplatformoperation',
    'ob': 'observation',
    'plat': 'platform',
    'proj': 'project',
    'result': 'result',
    'excit': 'externalcitation',
}


def map_shortcode_to_model_name(short_code, full=False):
    """Translate a referenceable short code into its model name.

    Args:
        short_code: one of the keys of ``SHORTCODE_TO_MODEL_NAME`` (e.g. ``'ob'``).
        full: when true, prefix the model name with ``'cedamoles_app.'``.

    Returns:
        The model name, optionally prefixed with the app label.

    Raises:
        KeyError: if ``short_code`` is not a known short code.
    """
    model_name = SHORTCODE_TO_MODEL_NAME[short_code]
    if full:
        return f'cedamoles_app.{model_name}'
    return model_name

In [5]:
# returns dict of referenceable objects where keys are PKs and values are fields of those objects

def get_referenceable_dict_from_list(ref_list):
    """Index the given records by PK, keeping only each record's ``'fields'`` dict."""
    return {entry['pk']: entry['fields'] for entry in ref_list}

In [6]:
# inserts UUIDs into fields of corresponding models 

def include_referenceable(data):
    """Copy UUIDs from the ``referenceable`` records into the matching model records.

    The ``cedamoles_app.referenceable`` records are removed from the result.
    A remaining record receives ``fields['uuid']`` when a referenceable record
    with the same PK exists and its short code maps to the record's model.
    Matching records are modified in place.
    """
    referenceable_label = 'cedamoles_app.referenceable'

    uuid_source = {}
    remaining = []
    for record in data:
        if record['model'] == referenceable_label:
            uuid_source[record['pk']] = record['fields']
        else:
            remaining.append(record)

    for record in remaining:
        pk = record['pk']
        if pk not in uuid_source:
            continue

        expected_model = map_shortcode_to_model_name(uuid_source[pk]['short_code'], True)
        if record['model'] == expected_model:
            record['fields']['uuid'] = uuid_source[pk]['uuid']

    return remaining


### Inserting models into models

In [7]:
# e.g. if an observation has a field `phenomena` which refers to the phenomenon model by its PK, that model's fields will be inserted into the observation

def include_simple_field(data, model_name, model_and_field_pairs_to_insert_model):
    """Embed records of ``model_name`` into the records that reference them by PK.

    For every ``(model, field)`` pair, a non-empty ``field`` of a record of
    ``model`` is replaced by the referenced record's fields (or by a list of
    them when the field holds a list of PKs). Records are modified in place.

    Returns:
        The records of ``data`` without the records of ``model_name``.
    """
    source_label = f'cedamoles_app.{model_name}'
    lookup = {rec['pk']: rec['fields'] for rec in data if rec['model'] == source_label}
    kept = [rec for rec in data if rec['model'] != source_label]

    for rec in kept:
        for target_model, field_name in model_and_field_pairs_to_insert_model:
            if rec['model'] != f'cedamoles_app.{target_model}':
                continue

            fields = rec['fields']
            if field_name not in fields or not fields[field_name]:
                continue

            reference = fields[field_name]
            if isinstance(reference, list):
                fields[field_name] = [lookup[pk] for pk in reference]
            else:
                fields[field_name] = lookup[reference]

    return kept

In [8]:
# e.g. if relation between 2 models is done via foreign key (identifiers and observations) then fields from non-referenceable will be added to the referenceable

def include_on_foreign_key(data, model_name, new_field_name=''):
    """Attach records of ``model_name`` to the records their foreign key points at.

    The foreign-key field is detected on the first ``model_name`` record:
    ``'relatedTo'`` if present, otherwise ``'ob_ref'`` if present, otherwise
    ``'relatedRecord'``. Every remaining record that has a ``'uuid'`` field and
    whose PK is referenced gets a new field holding the referencing record's
    fields (without the foreign key): a single dict when exactly one record
    refers to it, a list of dicts when several do. Target records are
    modified in place.

    Args:
        data: list of fixture records (``model`` / ``pk`` / ``fields``).
        model_name: model (without the app label) whose records are attached.
        new_field_name: name of the field to create; defaults to ``model_name``.

    Returns:
        The records of ``data`` without the ``model_name`` records. When the
        fixture holds no ``model_name`` records at all, the other records are
        returned untouched (previously this raised ``IndexError``).
    """
    model_label = f'cedamoles_app.{model_name}'
    children = [rec['fields'] for rec in data if rec['model'] == model_label]
    remaining = [rec for rec in data if rec['model'] != model_label]

    # Nothing to attach: avoid indexing into an empty list below.
    if not children:
        return remaining

    first_child = children[0]
    if 'relatedTo' in first_child:
        rel_field_name = 'relatedTo'
    elif 'ob_ref' in first_child:
        rel_field_name = 'ob_ref'
    else:
        rel_field_name = 'relatedRecord'

    # Group the child records by the PK they point at.
    children_by_parent = {}
    for child in children:
        children_by_parent.setdefault(child[rel_field_name], []).append(child)

    target_field = new_field_name if new_field_name else model_name
    for rec in remaining:
        if 'uuid' not in rec['fields'] or rec['pk'] not in children_by_parent:
            continue

        stripped = [
            {k: v for k, v in child.items() if k != rel_field_name}
            for child in children_by_parent[rec['pk']]
        ]
        # Keep the historical shape: one child -> dict, several -> list of dicts.
        rec['fields'][target_field] = stripped[0] if len(stripped) == 1 else stripped

    return remaining

In [9]:
# relatedobservationinfo is handled separately: it is attached via its 'objectObservation' field

def include_related_obs_info(data):
    """Attach ``relatedobservationinfo`` records to the records they describe.

    A remaining record whose PK equals the ``objectObservation`` of an info
    record gets a ``relatedObservationInfo`` field with that info record's
    fields (minus ``objectObservation``). When several info records point at
    the same PK, the last one wins. The info records are dropped from the result.
    """
    info_label = 'cedamoles_app.relatedobservationinfo'

    info_by_target = {}
    remaining = []
    for record in data:
        if record['model'] == info_label:
            info_by_target[record['fields']['objectObservation']] = record['fields']
        else:
            remaining.append(record)

    for record in remaining:
        if record['pk'] not in info_by_target:
            continue
        info = info_by_target[record['pk']]
        record['fields']['relatedObservationInfo'] = {
            key: value for key, value in info.items() if key != 'objectObservation'
        }

    return remaining

### Creating relations and removing PKs

In [10]:
# maps PK to UUID

# PK -> UUID lookup, built lazily on the first call of ``map_pk_to_uuid``.
# It used to be built when this cell was executed, i.e. before the main script
# further down has written 'fixture2.json.gz'; on a fresh run that either
# failed or silently used a stale file from a previous run. The full fixture
# is also no longer kept alive in a module-level variable.
my_map = None


def _load_pk_to_uuid_map(filename='fixture2.json.gz'):
    """Build the PK -> UUID lookup from the flattened fixture file."""
    return {
        record['pk']: record['fields']['uuid']
        for record in json_to_dict(filename)
        # records without a UUID cannot be the target of a UUID relation
        if 'uuid' in record['fields']
    }


def map_pk_to_uuid(pk, mapping=None):
    """Return the UUID registered for ``pk``, or ``''`` when there is none.

    Args:
        pk: primary key to translate.
        mapping: optional explicit PK -> UUID dict. When omitted, the lookup
            is loaded once from 'fixture2.json.gz' and cached in ``my_map``.
    """
    global my_map

    if mapping is None:
        if my_map is None:
            my_map = _load_pk_to_uuid_map()
        mapping = my_map

    return mapping.get(pk, '')

In [11]:
def replace_pks(obj, field):
    """Replace the PK found at the end of a field path with the matching UUID.

    Args:
        obj: a record's fields (dict), a list of such values, or the value
            reached so far while walking the path.
        field: remaining path as a list of keys; an empty list means ``obj``
            itself is the value to translate.

    Returns:
        ``obj`` with the PK(s) at the end of the path replaced via
        ``map_pk_to_uuid``. Dicts are updated in place; lists are rebuilt.
        Anything that does not contain the path is returned unchanged.
    """
    # Lists are transparent: apply the same path to every element.
    if isinstance(obj, list):
        return [replace_pks(item, field) for item in obj]

    if not field:
        # bool is a subclass of int; True/False are never primary keys.
        if isinstance(obj, int) and not isinstance(obj, bool):
            return map_pk_to_uuid(obj)
        return obj

    # An intermediate None or scalar cannot be descended into; leave it alone
    # (previously this raised TypeError on the membership test).
    if not isinstance(obj, dict) or field[0] not in obj:
        return obj

    obj[field[0]] = replace_pks(obj[field[0]], field[1:])
    return obj
    

In [10]:
# replaces PKs with UUIDs

# Field paths whose values are rewritten by swap_pks_to_uuids.
# A '/' separates the keys of a nested path.
list_of_fields = [
    # top-level fields
    'independentInstrument',
    'mobilePlatformOperation',
    'computationComponent',
    'acquisitionComponent',
    'subInstrument',
    'platform_field',
    'result_field',
    'procedureAcquisition',
    'procedureComputation',
    'procedureCompositeProcess',
    'projects',
    'member',
    'parentProject',
    'observationCollection',
    # nested fields
    'relatedObservationInfo/subjectObservation',
    'instrumentPlatformPair/instrument',
    'instrumentPlatformPair/platform',
    'note/relatedRecord',
    'reviewNote/commentator',
    # top-level fields
    'softwareReference',
    'childPlatform',
    'oldDataPath',
]

def swap_pks_to_uuids(data):
    """Run ``replace_pks`` on every record for every path in ``list_of_fields``.

    Each record's ``'fields'`` entry is reassigned with the result; the records
    are returned in a new list, in their original order.
    """
    field_paths = [entry.split('/') for entry in list_of_fields]

    swapped = []
    for record in data:
        for field_path in field_paths:
            record['fields'] = replace_pks(record['fields'], field_path)
        swapped.append(record)

    return swapped

In [13]:
# rearrange structure of records by removing PKs and bringing fields to the upper level

def remove_pks(data):
    """Flatten records to their ``'fields'`` dict plus a ``'model'`` entry.

    The PK is dropped and the model name (without the app label) is stored
    under ``'model'`` inside the fields dict, which is modified in place.
    """
    flattened = []
    for record in data:
        model_name = record['model'].split('.')[1]
        fields = record['fields']
        fields['model'] = model_name
        flattened.append(fields)

    return flattened

In [14]:
# removes specified field from dictionary recursively

def remove_from_dict(data, field_path):
    """Remove the field addressed by a '/'-separated path from ``data``.

    NOTE: this function is defined twice in this notebook; the later
    definition is the one in effect once both cells have been run.

    Args:
        data: dict to remove the field from; may also be ``None``, a list
            (the path is applied to every element) or a scalar.
        field_path: '/'-separated key path, e.g. ``'note/commentator'``.

    Returns:
        ``None`` for ``None`` input; otherwise the data without the field.
        Removing a key of ``data`` itself returns a new dict, while deeper
        removals update the enclosing dicts in place. Values that do not
        contain the path are returned unchanged.
    """
    if data is None:
        return None

    # count_not_nulls_helper builds paths that pass straight through lists,
    # so a path has to be applied to every element of a list as well.
    if isinstance(data, list):
        return [remove_from_dict(item, field_path) for item in data]

    # A scalar cannot contain the field (a str would otherwise be subject to
    # a substring test and then fail on .items()).
    if not isinstance(data, dict):
        return data

    path_parts = field_path.split('/')
    head, rest = path_parts[0], path_parts[1:]

    if head not in data:
        return data

    if not rest:
        return {k: v for k, v in data.items() if k != head}

    data[head] = remove_from_dict(data[head], '/'.join(rest))
    return data
  
    

In [15]:
# removes specified field of one of the models from dictionary

def remove_field_from_model(data, model_name, field_path):
    """Remove ``field_path`` from every record whose ``'model'`` is ``model_name``.

    NOTE: this function is defined twice in this notebook; the later
    definition is the one in effect once both cells have been run.

    Args:
        data: list of flattened records (dicts with a ``'model'`` key).
        model_name: model whose records are processed.
        field_path: '/'-separated path handed to ``remove_from_dict``.

    Returns:
        A new list with one entry per input record, in the original order.
        A record that cannot be processed is reported and kept as it is;
        previously such a record was silently dropped from the result.
    """
    output = []

    for record in data:
        try:
            if record['model'] == model_name:
                record = remove_from_dict(record, field_path)
        except Exception:
            # Best effort: report the problem but never lose the record.
            print(f'Problem with path {field_path}. i = {record}')
        output.append(record)

    return output

In [16]:
# takes list of model-fieldpath pairs, removes them from the fixture and saves result to the new one

def remove_fields_from_models_in_fixture(list_of_model_fieldpath_pairs, file_in = 'fixture3.json.gz', file_out = 'fixture4.json.gz'):
    """Load ``file_in``, strip every (model, field path) pair and save to ``file_out``."""
    records = json_to_dict(file_in)

    for model_name, field_path in list_of_model_fieldpath_pairs:
        records = remove_field_from_model(records, model_name, field_path)

    dict_to_json(records, file_out)


### Analyzing non empty values

In [17]:
def count_not_nulls_helper(path, obj, out_dict):
    """Accumulate, per path, how many non-empty values were seen.

    Empty values (anything falsy except ``0``, plus ``None`` and ``' '``) only
    register the path with its current count. Lists pass the same path on to
    each element. Any other value adds one to the path's count, and dicts are
    then descended into with ``path/key``.
    """
    is_empty = (obj != 0 and not obj) or obj is None or obj == ' '
    if is_empty:
        # make sure the path exists without changing its count
        out_dict.setdefault(path, 0)
        return

    if isinstance(obj, list):
        for element in obj:
            count_not_nulls_helper(path, element, out_dict)
        return

    out_dict[path] = out_dict.get(path, 0) + 1

    if isinstance(obj, dict):
        for key in obj:
            count_not_nulls_helper(f'{path}/{key}', obj[key], out_dict)


In [18]:
# counts how many non empty/non None values of certain field appeared in the database

def count_not_nulls(grouped_data, group_by_model = True):
    """Count non-empty values per field path over all grouped records.

    With ``group_by_model`` the paths start with the model name; otherwise
    they start from an empty root, so equal paths of different models merge.
    """
    counts = {}

    for model_name, records in grouped_data.items():
        root_path = model_name if group_by_model else ''
        for record in records:
            count_not_nulls_helper(root_path, record, counts)

    return counts

In [19]:
# groups records by model

def group_by_model(data):
    """Group records by their ``'model'`` value.

    Models appear in the result in order of first appearance in ``data``.
    The keys used to be created from a ``set``, whose iteration order for
    strings can differ between interpreter runs, which made the order of the
    generated report files non-reproducible.

    Returns:
        dict mapping each model name to the list of its records, in input order.
    """
    grouped = {}
    for record in data:
        grouped.setdefault(record['model'], []).append(record)

    return grouped

In [20]:
# saves distribution of values to the txt file

def save_distribution_to_the_file(data, filename='distribution.txt', group_by_count = False, group_models = True):
    """Write the non-empty value counts of ``data`` to a text file.

    Args:
        data: list of flattened records (dicts with a ``'model'`` key).
        filename: output file name. With ``group_by_count`` the result goes to
            ``'<filename without extension>_grouped.txt'`` instead.
        group_by_count: when false, write one ``path: count`` line per path,
            with a blank line whenever the first path segment changes; when
            true, write the paths grouped under their count, counts ascending.
        group_models: passed to ``count_not_nulls``; when false the paths are
            not prefixed with the model name.
    """
    import os  # local import: only needed to strip the file extension

    distribution = count_not_nulls(group_by_model(data), group_models)

    if not group_by_count:
        with open(filename, 'w') as f:
            previous_model = ''
            for path, count in distribution.items():
                model = path.split('/')[0]

                # blank line between the blocks of different models
                if previous_model != model:
                    f.write('\n')

                previous_model = model
                f.write(f'{path}: {count}\n')
        return

    paths_by_count = {}
    for path, count in distribution.items():
        paths_by_count.setdefault(count, []).append(path)

    # Only the extension is stripped. Splitting on the first '.' truncated
    # names such as 'fixture3.5_distribution.txt' or paths like './out/d.txt'.
    stem = os.path.splitext(filename)[0]
    with open(f'{stem}_grouped.txt', 'w') as f:
        for count, paths in sorted(paths_by_count.items()):
            f.write(f'{count}:\n')
            for path in paths:
                f.write(f'\t{path}\n')

### Removing fields

In [21]:
# removes specified field from dictionary recursively

def remove_from_dict(data, field_path):
    """Remove the field addressed by a '/'-separated path from ``data``.

    NOTE: this function is defined twice in this notebook; this later
    definition is the one in effect once both cells have been run.

    Args:
        data: dict to remove the field from; may also be ``None``, a list
            (the path is applied to every element) or a scalar.
        field_path: '/'-separated key path, e.g. ``'note/commentator'``.

    Returns:
        ``None`` for ``None`` input; otherwise the data without the field.
        Removing a key of ``data`` itself returns a new dict, while deeper
        removals update the enclosing dicts in place. Values that do not
        contain the path are returned unchanged.
    """
    if data is None:
        return None

    # count_not_nulls_helper builds paths that pass straight through lists,
    # so a path has to be applied to every element of a list as well.
    if isinstance(data, list):
        return [remove_from_dict(item, field_path) for item in data]

    # A scalar cannot contain the field (a str would otherwise be subject to
    # a substring test and then fail on .items()).
    if not isinstance(data, dict):
        return data

    path_parts = field_path.split('/')
    head, rest = path_parts[0], path_parts[1:]

    if head not in data:
        return data

    if not rest:
        return {k: v for k, v in data.items() if k != head}

    data[head] = remove_from_dict(data[head], '/'.join(rest))
    return data
  

In [22]:
# removes specified field of one of the models from dictionary

def remove_field_from_model(data, model_name, field_path):
    """Remove ``field_path`` from every record whose ``'model'`` is ``model_name``.

    NOTE: this function is defined twice in this notebook; this later
    definition is the one in effect once both cells have been run.

    Args:
        data: list of flattened records (dicts with a ``'model'`` key).
        model_name: model whose records are processed.
        field_path: '/'-separated path handed to ``remove_from_dict``.

    Returns:
        A new list with one entry per input record, in the original order.
        A record that cannot be processed is reported and kept as it is;
        previously such a record was silently dropped from the result.
    """
    output = []

    for record in data:
        try:
            if record['model'] == model_name:
                record = remove_from_dict(record, field_path)
        except Exception:
            # Best effort: report the problem but never lose the record.
            print(f'Problem with path {field_path}. i = {record}')
        output.append(record)

    return output

In [23]:
# takes list of model-fieldpath pairs, removes them from the fixture and saves result to the new one

def remove_fields_from_models_in_fixture(list_of_model_fieldpath_pairs, file_in = 'fixture3.json.gz', file_out = 'fixture4.json.gz'):
    """Load ``file_in``, strip every (model, field path) pair and save to ``file_out``."""
    records = json_to_dict(file_in)

    for model_name, field_path in list_of_model_fieldpath_pairs:
        records = remove_field_from_model(records, model_name, field_path)

    dict_to_json(records, file_out)

### Saving structure of fixture

In [24]:
# compress all the records under model to one record with non empty fields (if possible) which will be representative

def compress_groups(data_by_model):
    """Merge the records of each model into one representative record.

    For every key the first truthy value met is kept; while the stored value
    is still falsy (or the key is missing) later records may overwrite it.
    """
    representatives = {}

    for model_name, records in data_by_model.items():
        merged = {}
        for record in records:
            for key, value in record.items():
                # a missing key and a falsy stored value are both replaceable
                if not merged.get(key):
                    merged[key] = value
        representatives[model_name] = merged

    return representatives

In [25]:
# convert values to the type name

def convert_values_to_types(data):
    """Return a copy of ``data`` in which every value is replaced by its type name.

    * dicts are converted key by key into a NEW dict. The input is no longer
      modified in place: the nested dicts handed in by ``save_structure`` are
      shared with the original records, so the in-place version overwrote
      their values with type names.
    * a non-empty list becomes the string ``'[<type of its first element>]'``;
      an empty list stays ``[]``.
    * a 32-character hexadecimal string is reported as ``'uuid'``; other
      32-character strings are now reported as ``'str'``.
    * anything else becomes the name of its class.
    """
    if isinstance(data, dict):
        return {key: convert_values_to_types(value) for key, value in data.items()}

    if isinstance(data, list):
        if data:
            return f'[{convert_values_to_types(data[0])}]'
        return []

    if isinstance(data, str) and len(data) == 32:
        hex_digits = set('0123456789abcdefABCDEF')
        if set(data) <= hex_digits:
            return 'uuid'

    return data.__class__.__name__

In [26]:
# save dictionary in a YAML-like format to make it readable

def save_formatted(obj, file, padding = ''):
    """Write ``obj`` to ``file`` as tab-indented text.

    Dict keys are written as ``key:`` lines with their values one tab deeper;
    list elements are written at the current depth; any other value is
    written on its own line. A blank line follows every top-level dict entry.
    """
    if isinstance(obj, list):
        for element in obj:
            save_formatted(element, file, padding)
        return

    if not isinstance(obj, dict):
        file.write(f'{padding}{obj}\n')
        return

    nested_padding = padding + '\t'
    for key, value in obj.items():
        file.write(f'{padding}{key}:\n')
        save_formatted(value, file, nested_padding)
        if not padding:
            file.write('\n')

In [27]:
# save structure of fixture to the file

def save_structure(data, suffix='', keep_values=False):
    """Write one representative record per model to ``structure<suffix>.txt``.

    Unless ``keep_values`` is set, the values are replaced by their type names.
    """
    structure = compress_groups(group_by_model(data))

    if not keep_values:
        structure = {
            model_name: convert_values_to_types(fields)
            for model_name, fields in structure.items()
        }

    with open(f'structure{suffix}.txt', 'w') as f:
        save_formatted(structure, f)

## Main script

### Fixture1.5 - tidying up

In [28]:
# Keep only the records whose model label starts with the 'cedamoles_app' app.
data = [
    record
    for record in json_to_dict('fixture.json.gz')
    if record['model'].split('.')[0] == 'cedamoles_app'
]
dict_to_json(data, 'fixture1.5.json.gz')

### Fixture2 - flat fixture

In [29]:
# after all the iterations the fixture is flattened down to the 11 referenceable models and saved as fixture2.json.gz

# Load the tidied fixture and copy the UUIDs of the referenceable records
# into the records of the matching models.
data = json_to_dict('fixture1.5.json.gz')
data = include_referenceable(data)

# Embed models that are referenced by PK. Every include_simple_field call also
# removes the embedded model's own records from `data`, so the order of the
# calls matters: a model has to receive its own embedded fields before it is
# embedded itself (constraints -> imagedetails, phenomenonname/phenomenonterm
# -> phenomenon, party -> responsiblepartyinfo/review/note).
data = include_simple_field(data, 'discoveryserviceid', [('observation', 'discoveryKeywords'), ('observationcollection', 'discoveryKeywords')])
data = include_simple_field(data, 'dqconformanceresult', [('observation', 'resultQuality')])
data = include_simple_field(data, 'constraints', [('imagedetails', 'imageConstraints'),('observation', 'permission')])
data = include_simple_field(data, 'imagedetails', [('acquisition', 'imageDetails'), 
                                                  ('computation', 'imageDetails'),
                                                  ('instrument', 'imageDetails'),
                                                  ('observationcollection', 'imageDetails'),
                                                  ('observation', 'imageDetails'),
                                                  ('platform', 'imageDetails'),
                                                  ('project', 'imageDetails'),
                                                  ])
data = include_simple_field(data, 'vocabularyterm', [('observation', 'vocabularyKeywords')])
data = include_simple_field(data, 'verticalextent', [('observation', 'verticalExtent')])
data = include_simple_field(data, 'timeperiod', [('mobileplatformoperation', 'operationTime'),
                                                 ('observation', 'timePeriod'),
                                                 ('observation', 'validTimePeriod')])
data = include_simple_field(data, 'party', [('responsiblepartyinfo', 'party'),
                                            ('review', 'commentator'),
                                            ('review', 'reviewer'),
                                            ('note', 'commentator')])
data = include_simple_field(data, 'phenomenonname', [('phenomenon', 'names')])
data = include_simple_field(data, 'phenomenonterm', [('phenomenon', 'terms')])
data = include_simple_field(data, 'phenomenon', [('observation', 'phenomena')])
data = include_simple_field(data, 'geographicboundingbox', [('mobileplatformoperation', 'location'),
                                                            ('observation', 'geographicExtent'),
                                                            ('platform', 'location')])

# Attach models that point at a record through a foreign key. The second
# argument, when given, is the name of the field created on the target record;
# otherwise the model name is used.
data = include_on_foreign_key(data, 'drsdataset', 'drsDataset')
data = include_on_foreign_key(data, 'identifier')
data = include_on_foreign_key(data, 'onlineresource', 'onlineResource')
data = include_on_foreign_key(data, 'migrationproperty', 'migrationProperty')
data = include_on_foreign_key(data, 'note')
data = include_on_foreign_key(data, 'responsiblepartyinfo', 'responsiblePartyInfo')
data = include_on_foreign_key(data, 'review')

data = include_simple_field(data, 'inputoutputdescription', [('computation', 'inputDescription'),
                                                            ('computation', 'outputDescription'),
                                                            ('acquisition', 'outputDescription')])
data = include_on_foreign_key(data, 'instrumentplatformpair', 'instrumentPlatformPair')
# NOTE(review): 'review' records were already attached (and removed from `data`)
# a few lines above, so this call can only drop the reviewnote records without
# embedding them anywhere - confirm the intended order.
data = include_simple_field(data, 'reviewnote', [('review', 'reviewNotes')])



# relatedobservationinfo is attached via its 'objectObservation' field.
data = include_related_obs_info(data)

dict_to_json(data, 'fixture2.json.gz')




### Fixture3 - relations via UUID; PKs removed

In [30]:
# relations between models are established by UUIDs; PKs got removed; fixture saved as fixture3.json

# fixture2 -> fixture3: swap PK references for UUIDs, then flatten the records.
flat_fixture = json_to_dict('fixture2.json.gz')
data = remove_pks(swap_pks_to_uuids(flat_fixture))
dict_to_json(data, 'fixture3.json.gz')



In [31]:
# Reports for fixture3: structure, per-path counts, and counts grouped by value
# (with and without the model prefix).
data = json_to_dict('fixture3.json.gz')
save_structure(data, suffix='_3')
save_distribution_to_the_file(data, filename='distribution_3.txt')
save_distribution_to_the_file(data, filename='distribution_3.txt', group_by_count=True)
save_distribution_to_the_file(data, filename='distribution_3_m.txt', group_by_count=True, group_models=False)

## Fixture3.5 - modifying geo extent to match the ES format

In [32]:
def modify_geo_extent(geo_extent):
    """Convert a bounding box dict into an ``envelope`` shape.

    Returns ``None`` for ``None`` input, otherwise a dict with ``'type'`` set
    to ``'envelope'`` and ``'coordinates'`` holding ``[west, north]`` followed
    by ``[east, south]``.
    """
    if geo_extent is None:
        return None

    west_north = [geo_extent['westBoundLongitude'], geo_extent['northBoundLatitude']]
    east_south = [geo_extent['eastBoundLongitude'], geo_extent['southBoundLatitude']]

    return {'type': 'envelope', 'coordinates': [west_north, east_south]}

In [33]:
data = json_to_dict('fixture3.json.gz')

for index, record in enumerate(data):
    model_name = record['model']

    if model_name == 'observation':
        # observations already use 'geographicExtent'; only the shape changes
        record['geographicExtent'] = modify_geo_extent(record['geographicExtent'])
        data[index] = record

    if model_name in ['mobileplatformoperation', 'platform'] and 'location' in record:
        # 'location' is converted and stored as 'geographicExtent' instead
        record['geographicExtent'] = modify_geo_extent(record['location'])
        data[index] = {key: value for key, value in record.items() if key != 'location'}


dict_to_json(data, 'fixture3.5.json.gz')

### Fixture4 - nulls removed

In [34]:
# fields which haven't been used across the entire DB are removed from it; fixture saved as fixture4.json

# NOTE(review): this reads fixture3.json.gz, so the geo-extent changes saved to
# fixture3.5.json.gz are not carried into fixture4 - confirm this is intended.
data = json_to_dict('fixture3.json.gz')
grouped_data = group_by_model(data)
distribution = count_not_nulls(grouped_data)
# keep only the paths that never hold a non-empty value
distribution = {k: v for k, v in distribution.items() if v == 0}

fields_to_be_removed = []
with open('empty_fields.txt', 'w') as f:
    for empty_path in distribution:
        f.write(f'{empty_path}\n')
        # The first segment is the model, the rest is the field path.
        # partition() also copes with a path without '/', where find() == -1
        # used to chop the last character off the model name.
        model, separator, path = empty_path.partition('/')
        fields_to_be_removed.append((model, path))

remove_fields_from_models_in_fixture(fields_to_be_removed, 'fixture3.json.gz', 'fixture4.json.gz')


In [35]:
data = json_to_dict('fixture4.json.gz')
save_structure(data, '_4')
save_distribution_to_the_file(data, 'distribution_4.txt')
# save_distribution_to_the_file appends '_grouped' itself when grouping by
# count; passing 'distribution_4_grouped.txt' produced
# 'distribution_4_grouped_grouped.txt'.
save_distribution_to_the_file(data, 'distribution_4.txt', True)

: 

### Fixture5 - irrelevant fields removed after the discussion

In [42]:
# any fields specified in fields_to_be_removed are removed from the fixture and the new fixture is saved as fixture5.json.gz

# (model, field path) pairs to strip from fixture4
fields_to_be_removed = [('result', 'review')]

remove_fields_from_models_in_fixture(
    fields_to_be_removed,
    file_in='fixture4.json.gz',
    file_out='fixture5.json.gz',
)

## Everything

### Analyzing sizes of fixtures

In [14]:
def count_elements(data):
    """Count the dict keys found anywhere inside ``data``.

    Every key of every (nested) dict counts as one element; lists only pass
    the count of their elements through and scalars count as zero.
    """
    if isinstance(data, list):
        return sum(count_elements(item) for item in data)

    if isinstance(data, dict):
        return sum(1 + count_elements(value) for value in data.values())

    return 0

In [15]:
from pathlib import Path

def analyze_fixtures(filename, suffixes=('1.5', '2', '3', '3.5')):
    """Write a tab-separated size report of the fixture files to ``filename``.

    For every suffix ``s`` the file ``fixture<s>.json.gz`` is loaded and one
    row is written with: file name, compressed size on disk in MB (bytes /
    1,000,000), number of top-level records and number of dict keys in the
    whole fixture (see ``count_elements``).

    Args:
        filename: output file.
        suffixes: fixture versions to analyse; defaults to the versions that
            were previously hard-coded, so existing calls behave the same.
    """
    with open(filename, 'w') as f:
        f.write('name\tsize[MB]\tnumber_of_models\tnumber_of_fields\n')
        for s in suffixes:
            fname = f'fixture{s}.json.gz'
            data = json_to_dict(fname)
            fsize = Path(fname).stat().st_size / 1000000
            num_of_models = len(data)
            num_of_fields = count_elements(data)
            f.write(f'{fname}\t{fsize}\t{num_of_models}\t{num_of_fields}\n')


In [16]:
# The report is tab-separated even though the file is named .csv.
analyze_fixtures(filename='analysis_of_size.csv')

### Analyzing deepness of records

In [126]:
def save_analysis_of_deepness(filein, fileout):
    """Group the field paths of a fixture by nesting depth and save them.

    The depth of a path is the number of '/' separators it contains.
    """
    counts = count_not_nulls(group_by_model(json_to_dict(filein)))

    paths_by_depth = {}
    for path in counts:
        depth = path.count('/')
        paths_by_depth.setdefault(depth, []).append(path)

    with open(fileout, 'w') as f:
        save_formatted(paths_by_depth, f)


### Unzipping fixture

In [14]:
def unzip_fixture(filename):
    """Write the content of a gzipped JSON fixture as indented, plain JSON.

    The output name is ``filename`` without its last dot-separated part
    (e.g. ``'x.json.gz'`` -> ``'x.json'``).
    """
    records = json_to_dict(filename)

    # everything before the last '.'
    target_name = filename.rpartition('.')[0]

    with open(target_name, 'w') as f:
        json.dump(records, f, indent=4)

### Other

In [145]:
def is_datetime(value):
    """Tell whether ``value`` looks like an ISO-style ``YYYY-MM-DDT...`` string.

    Only the separators are checked: ``'-'`` at index 4 and 7 and ``'T'`` at
    index 10.

    Returns:
        ``True`` or ``False``. Previously a string that was long enough but
        did not match fell through and returned ``None``.
    """
    if not value or not isinstance(value, str):
        return False

    # too short to hold the 'T' separator at index 10
    if len(value) < 11:
        return False

    return value[4] == '-' and value[7] == '-' and value[10] == 'T'

In [141]:
def get_date_fields(data, path=''):
    """Collect the datetime-looking strings of ``data``, keyed by field path.

    Dict keys extend the path with ``/key``; list elements share the path of
    the list, so a later element overwrites an earlier one on the same path.
    """
    found = dict()

    if not data:
        return found

    if isinstance(data, dict):
        for key, value in data.items():
            found.update(get_date_fields(value, f'{path}/{key}'))

    elif isinstance(data, list):
        for element in data:
            found.update(get_date_fields(element, path))

    elif isinstance(data, str) and is_datetime(data):
        found[path] = data

    return found
            

In [345]:
def get_rare_attributes(data, number_of_appearance, filename):
    """List, per rarely used field path, the records that have a value for it.

    A path is rare when its non-empty count (models merged) is below
    ``number_of_appearance``. For each such path the ``'<uuid> <model>'`` of
    every record with a truthy value at that path is collected. The result is
    written to ``filename`` and returned.
    """
    counts = count_not_nulls(group_by_model(data), False)

    # paths as key lists, without the leading empty root segment
    rare_paths = [
        path.split('/')[1:]
        for path, count in counts.items()
        if count < number_of_appearance
    ]

    output = {'/'.join(parts): [] for parts in rare_paths}

    for record in data:
        for parts in rare_paths:
            value = record
            for part in parts:
                if value is None or part not in value:
                    value = None
                    break
                value = value[part]

            if value:
                output['/'.join(parts)].append(f"{record['uuid']} {record['model']}")

    with open(filename, 'w') as f:
        save_formatted(output, f)

    return output

In [346]:
# fields with fewer than 15 non-empty values across fixture3.5
data = json_to_dict('fixture3.5.json.gz')
rare_fields = get_rare_attributes(data, number_of_appearance=15, filename='rare_fields_with_uuids.txt')

In [None]:
# Display the full rare-field mapping returned by get_rare_attributes.
rare_fields

In [330]:
def get_by_uuid(data, uuid):
    """Return the first record whose ``'uuid'`` equals ``uuid``, or ``None``."""
    matches = (record for record in data if record['uuid'] == uuid)
    return next(matches, None)

In [349]:
def get_rare_uuids(rare_fields, fieldname):
    """Return the first whitespace-separated token of every entry under ``fieldname``."""
    entries = rare_fields[fieldname]
    return [entry.split()[0] for entry in entries]

In [None]:
# Reload fixture3.5 into `data` for the exploratory cells below.
data = json_to_dict('fixture3.5.json.gz')

In [367]:
# records that have a value for the rare 'review/reviewer/phone' field
uuids = get_rare_uuids(rare_fields, 'review/reviewer/phone')

items = [get_by_uuid(data, rare_uuid) for rare_uuid in uuids]

In [368]:
# keep only the fields of interest of every selected record
fields_to_pick = ['review', 'uuid']

items_filtered = [
    {key: value for key, value in record.items() if key in fields_to_pick}
    for record in items
]

In [370]:
# Display the first selected record in full.
items[0]

{'title': 'HADRT2.1: Bias adjusted global monthly fields of radiosonde temperature anomalies (1958-2004)',
 'abstract': 'The HADRT2.1 data are global monthly fields of radiosonde temperature anomalies at standard pressure levels on a 5 degree latitude by 10 degree longitude grid from 1958 to July 2004. \r\nAnomalies are calculated with respect to 1971-1990 climatology. Anomalies are available for 9 standard levels (850, 700, 500, 300, 200, 150, 100, 50, 30hPa) as well as tropospheric (850 - 300hPa) and stratospheric (150 - 30hPa) averages.\r\nThe data are degree Celsius anomalies from 1970-1990 means. Anomalies are calculated for each of about 200 sonde stations worldwide and grid values derived from these. \r\n\r\nHADRT2.1 is as HadRT2.0 but with bias corrections made to many station time series world-wide. The adjustments were calculated by reference to MSU data products, but only for known changes in instrumental or operational procedures for the period post 1979. No data are availa

In [373]:
# Reload fixture3.5 into `data` before counting responsiblePartyInfo entries.
data = json_to_dict('fixture3.5.json.gz')


In [390]:
# How many responsiblePartyInfo entries does each record carry?
# A single entry is stored as a dict, several entries as a list.
items = [record['responsiblePartyInfo'] for record in data if 'responsiblePartyInfo' in record]
counted = [len(entry) if isinstance(entry, list) else 1 for entry in items]

# number of entries -> number of records with that many entries, ascending
grouped_by_count = dict()
for size in counted:
    grouped_by_count.setdefault(size, 0)
    grouped_by_count[size] += 1

grouped_by_count = dict(sorted(grouped_by_count.items()))

In [393]:
# Display: number of responsiblePartyInfo entries -> number of records.
grouped_by_count

{1: 1397,
 2: 5238,
 3: 1069,
 4: 360,
 5: 1688,
 6: 255,
 7: 281,
 8: 2211,
 9: 2596,
 10: 1897,
 11: 909,
 12: 1236,
 13: 300,
 14: 116,
 15: 309,
 16: 54,
 17: 37,
 18: 41,
 19: 23,
 20: 30,
 21: 15,
 22: 5,
 23: 21,
 24: 11,
 25: 5,
 26: 14,
 27: 6,
 28: 4,
 29: 2,
 30: 11,
 31: 3,
 32: 1,
 33: 4,
 34: 13,
 35: 35,
 36: 12,
 37: 1,
 38: 12,
 39: 3,
 40: 1,
 42: 1,
 49: 1,
 50: 1,
 51: 1,
 52: 1,
 55: 1,
 64: 1,
 65: 4,
 66: 1,
 70: 1,
 71: 2,
 72: 5,
 73: 1,
 74: 12,
 75: 24,
 87: 1,
 95: 1,
 98: 49,
 101: 1,
 102: 10,
 105: 48,
 161: 1}

In [399]:
# UUIDs of the records that carry exactly 105 responsiblePartyInfo entries
[
    record['uuid']
    for record in data
    if 'responsiblePartyInfo' in record
    and isinstance(record['responsiblePartyInfo'], list)
    and len(record['responsiblePartyInfo']) == 105
]

['6a42c000c2ad4532b74c08b16a33992d',
 'fdf588322e3b4ba5a083a822e1b5ebbd',
 '41da8d4e2a2843049c4ec7d6b52a23c8',
 'a66ea4a45396463fbaceb5dd4b1886b5',
 '976fa74c1f3444d19133840bcf6b320f',
 'd6e0c02eb7d546738d6620aa05893d5d',
 '590f273c0ab14033ac50bd2f0706de8d',
 '8c135f786d904e45a0f5f7204675db73',
 '39279a787f69427d815192ca6458582e',
 '27813c7c87894902bdb672a5fc16cc1f',
 'd2d093f4afe54cf49da199db83dbc51e',
 'c20d07b4e692494f919382310e43b225',
 'dc567372a8424c3097988d60db7cf06c',
 'ef9dc7705e0648f8b6c1f23cf0ef8ed1',
 'd1535aa7a15f43f9a03642ca26f92c51',
 '376553085b61406b91a80ccc7fcd6bb4',
 'de71cf74fb504bb9bb8008de4aa8b95f',
 'b7952776ba5546fdb2a15104a6f382b0',
 '14ca2574a76e4b2d843e94c5c7be6be4',
 '2fddc05e8b224385a147073a62f26d4b',
 '6d4ba43b7596451192298dd566cb1927',
 'b91e9e2f056c4046b587fb71c5da44ba',
 '229b7a8ccb8e43839422252048eb9958',
 'c8b198d4c94e48fa921234848b40f7f7',
 '8f791a3acdaf4308974562960e18421b',
 'de16a3a97ce1424bb8b5c7a704692c71',
 '246db47c9e104710a43262b99f553490',
 

In [381]:
# Look up a single record by its UUID for manual inspection.
item = get_by_uuid(data, '42a4ab662c734acfa46d5a1b231364ad')

In [385]:
# Show the container type of every entry in `items`.
# NOTE(review): the execution counts are out of order here, so it is unclear
# which cell last assigned `items` - presumably the responsiblePartyInfo one.
[type(i) for i in items]

[dict,
 dict,
 dict,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,
 list,

In [2]:
# Peek at the first ten raw records of the tidied fixture.
data = json_to_dict('fixture1.5.json.gz')
data[:10]

[{'model': 'cedamoles_app.referenceable',
  'pk': 1,
  'fields': {'uuid': '3b1a86cc61824d78ce195dc21b661c74',
   'short_code': 'coll'}},
 {'model': 'cedamoles_app.referenceable',
  'pk': 4,
  'fields': {'uuid': 'fab53ee460e05f1b68e23657f4b6c5f4',
   'short_code': 'proj'}},
 {'model': 'cedamoles_app.referenceable',
  'pk': 6,
  'fields': {'uuid': 'edbc618730c043a383b8fa9b8200cfb6',
   'short_code': 'plat'}},
 {'model': 'cedamoles_app.referenceable',
  'pk': 7,
  'fields': {'uuid': 'c7fa005e2095425392b18adbd7b40617',
   'short_code': 'instr'}},
 {'model': 'cedamoles_app.referenceable',
  'pk': 8,
  'fields': {'uuid': '110154772fc04bdf8029022b40e19521',
   'short_code': 'mpop'}},
 {'model': 'cedamoles_app.referenceable',
  'pk': 9,
  'fields': {'uuid': '0236ade7cd5f4986bb7f1722801ff204', 'short_code': 'acq'}},
 {'model': 'cedamoles_app.referenceable',
  'pk': 11,
  'fields': {'uuid': '7e23b82ec3bdc8e5297c0b623697c559', 'short_code': 'ob'}},
 {'model': 'cedamoles_app.referenceable',
  'pk'

In [9]:
# Peek at ten records from the middle of the currently loaded `data`.
data[52000:52010]

[{'model': 'cedamoles_app.responsiblepartyinfo',
  'pk': 14017,
  'fields': {'priority': 1,
   'party': 19,
   'role': 'ceda_officer',
   'relatedTo': 3165}},
 {'model': 'cedamoles_app.responsiblepartyinfo',
  'pk': 14018,
  'fields': {'priority': 1,
   'party': 9,
   'role': 'operator',
   'relatedTo': 3165}},
 {'model': 'cedamoles_app.responsiblepartyinfo',
  'pk': 14019,
  'fields': {'priority': 1,
   'party': 1,
   'role': 'publisher',
   'relatedTo': 3167}},
 {'model': 'cedamoles_app.responsiblepartyinfo',
  'pk': 14020,
  'fields': {'priority': 1,
   'party': 1,
   'role': 'custodian',
   'relatedTo': 3167}},
 {'model': 'cedamoles_app.responsiblepartyinfo',
  'pk': 14021,
  'fields': {'priority': 1,
   'party': 1,
   'role': 'distributor',
   'relatedTo': 3167}},
 {'model': 'cedamoles_app.responsiblepartyinfo',
  'pk': 14022,
  'fields': {'priority': 1,
   'party': 1,
   'role': 'point_of_contact',
   'relatedTo': 3167}},
 {'model': 'cedamoles_app.responsiblepartyinfo',
  'pk': 1