# Analysis of the Moles DB fixture

## Save load

In [19]:
import json
import gzip

# loads json file to dictionary
def json_to_dict(filename):
    """Read a gzip-compressed, UTF-8 encoded JSON file and return the parsed object."""
    with gzip.open(filename, 'rb') as archive:
        raw = archive.read()
    return json.loads(raw.decode('utf-8'))

# saves dictionary to json file
def dict_to_json(data, filename):
    """Serialise ``data`` to JSON and write it, UTF-8 encoded, into a gzip file."""
    with gzip.open(filename, 'wb') as archive:
        payload = json.dumps(data).encode('utf-8')
        archive.write(payload)

## Flattening fixture

### Referenceable

In [20]:
# returns model's name
def get_model_name(obj):
    """Return the second dot-separated component of ``obj['model']``."""
    label_parts = obj['model'].split('.')
    return label_parts[1]

In [21]:
# maps shortcode to model name

# Lookup table from a short code to the lower-case model name.
# NOTE: the name shadows the builtin ``map``; it is kept because
# map_shortcode_to_model_name (and possibly other cells) refer to it.
map = {
    'acq': 'acquisition',
    'cmppr': 'compositeprocess',
    'coll': 'observationcollection',
    'comp': 'computation',
    'instr': 'instrument',
    'mpop': 'mobileplatformoperation',
    'ob': 'observation',
    'plat': 'platform',
    'proj': 'project',
    'result': 'result',
    'excit': 'externalcitation',
}


def map_shortcode_to_model_name(short_code, full=False):
    """Translate a short code into its model name.

    With ``full=True`` the name is prefixed with ``cedamoles_app.``.
    Raises KeyError for a short code that is not in the table.
    """
    model_name = map[short_code]
    return f'cedamoles_app.{model_name}' if full else model_name

In [22]:
# returns dict of referenceable objects where keys are PKs and values are fields of those objects

def get_referenceable_dict_from_list(ref_list):
    """Index a list of fixture records by primary key.

    Returns a dict mapping each record's ``pk`` to its ``fields`` dict; when a
    PK occurs more than once the last record wins.
    """
    return {record['pk']: record['fields'] for record in ref_list}

In [23]:
# inserts UUIDs into fields of corresponding models 

def include_referenceable(data):
    """Copy UUIDs from ``referenceable`` records onto the records they describe.

    The ``cedamoles_app.referenceable`` records are removed from the result.
    Every remaining record whose PK has a referenceable entry, and whose model
    equals the model named by that entry's ``short_code``, gets a ``uuid`` key
    added to its ``fields`` (the record is modified in place).

    Returns a new list holding the remaining records in their original order.
    """
    ref_records = []
    remaining = []
    for record in data:
        if record['model'] == 'cedamoles_app.referenceable':
            ref_records.append(record)
        else:
            remaining.append(record)

    referenceable = get_referenceable_dict_from_list(ref_records)

    for record in remaining:
        key = record['pk']
        if key not in referenceable:
            continue
        ref_fields = referenceable[key]
        # PKs alone are not enough to pair records, so the model must match too.
        expected_model = map_shortcode_to_model_name(ref_fields['short_code'], True)
        if record['model'] == expected_model:
            record['fields']['uuid'] = ref_fields['uuid']

    return remaining


### Inserting models into models

In [24]:
# e.g. if an observation has a field `phenomena` which refers to the phenomenon model by its PK, that model's fields will be inserted into the observation

def include_simple_field(data, model_name, model_and_field_pairs_to_insert_model):
    """Inline records of ``model_name`` into the fields that reference them by PK.

    ``model_and_field_pairs_to_insert_model`` is an iterable of
    ``(model, field)`` pairs. For every record of ``cedamoles_app.<model>``
    whose ``field`` is present and truthy, the PK (or each PK of a list) is
    replaced by the ``fields`` dict of the referenced ``model_name`` record.

    Records of ``model_name`` are dropped from the returned list; the other
    records are modified in place. Raises KeyError for a PK with no record.
    """
    inlined_label = f'cedamoles_app.{model_name}'
    lookup = {}
    remaining = []
    for record in data:
        if record['model'] == inlined_label:
            lookup[record['pk']] = record['fields']
        else:
            remaining.append(record)

    for record in remaining:
        fields = record['fields']
        for owner, field_name in model_and_field_pairs_to_insert_model:
            if record['model'] != f'cedamoles_app.{owner}':
                continue
            if field_name not in fields or not fields[field_name]:
                continue
            reference = fields[field_name]
            if isinstance(reference, list):
                fields[field_name] = [lookup[pk] for pk in reference]
            else:
                fields[field_name] = lookup[reference]

    return remaining

In [25]:
# e.g. if relation between 2 models is done via foreign key (identifiers and observations) then fields from non-referenceable will be added to the referenceable

def include_on_foreign_key(data, model_name, new_field_name=''):
    """Attach records of ``model_name`` to records that carry a ``uuid``.

    Records of ``cedamoles_app.<model_name>`` are removed from the result. For
    every remaining record that has a ``uuid`` field and whose PK equals the PK
    of a removed record, the removed record's fields (minus ``relatedTo`` and
    ``ob_ref``) are stored under ``new_field_name`` (or ``model_name`` when
    ``new_field_name`` is empty). Target records are modified in place.

    NOTE(review): records are paired on the related record's own PK, not on
    its ``relatedTo``/``ob_ref`` value - confirm this is the intended join.
    """
    related_label = f'cedamoles_app.{model_name}'
    target_key = new_field_name or model_name

    related = {}
    kept = []
    for record in data:
        if record['model'] == related_label:
            related[record['pk']] = record['fields']
        else:
            kept.append(record)

    for record in kept:
        if 'uuid' not in record['fields'] or record['pk'] not in related:
            continue
        record['fields'][target_key] = {
            key: value
            for key, value in related[record['pk']].items()
            if key not in ('relatedTo', 'ob_ref')
        }

    return kept

In [26]:
# relatedObservationInfo is a special case: it is matched through its objectObservation field instead of its own PK

def include_related_obs_info(data):
    """Attach ``relatedobservationinfo`` records to the records they point at.

    Each ``cedamoles_app.relatedobservationinfo`` record is indexed by its
    ``objectObservation`` field and removed from the result. Any remaining
    record whose PK equals that value gets a ``relatedObservationInfo`` field
    holding the info's fields without ``objectObservation``. Target records are
    modified in place; a new list is returned.
    """
    info_by_object = {}
    kept = []
    for record in data:
        if record['model'] == 'cedamoles_app.relatedobservationinfo':
            info_by_object[record['fields']['objectObservation']] = record['fields']
        else:
            kept.append(record)

    for record in kept:
        if record['pk'] not in info_by_object:
            continue
        info = info_by_object[record['pk']]
        record['fields']['relatedObservationInfo'] = {
            key: value for key, value in info.items() if key != 'objectObservation'
        }

    return kept

### Creating relations and removing PKs

In [27]:
# maps PK to UUID

# Load the flattened fixture once and index every record's UUID by its PK.
my_fixture = json_to_dict('fixture2.json.gz')
my_map = dict((record['pk'], record['fields']['uuid']) for record in my_fixture)


def map_pk_to_uuid(pk):
    """Return the UUID recorded for ``pk`` in ``my_map``, or '' when unknown."""
    return my_map.get(pk, '')

In [30]:
def replace_pks(obj, field):
    """Replace the PK found at path ``field`` inside ``obj`` with its UUID.

    ``field`` is a list of keys describing the path still to be walked.
    Lists are traversed element by element with the same remaining path. Once
    the path is exhausted, an integer value is translated with
    ``map_pk_to_uuid``; anything else is returned unchanged.

    Dicts are updated in place and returned. Values that cannot be descended
    into (``None``, strings, numbers) while path components remain are returned
    unchanged instead of raising TypeError.
    """
    if isinstance(obj, list):
        return [replace_pks(item, field) for item in obj]

    if not field:
        # bool is a subclass of int; a flag must never be mistaken for a PK.
        if isinstance(obj, int) and not isinstance(obj, bool):
            return map_pk_to_uuid(obj)
        return obj

    key = field[0]
    # Only dicts can hold the next path component; `key in None` or
    # `key in 5` would raise, and `key in 'text'` is a substring test.
    if not isinstance(obj, dict) or key not in obj:
        return obj

    obj[key] = replace_pks(obj[key], field[1:])
    return obj
    

In [60]:
# replaces PKs with UUIDs

# Field paths (components separated by '/') whose values are PKs that must be
# translated to UUIDs.
list_of_fields = [
    'independentInstrument',
    'mobilePlatformOperation',
    'computationComponent',
    'acquisitionComponent',
    'subInstrument',
    'platform_field',
    'result_field',
    'procedureAcquisition',
    'procedureComputation',
    'procedureCompositeProcess',
    'projects',
    'member',
    'parentProject',
    'observationCollection',
    'relatedObservationInfo/subjectObservation',
    'instrumentPlatformPair/instrument',
    'instrumentPlatformPair/platform',
    'note/commentator',
    'note/relatedRecord',
    'reviewNote/commentator',
    'softwareReference',
    'childPlatform',
    'oldDataPath',
]


def swap_pks_to_uuids(data):
    """Run ``replace_pks`` for every path of ``list_of_fields`` on each record.

    Each record's ``fields`` entry is reassigned to the result; the records
    are returned in a new list, in their original order.
    """
    def _swap(record):
        for field_path in list_of_fields:
            record['fields'] = replace_pks(record['fields'], field_path.split('/'))
        return record

    return [_swap(record) for record in data]

In [33]:
# rearrange structure of records by removing PKs and bringing fields to the upper level

def remove_pks(data):
    """Flatten fixture records by dropping the PK wrapper.

    For each record the ``fields`` dict is returned in place of the record,
    with a ``model`` key added that holds the second dot-separated component
    of the record's model label. The ``fields`` dicts are modified in place.
    """
    flattened = []
    for record in data:
        short_model = record['model'].split('.')[1]
        fields = record['fields']
        fields['model'] = short_model
        flattened.append(fields)
    return flattened

In [94]:
# removes specified field from dictionary recursively

def remove_from_dict(data, field_path):
    """Remove the entry addressed by ``field_path`` from a nested dict.

    ``field_path`` is a '/'-separated list of keys. When the first component
    is the last one, a copy of ``data`` without that key is returned;
    otherwise the nested value is processed recursively and written back into
    ``data``, which is then returned.

    ``data`` is returned unchanged when it is not a dict (``None``, a list, a
    string, a number, ...) or when it does not contain the first component, so
    a record whose field has an unexpected type no longer raises.
    """
    # Only dicts can be descended into; `key in 5` raises TypeError and
    # `key in 'text'` is a substring test rather than a key lookup.
    if not isinstance(data, dict):
        return data

    head, separator, rest = field_path.partition('/')

    if head not in data:
        return data

    if not separator:
        return {key: value for key, value in data.items() if key != head}

    data[head] = remove_from_dict(data[head], rest)
    return data
  
    

In [95]:
# removes specified field of one of the models from dictionary

def remove_field_from_model(data, model_name, field_path):
    """Remove ``field_path`` from every record whose ``model`` is ``model_name``.

    Returns a new list with one entry per input record, in the same order.
    A record for which the removal fails is reported on stdout and kept
    unchanged, so a malformed record never disappears from the fixture.
    """
    output = []

    for record in data:
        updated = record
        try:
            if record['model'] == model_name:
                updated = remove_from_dict(record, field_path)
        except (KeyError, TypeError, AttributeError):
            # Best effort: report the record but keep it. A bare `except`
            # would also swallow KeyboardInterrupt/SystemExit.
            print(f'Problem with path {field_path}. i = {record}')
        output.append(updated)

    return output

In [96]:
# takes list of model-fieldpath pairs, removes them from the fixture and saves result to the new one

def remove_fields_from_models_in_fixture(list_of_model_fieldpath_pairs, file_in = 'fixture3.json.gz', file_out = 'fixture4.json.gz'):
    """Load ``file_in``, strip the given fields and save the result to ``file_out``.

    ``list_of_model_fieldpath_pairs`` is an iterable of
    ``(model_name, field_path)`` pairs; each one is applied in turn with
    ``remove_field_from_model``.
    """
    records = json_to_dict(file_in)
    for model_name, field_path in list_of_model_fieldpath_pairs:
        records = remove_field_from_model(records, model_name, field_path)
    dict_to_json(records, file_out)


### Analyzing non empty values

In [97]:
def count_not_nulls_helper(path, obj, out_dict):
    """Accumulate into ``out_dict`` how often each path holds a non-empty value.

    A value is empty when it is falsy and not equal to 0 (``None``, '', [],
    {} ...). An empty value only makes sure ``path`` is present in
    ``out_dict`` (starting at 0). A non-empty value increments the counter
    and, when it is a dict, its items are visited under ``path/<key>``.
    """
    is_empty = not obj and obj != 0
    if is_empty:
        out_dict.setdefault(path, 0)
        return

    out_dict[path] = out_dict.get(path, 0) + 1

    if not isinstance(obj, dict):
        return
    for key, value in obj.items():
        count_not_nulls_helper(f'{path}/{key}', value, out_dict)


In [98]:
# counts how many non empty/non None values of certain field appeared in the database

def count_not_nulls(grouped_data):
    """Count non-empty values per field path for records grouped by model.

    ``grouped_data`` maps a model name to a list of records. Returns a dict
    mapping ``<model>/<field path>`` to the count produced by
    ``count_not_nulls_helper``.
    """
    counts = {}
    for model, records in grouped_data.items():
        for record in records:
            count_not_nulls_helper(model, record, counts)
    return counts

In [16]:
# groups records by model

def group_by_model(data):
    """Group records by their ``model`` value.

    Returns a dict mapping each model name to the list of its records, in
    input order. Keys are ordered by first appearance in ``data``, so reports
    derived from the result are identical between runs (iterating a ``set`` of
    strings is not). ``data`` is walked once, so any iterable is accepted.
    """
    grouped = {}
    for record in data:
        grouped.setdefault(record['model'], []).append(record)
    return grouped

In [112]:
# saves distribution of values to the txt file

def save_distribution_to_the_file(data, filename='distribution.txt', group_by_count = False):
    """Write the non-empty value counts of ``data`` to a text file.

    With ``group_by_count`` false, one ``<path>: <count>`` line per field path
    is written to ``filename``, with a blank line before each new model.

    With ``group_by_count`` true, paths are grouped under their count (counts
    ascending) and written to ``<stem>_grouped.txt``, where ``<stem>`` is
    ``filename`` without its final extension. Only the final extension is
    stripped, so dots elsewhere in the path (``./out/d.txt``) are preserved.
    """
    import os  # local import: this module is not imported at file level

    distribution = count_not_nulls(group_by_model(data))

    if not group_by_count:
        with open(filename, 'w') as f:
            previous_model = ''
            for path, count in distribution.items():
                model = path.split('/')[0]
                if previous_model != model:
                    f.write('\n')
                previous_model = model
                f.write(f'{path}: {count}\n')
        return

    paths_by_count = {}
    for path, count in distribution.items():
        paths_by_count.setdefault(count, []).append(path)

    stem = os.path.splitext(filename)[0]
    with open(f'{stem}_grouped.txt', 'w') as f:
        for count, paths in sorted(paths_by_count.items()):
            f.write(f'{count}:\n')
            for path in paths:
                f.write(f'\t{path}\n')

### Removing fields

In [101]:
# removes specified field from dictionary recursively

def remove_from_dict(data, field_path):
    """Remove the entry addressed by ``field_path`` from a nested dict.

    ``field_path`` is a '/'-separated list of keys. When the first component
    is the last one, a copy of ``data`` without that key is returned;
    otherwise the nested value is processed recursively and written back into
    ``data``, which is then returned.

    ``data`` is returned unchanged when it is not a dict (``None``, a list, a
    string, a number, ...) or when it does not contain the first component, so
    a record whose field has an unexpected type no longer raises.
    """
    # Only dicts can be descended into; `key in 5` raises TypeError and
    # `key in 'text'` is a substring test rather than a key lookup.
    if not isinstance(data, dict):
        return data

    head, separator, rest = field_path.partition('/')

    if head not in data:
        return data

    if not separator:
        return {key: value for key, value in data.items() if key != head}

    data[head] = remove_from_dict(data[head], rest)
    return data
  

In [102]:
# removes specified field of one of the models from dictionary

def remove_field_from_model(data, model_name, field_path):
    """Remove ``field_path`` from every record whose ``model`` is ``model_name``.

    Returns a new list with one entry per input record, in the same order.
    A record for which the removal fails is reported on stdout and kept
    unchanged, so a malformed record never disappears from the fixture.
    """
    output = []

    for record in data:
        updated = record
        try:
            if record['model'] == model_name:
                updated = remove_from_dict(record, field_path)
        except (KeyError, TypeError, AttributeError):
            # Best effort: report the record but keep it. A bare `except`
            # would also swallow KeyboardInterrupt/SystemExit.
            print(f'Problem with path {field_path}. i = {record}')
        output.append(updated)

    return output

In [103]:
# takes list of model-fieldpath pairs, removes them from the fixture and saves result to the new one

def remove_fields_from_models_in_fixture(list_of_model_fieldpath_pairs, file_in = 'fixture3.json.gz', file_out = 'fixture4.json.gz'):
    """Load ``file_in``, strip the given fields and save the result to ``file_out``.

    ``list_of_model_fieldpath_pairs`` is an iterable of
    ``(model_name, field_path)`` pairs; each one is applied in turn with
    ``remove_field_from_model``.
    """
    records = json_to_dict(file_in)
    for model_name, field_path in list_of_model_fieldpath_pairs:
        records = remove_field_from_model(records, model_name, field_path)
    dict_to_json(records, file_out)

### Saving structure of fixture

In [36]:
# compress all the records under model to one record with non empty fields (if possible) which will be representative

def compress_groups(data_by_model):
    """Merge all records of each model into one representative record.

    For every model, keys are collected across its records; a key keeps its
    current value once that value is truthy, otherwise it is overwritten by
    the next record that has the key. Returns ``{model: merged_record}``.
    """
    representatives = {}
    for model, records in data_by_model.items():
        merged = {}
        representatives[model] = merged
        for record in records:
            for key, value in record.items():
                # A missing key and a falsy stored value are both replaced.
                if not merged.get(key):
                    merged[key] = value
    return representatives

In [37]:
# convert values to the type name

def convert_values_to_types(data):
    """Return a copy of ``data`` with every leaf value replaced by a type name.

    * dict  -> new dict with the same keys and converted values
    * list  -> ``'[<type of first element>]'``, or ``[]`` when empty
    * str of length 32 -> ``'uuid'`` (heuristic based on the length only)
    * anything else -> the value's class name (``'int'``, ``'NoneType'`` ...)

    The input is never modified. Nested dicts are typically still shared with
    the original records, so converting in place would corrupt them.
    """
    if isinstance(data, dict):
        return {key: convert_values_to_types(value) for key, value in data.items()}

    if isinstance(data, list):
        if data:
            # Only the first element is inspected; lists are assumed homogeneous.
            return f'[{convert_values_to_types(data[0])}]'
        return []

    if isinstance(data, str) and len(data) == 32:
        return 'uuid'

    return data.__class__.__name__

In [38]:
# save dictionary in a YAML-like format to make it readable

def save_formatted(obj, file, padding = ''):
    """Write ``obj`` to ``file`` as an indented, YAML-like outline.

    Dict keys are written as ``<padding><key>:`` followed by their value one
    tab deeper; list items are written at the current depth; any other value
    is written as ``<padding><value>``. A blank line follows each top-level
    dict entry (i.e. when ``padding`` is empty).
    """
    if isinstance(obj, dict):
        deeper = padding + '\t'
        for key, value in obj.items():
            file.write(f'{padding}{key}:\n')
            save_formatted(value, file, deeper)
            if padding == '':
                file.write('\n')
        return

    if isinstance(obj, list):
        for element in obj:
            save_formatted(element, file, padding)
        return

    file.write(f'{padding}{obj}\n')

In [43]:
# save structure of fixture to the file

def save_structure(data, suffix='', keep_values=False):
    """Write one representative record per model to ``structure<suffix>.txt``.

    Records are grouped by model and merged with ``compress_groups``. Unless
    ``keep_values`` is true, values are replaced by type names through
    ``convert_values_to_types`` before the result is written with
    ``save_formatted``.
    """
    representatives = compress_groups(group_by_model(data))
    if not keep_values:
        representatives = {
            model: convert_values_to_types(record)
            for model, record in representatives.items()
        }

    with open(f'structure{suffix}.txt', 'w') as out_file:
        save_formatted(representatives, out_file)

## Main script

### Fixture1.5 - tidying up

In [45]:
# Load the raw fixture, keep only the records whose model label starts with
# the ``cedamoles_app`` app prefix, and save them as fixture1.5.
data = json_to_dict('fixture.json.gz')
data = [i for i in data if i['model'].split('.')[0] == 'cedamoles_app']
dict_to_json(data, 'fixture1.5.json.gz')

### Fixture2 - flat fixture

In [54]:
# After all the iterations below, the fixture is flattened: every model that is
# inlined is removed from the list, and the result is saved as fixture2.json.gz.
#
# The order of the calls matters:
#   * include_referenceable runs first because include_on_foreign_key only
#     touches records that already carry a 'uuid' field.
#   * a model must be completed before it is inlined elsewhere (constraints
#     before imagedetails; phenomenonname/phenomenonterm before phenomenon;
#     party before responsiblepartyinfo/review are attached).

data = json_to_dict('fixture1.5.json.gz')
data = include_referenceable(data)
# Models referenced by PK from a field: the PK is replaced by the model's fields.
data = include_simple_field(data, 'discoveryserviceid', [('observation', 'discoveryKeywords'), ('observationcollection', 'discoveryKeywords')])
data = include_simple_field(data, 'dqconformanceresult', [('observation', 'resultQuality')])
data = include_simple_field(data, 'constraints', [('imagedetails', 'imageConstraints'),('observation', 'permission')])
data = include_simple_field(data, 'imagedetails', [('acquisition', 'imageDetails'), 
                                                  ('computation', 'imageDetails'),
                                                  ('instrument', 'imageDetails'),
                                                  ('observationcollection', 'imageDetails'),
                                                  ('observation', 'imageDetails'),
                                                  ('platform', 'imageDetails'),
                                                  ('project', 'imageDetails'),
                                                  ])
data = include_simple_field(data, 'vocabularyterm', [('observation', 'vocabularyKeywords')])
data = include_simple_field(data, 'verticalextent', [('observation', 'verticalExtent')])
data = include_simple_field(data, 'timeperiod', [('mobileplatformoperation', 'operationTime'),
                                                 ('observation', 'timePeriod'),
                                                 ('observation', 'validTimePeriod')])
data = include_simple_field(data, 'party', [('responsiblepartyinfo', 'party'),
                                            ('review', 'commentator'),
                                            ('review', 'reviewer')])
data = include_simple_field(data, 'phenomenonname', [('phenomenon', 'names')])
data = include_simple_field(data, 'phenomenonterm', [('phenomenon', 'terms')])
data = include_simple_field(data, 'phenomenon', [('observation', 'phenomena')])
data = include_simple_field(data, 'geographicboundingbox', [('mobileplatformoperation', 'location'),
                                                            ('observation', 'geographicExtent'),
                                                            ('platform', 'location')])

# Models attached to the records that carry a UUID; the second argument, when
# given, is the name of the field that receives the attached data.
data = include_on_foreign_key(data, 'drsdataset', 'drsDataset')
data = include_on_foreign_key(data, 'identifier')
data = include_on_foreign_key(data, 'onlineresource', 'onlineResource')
data = include_on_foreign_key(data, 'migrationproperty', 'migrationProperty')
data = include_on_foreign_key(data, 'note')
data = include_on_foreign_key(data, 'responsiblepartyinfo', 'responsiblePartyInfo')
data = include_on_foreign_key(data, 'review')

data = include_simple_field(data, 'inputoutputdescription', [('computation', 'inputDescription'),
                                                            ('computation', 'outputDescription'),
                                                            ('acquisition', 'outputDescription')])
data = include_on_foreign_key(data, 'instrumentplatformpair', 'instrumentPlatformPair')
data = include_on_foreign_key(data, 'reviewnote', 'reviewNote')



# relatedobservationinfo is matched through its objectObservation field.
data = include_related_obs_info(data)

dict_to_json(data, 'fixture2.json.gz')




### Fixture3 - relations via UUID; PKs removed

In [61]:
# Relations between models are expressed with UUIDs instead of PKs; the PK
# wrapper is then removed and the fixture is saved as fixture3.json.gz.

data = json_to_dict('fixture2.json.gz')
# PKs must be swapped while records still have the {'pk', 'model', 'fields'}
# shape, because swap_pks_to_uuids works on each record's 'fields' entry.
data = swap_pks_to_uuids(data)
data = remove_pks(data)
dict_to_json(data, 'fixture3.json.gz')
# Writes structure_3.txt with one representative record per model.
save_structure(data, '_3')

## Fixture3.5 - modifying geo extent to match the ES format

In [8]:
def modify_geo_extent(geo_extent):
    """Convert a bounding box dict into an ``envelope`` shape dict.

    Returns ``None`` for ``None``. Otherwise returns
    ``{'type': 'envelope', 'coordinates': [[west, north], [east, south]]}``
    built from the ``*BoundLongitude`` / ``*BoundLatitude`` keys of
    ``geo_extent``; a missing key raises KeyError.
    """
    if geo_extent is None:
        return None

    upper_left = [geo_extent['westBoundLongitude'], geo_extent['northBoundLatitude']]
    lower_right = [geo_extent['eastBoundLongitude'], geo_extent['southBoundLatitude']]

    return {'type': 'envelope', 'coordinates': [upper_left, lower_right]}

In [10]:
# Rewrite the geographic extents of fixture3 as envelope shapes and save the
# result as fixture3.5.json.gz.
data = json_to_dict('fixture3.json.gz')

for x, i in enumerate(data):
        # Observations: convert 'geographicExtent' in place. Unlike the
        # 'location' branch below, the key is read without an existence check.
        if i['model'] == 'observation':
            i['geographicExtent'] = modify_geo_extent(i['geographicExtent'])
            data[x] = i
        
        # Mobile platform operations and platforms: the converted 'location'
        # is stored as 'geographicExtent' and 'location' itself is dropped.
        if i['model'] in ['mobileplatformoperation', 'platform']:
            if 'location' in i:
                i['geographicExtent'] = modify_geo_extent(i['location'])
                # Build a copy without 'location' and store it back in the list.
                i = {k: v for k, v in i.items() if k != 'location'}
                data[x] = i


dict_to_json(data, 'fixture3.5.json.gz')

### Fixture4 - nulls removed

In [64]:
# Fields that are never filled in anywhere in the fixture are removed from it;
# the result is saved as fixture4.json.gz.

# NOTE(review): this step reads fixture3.json.gz, not fixture3.5.json.gz -
# confirm that skipping the geo-extent conversion here is intended.
data = json_to_dict('fixture3.json.gz')
grouped_data = group_by_model(data)
distribution = count_not_nulls(grouped_data)
# Keep only the paths whose non-empty count is zero.
distribution = {k: v for k, v in distribution.items() if v == 0}

fields_to_be_removed = []
# The empty paths are also listed in empty_fields.txt, one per line.
with open('empty_fields.txt', 'w') as f:
    for i in distribution:
        f.write(f'{i}\n')
        # Each key is '<model>/<field path>': split on the first '/' only.
        sep = i.find('/')
        model = i[:sep]
        path = i[sep + 1:]
        fields_to_be_removed.append((model, path))

# Bare expression: displays the list when run as a notebook cell.
fields_to_be_removed
remove_fields_from_models_in_fixture(fields_to_be_removed, 'fixture3.json.gz', 'fixture4.json.gz')


In [66]:
# Produce the structure and distribution reports for fixture4.
data = json_to_dict('fixture4.json.gz')
save_structure(data, '_4')
save_distribution_to_the_file(data, 'distribution_4.txt')
# With group_by_count=True the function appends '_grouped' to the file stem
# itself, so this call writes 'distribution_4_grouped_grouped.txt'.
save_distribution_to_the_file(data, 'distribution_4_grouped.txt', True)

### Fixture5 - irrelevant fields removed after the discussion

In [42]:
# Any fields listed in fields_to_be_removed are removed from fixture4 and the
# new fixture is saved as fixture5.json.gz.

# Each entry is a (model name, '/'-separated field path) pair.
fields_to_be_removed = [
    ('result', 'review'),
]

remove_fields_from_models_in_fixture(fields_to_be_removed, 'fixture4.json.gz', 'fixture5.json.gz')

## Everything

### Analyzing sizes of fixtures

In [71]:
def count_elements(data):
    """Count the dict keys found anywhere inside a nested dict/list structure.

    Every key of every dict counts as one element; lists contribute only what
    their items contain, and any other value contributes nothing.
    """
    if isinstance(data, list):
        return sum(count_elements(item) for item in data)
    if isinstance(data, dict):
        return sum(1 + count_elements(value) for value in data.values())
    return 0

In [80]:
from pathlib import Path

def analyze_fixtures(filename):
    """Write a tab-separated size report for fixtures 1.5, 2, 3 and 4.

    For each ``fixture<suffix>.json.gz`` the report lists the compressed file
    size in MB (bytes / 1,000,000), the number of top-level records and the
    number of dict keys counted by ``count_elements``.
    """
    header = 'name\tsize[MB]\tnumber_of_models\tnumber_of_fields\n'
    with open(filename, 'w') as report:
        report.write(header)
        for suffix in ('1.5', '2', '3', '4'):
            fixture_name = f'fixture{suffix}.json.gz'
            records = json_to_dict(fixture_name)
            size_mb = Path(fixture_name).stat().st_size / 1000000
            record_count = len(records)
            key_count = count_elements(records)
            report.write(f'{fixture_name}\t{size_mb}\t{record_count}\t{key_count}\n')


### Analyzing deepness of records

In [126]:
def save_analysis_of_deepness(filein, fileout):
    """Group the field paths of a fixture by nesting depth and save them.

    The depth of a path is its number of '/' separators. The resulting
    ``{depth: [paths]}`` mapping is written to ``fileout`` with
    ``save_formatted``.
    """
    counts = count_not_nulls(group_by_model(json_to_dict(filein)))

    paths_by_depth = {}
    for path in counts:
        depth = path.count('/')
        paths_by_depth.setdefault(depth, []).append(path)

    with open(fileout, 'w') as out_file:
        save_formatted(paths_by_depth, out_file)


### Unzipping fixture

In [14]:
def unzip_fixture(filename):
    """Write a gzip-compressed JSON fixture as indented plain JSON.

    The output file name is ``filename`` without its last dot-separated
    component (``fixture3.json.gz`` -> ``fixture3.json``).
    """
    records = json_to_dict(filename)

    # Everything before the last '.'; empty when the name contains no dot.
    target_name = filename.rpartition('.')[0]

    with open(target_name, 'w') as out_file:
        json.dump(records, out_file, indent=4)

### Other

In [145]:
def is_datetime(value):
    """Return True when ``value`` is a string shaped like ``YYYY-MM-DDT...``.

    Only the separator positions are checked ('-' at index 4 and 7, 'T' at
    index 10); the digits themselves are not validated. Always returns a
    ``bool``, including for non-strings and for strings that are too short or
    do not match.
    """
    if not value or not isinstance(value, str):
        return False

    # The length guard replaces the IndexError handling for short strings.
    return (
        len(value) > 10
        and value[4] == '-'
        and value[7] == '-'
        and value[10] == 'T'
    )

In [147]:
def get_date_fields(data, path=''):
    """Collect the paths of all datetime-looking strings in a nested structure.

    Returns ``{path: value}`` for every string accepted by ``is_datetime``.
    Dict keys extend the path as ``<path>/<key>``; for a list only its first
    element is inspected, under the list's own path. Falsy input yields ``{}``.
    """
    if not data:
        return {}

    if isinstance(data, str) and is_datetime(data):
        return {path: data}

    if isinstance(data, list):
        # Only the first item is sampled.
        return dict(get_date_fields(data[0], path))

    found = {}
    if isinstance(data, dict):
        for key, value in data.items():
            found.update(get_date_fields(value, f'{path}/{key}'))
    return found
            