-
Notifications
You must be signed in to change notification settings - Fork 865
/
deserialize.py
157 lines (125 loc) · 5.82 KB
/
deserialize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import json
import os
import pandas as pd
from featuretools.entityset.relationship import Relationship
from featuretools.entityset.serialize import FORMATS
from featuretools.variable_types.variable import find_variable_types
def description_to_variable(description, entity=None):
    '''Deserialize variable from variable description.

    Args:
        description (dict) : Description of :class:`.Variable`.
        entity (Entity) : Instance of :class:`.Entity` to add :class:`.Variable`. If entity is None, :class:`.Variable` will not be instantiated.

    Returns:
        variable (Variable) : Returns :class:`.Variable`.
    '''
    variable_types = find_variable_types()
    is_type_string = isinstance(description['type'], str)
    # A serialized type is either a plain string, or a dict whose 'value' entry
    # names the type and whose remaining entries are constructor kwargs.
    # NOTE: pop() deliberately removes 'value' so the leftover dict can be
    # passed straight through as **kwargs below.
    # (Renamed from `type` to avoid shadowing the builtin.)
    type_name = description['type'] if is_type_string else description['type'].pop('value')
    # 'None' will return the Unknown variable type for unrecognized names.
    variable = variable_types.get(type_name, variable_types.get('None'))
    if entity is not None:
        kwargs = {} if is_type_string else description['type']
        variable = variable(description['id'], entity, **kwargs)
        variable.interesting_values = description['properties']['interesting_values']
    return variable
def description_to_entity(description, entityset, path=None):
    '''Deserialize entity from entity description and add to entityset.

    Args:
        description (dict) : Description of :class:`.Entity`.
        entityset (EntitySet) : Instance of :class:`.EntitySet` to add :class:`.Entity`.
        path (str) : Root directory to serialized entityset.
    '''
    # Without a path there is no data on disk, so build an empty frame that
    # still carries the entity's columns and dtypes.
    if path:
        dataframe = read_entity_data(description, path=path)
    else:
        dataframe = empty_dataframe(description)
    variable_types = {}
    for variable_description in description['variables']:
        variable_types[variable_description['id']] = description_to_variable(variable_description)
    properties = description['properties']
    entityset.entity_from_dataframe(
        description['id'],
        dataframe,
        index=description.get('index'),
        time_index=description.get('time_index'),
        secondary_time_index=properties.get('secondary_time_index'),
        variable_types=variable_types)
def description_to_entityset(description, **kwargs):
    '''Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`
        kwargs (keywords): Additional keyword arguments to pass as keywords arguments to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    # Local import avoids a circular dependency between this module and entityset.
    from featuretools.entityset import EntitySet
    # If data description was not read from disk, path is None.
    path = description.get('path')
    entityset = EntitySet(description['id'])
    last_time_index = []
    for entity in description['entities'].values():
        entity['loading_info']['params'].update(kwargs)
        # If path is None, an empty dataframe will be created for entity.
        description_to_entity(entity, entityset, path=path)
        if entity['properties']['last_time_index']:
            last_time_index.append(entity['id'])
    for relationship in description['relationships']:
        relationship = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship)
    # Idiomatic truthiness test (was `if len(last_time_index):`).
    if last_time_index:
        entityset.add_last_time_indexes(updated_entities=last_time_index)
    return entityset
def empty_dataframe(description):
    '''Deserialize empty dataframe from entity description.

    Args:
        description (dict) : Description of :class:`.Entity`.

    Returns:
        df (DataFrame) : Empty dataframe for entity.
    '''
    # Column order follows the serialized variable order; dtypes are applied so
    # downstream code sees the same schema as the original data.
    column_names = []
    for variable in description['variables']:
        column_names.append(variable['id'])
    dtypes = description['loading_info']['properties']['dtypes']
    df = pd.DataFrame(columns=column_names)
    return df.astype(dtypes)
def read_entity_data(description, path):
    '''Read description data from disk.

    Args:
        description (dict) : Description of :class:`.Entity`.
        path (str): Location on disk to read entity data.

    Returns:
        df (DataFrame) : Instance of dataframe.
    '''
    loading_info = description['loading_info']
    file_path = os.path.join(path, loading_info['location'])
    kwargs = loading_info.get('params', {})
    data_format = loading_info['type']
    if data_format == 'csv':
        dataframe = pd.read_csv(
            file_path,
            engine=kwargs['engine'],
            compression=kwargs['compression'],
            encoding=kwargs['encoding'],
        )
    elif data_format == 'parquet':
        dataframe = pd.read_parquet(file_path, engine=kwargs['engine'])
    elif data_format == 'pickle':
        dataframe = pd.read_pickle(file_path, **kwargs)
    else:
        error = 'must be one of the following formats: {}'
        raise ValueError(error.format(', '.join(FORMATS)))
    # Restore the original dtypes that were recorded at serialization time.
    dtypes = loading_info['properties']['dtypes']
    return dataframe.astype(dtypes)
def read_data_description(path):
    '''Read data description from disk.

    Args:
        path (str): Location on disk to read `data_description.json`.

    Returns:
        description (dict) : Description of :class:`.EntitySet`.

    Raises:
        AssertionError: If `path` does not exist.
    '''
    path = os.path.abspath(path)
    # Raise explicitly rather than via `assert`, which is silently stripped
    # when Python runs with optimizations (-O). AssertionError is kept so
    # existing callers see the same exception type.
    if not os.path.exists(path):
        raise AssertionError('"{}" does not exist'.format(path))
    filepath = os.path.join(path, 'data_description.json')
    # Use a distinct name for the handle instead of rebinding `file` (the
    # original reused one variable for both the path and the open file).
    with open(filepath, 'r') as description_file:
        description = json.load(description_file)
    # Record where the description was read from so entity data can be located later.
    description['path'] = path
    return description
def read_entityset(path, **kwargs):
    '''Read entityset from disk.

    Args:
        path (str): Directory on disk to read `data_description.json`.
        kwargs (keywords): Additional keyword arguments to pass as keyword arguments to the underlying deserialization method.
    '''
    # Load the serialized description, then rebuild the EntitySet from it.
    description = read_data_description(path)
    return description_to_entityset(description, **kwargs)