Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Brendan J. Herger
committed
Oct 20, 2017
1 parent
0032f5a
commit fc64cf0
Showing
11 changed files
with
117 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,4 +5,8 @@ | |
.idea/* | ||
|
||
# Compiled python files | ||
*.pyc | ||
*.pyc | ||
|
||
# Configuration files | ||
confs/*.yaml | ||
confs/*.yml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
""" | ||
coding=utf-8 | ||
""" | ||
import logging | ||
|
||
import os | ||
import pandas | ||
import yaml | ||
|
||
# Module-level cache of the parsed configuration; load_confs fills it on first use and reuses it afterwards
CONFS = None

# Set of lower-case file extensions (each with its leading dot).
# Presumably the file types the pipeline is able to read; it is not referenced in the visible code, so confirm
# against callers.
# NOTE(review): '.tff' looks like it may be a typo (e.g. for '.tiff', which is already listed) - confirm before
# relying on it.
AVAILABLE_EXTENSIONS = {'.csv', '.doc', '.docx', '.eml', '.epub', '.gif', '.htm', '.html', '.jpeg', '.jpg', '.json',
                        '.log', '.mp3', '.msg', '.odt', '.ogg', '.pdf', '.png', '.pptx', '.ps', '.psv', '.rtf', '.tff',
                        '.tif', '.tiff', '.tsv', '.txt', '.wav', '.xls', '.xlsx'}
|
||
|
||
def load_confs(confs_path='../confs/confs.yaml'):
    """
    Load the project configuration, caching the result in the module-level `CONFS`

     - The YAML file is only read on the first call. Later calls return the cached object, and `confs_path` is
     ignored for them
     - If `confs_path` cannot be opened, `confs_path + '.template'` is loaded instead, and a warning is logged
    :param confs_path: Path to the YAML configuration file
    :type confs_path: str
    :return: The parsed configuration (whatever the YAML document deserializes to)
    :raises IOError: If neither the configuration file nor its template can be opened
    """
    global CONFS

    if CONFS is None:
        try:
            # safe_load instead of load: plain `yaml.load(stream)` can construct arbitrary Python objects, and is
            # rejected outright (missing Loader argument) by PyYAML 6+. Configuration files only need plain
            # scalars / lists / mappings, which safe_load handles identically.
            # The context manager makes sure the file handle is closed, even if parsing fails.
            with open(confs_path) as confs_file:
                CONFS = yaml.safe_load(confs_file)
        except IOError:
            confs_template_path = confs_path + '.template'
            logging.warning(
                'Confs path: {} does not exist. Attempting to load confs template, from path: {}'.format(
                    confs_path, confs_template_path))
            with open(confs_template_path) as confs_template_file:
                CONFS = yaml.safe_load(confs_template_file)
    return CONFS
|
||
|
||
def get_conf(conf_name):
    """
    Look up a single configuration value by name
    :param conf_name: Key to look up in the configuration returned by `load_confs`
    :return: The value stored under `conf_name`
    """
    confs = load_confs()
    return confs[conf_name]
|
||
def archive_dataset_schemas(step_name, local_dict, global_dict):
    """
    Archive the schema for all available Pandas DataFrames

     - Determine which objects in namespace are Pandas DataFrames
     - Pull schema for all available Pandas DataFrames
     - Write schemas to file (`<data_schema_dir>/<step_name>.csv`, with columns `variable`, `type`, `data_set`)

    If no DataFrames are found, a warning is logged and a header-only schema file is written.
    :param step_name: The name of the current operation (e.g. `extract`, `transform`, `model` or `load`)
    :type step_name: str
    :param local_dict: A dictionary containing mappings from variable name to objects. This is usually generated by
    calling `locals`
    :type local_dict: dict
    :param global_dict: A dictionary containing mappings from variable name to objects. This is usually generated by
    calling `globals`
    :type global_dict: dict
    :return: None
    :rtype: None
    """
    logging.info('Archiving data set schema(s) for step name: {}'.format(step_name))

    # Reference variables
    data_schema_dir = get_conf('data_schema_dir')
    schema_output_path = os.path.join(data_schema_dir, step_name + '.csv')
    schema_agg = list()

    # Merge both namespaces into one lookup.
    # NOTE(review): global_dict is applied last, so on a name clash the global object wins over the local one.
    # This is the opposite of Python's own name resolution - confirm this is intended.
    env_variables = dict()
    env_variables.update(local_dict)
    env_variables.update(global_dict)

    # Filter down to Pandas DataFrames. `.items()` and a comprehension work on both Python 2 and 3 (tuple-unpacking
    # lambdas and `.iteritems()` are Python 2 only); isinstance also picks up DataFrame subclasses.
    data_sets = {name: value for name, value in env_variables.items() if isinstance(value, pandas.DataFrame)}

    for data_set_name, data_set in data_sets.items():
        # Extract variable names
        logging.info('Working data_set: {}'.format(data_set_name))

        # One row per column of the data set, indexed by column name
        local_schema_df = pandas.DataFrame(data_set.dtypes, columns=['type'])
        local_schema_df['data_set'] = data_set_name

        schema_agg.append(local_schema_df)

    # Aggregate schema list into one data frame
    if schema_agg:
        agg_schema_df = pandas.concat(schema_agg)
    else:
        # pandas.concat raises ValueError on an empty list; write a header-only schema instead of crashing
        logging.warning('No Pandas DataFrames found for step name: {}'.format(step_name))
        agg_schema_df = pandas.DataFrame(columns=['type', 'data_set'])

    # Write to file
    agg_schema_df.to_csv(schema_output_path, index_label='variable')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
resume_directory: ../data/input/example_resumes | ||
data_schema_dir: ../data/schema |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
variable,type,data_set | ||
file_path,object,observations | ||
extension,object,observations | ||
text,object,observations |