Skip to content

Commit

Permalink
First pass at extract
Browse files Browse the repository at this point in the history
  • Loading branch information
Brendan J. Herger committed Oct 20, 2017
1 parent 0032f5a commit fc64cf0
Show file tree
Hide file tree
Showing 11 changed files with 117 additions and 6 deletions.
6 changes: 5 additions & 1 deletion .gitignore
Expand Up @@ -5,4 +5,8 @@
.idea/*

# Compiled python files
*.pyc
*.pyc

# Configuration files
confs/*.yaml
confs/*.yml
80 changes: 80 additions & 0 deletions bin/lib.py
@@ -0,0 +1,80 @@
"""
coding=utf-8
"""
import logging

import os
import pandas
import yaml

# Module-level cache for the parsed configuration; populated lazily by load_confs()
# and returned on every subsequent call.
CONFS = None

# File extensions considered candidates for text extraction; used to filter
# discovered files before attempting to parse them (presumably the set of
# formats the extraction backend supports — confirm against the parser library)
AVAILABLE_EXTENSIONS = {'.csv', '.doc', '.docx', '.eml', '.epub', '.gif', '.htm', '.html', '.jpeg', '.jpg', '.json',
                        '.log', '.mp3', '.msg', '.odt', '.ogg', '.pdf', '.png', '.pptx', '.ps', '.psv', '.rtf', '.tff',
                        '.tif', '.tiff', '.tsv', '.txt', '.wav', '.xls', '.xlsx'}


def load_confs(confs_path='../confs/confs.yaml'):
    """
    Load the project configuration, lazily and at most once.

    Parses the YAML file at `confs_path` and caches the result in the
    module-level `CONFS`; subsequent calls return the cached mapping. If the
    file is missing, falls back to the sibling `.template` file
    (e.g. `confs.yaml.template`).

    :param confs_path: Path to the YAML configuration file
    :type confs_path: str
    :return: Parsed configuration mapping
    :rtype: dict
    """
    global CONFS

    if CONFS is None:
        try:
            # `with` guarantees the handle is closed; the original leaked it.
            with open(confs_path) as conf_file:
                # safe_load: plain yaml.load can instantiate arbitrary Python
                # objects from tagged input (a code-execution risk) and
                # requires an explicit Loader on modern PyYAML.
                CONFS = yaml.safe_load(conf_file)
        except IOError:
            confs_template_path = confs_path + '.template'
            # logging.warning: `warn` is a deprecated alias.
            logging.warning(
                'Confs path: {} does not exist. Attempting to load confs template, from path: {}'.format(confs_path,
                                                                                                         confs_template_path))
            with open(confs_template_path) as template_file:
                CONFS = yaml.safe_load(template_file)
    return CONFS


def get_conf(conf_name):
    """
    Look up a single configuration value by key, loading the confs if needed.

    :param conf_name: Key to read from the loaded configuration
    :return: The configuration value stored under `conf_name`
    """
    confs = load_confs()
    return confs[conf_name]

def archive_dataset_schemas(step_name, local_dict, global_dict):
    """
    Archive the schema for all available Pandas DataFrames
     - Determine which objects in namespace are Pandas DataFrames
     - Pull schema for all available Pandas DataFrames
     - Write schemas to file
    :param step_name: The name of the current operation (e.g. `extract`, `transform`, `model` or `load`)
    :param local_dict: A dictionary containing mappings from variable name to objects. This is usually generated by
    calling `locals`
    :type local_dict: dict
    :param global_dict: A dictionary containing mappings from variable name to objects. This is usually generated by
    calling `globals`
    :type global_dict: dict
    :return: None
    :rtype: None
    """
    logging.info('Archiving data set schema(s) for step name: {}'.format(step_name))

    # Reference variables
    data_schema_dir = get_conf('data_schema_dir')
    schema_output_path = os.path.join(data_schema_dir, step_name + '.csv')
    schema_agg = list()

    # Merge namespaces; globals applied second, so they win on name
    # collisions (same precedence as the original update order).
    env_variables = dict()
    env_variables.update(local_dict)
    env_variables.update(global_dict)

    # Filter down to Pandas DataFrames. Uses .items() and explicit two-name
    # unpacking — the original's `iteritems()` and tuple-parameter lambda
    # (`lambda (k, v): ...`) are Python 2-only and a syntax error on Python 3.
    # isinstance also accepts DataFrame subclasses, unlike a type equality check.
    data_sets = {name: value for name, value in env_variables.items()
                 if isinstance(value, pandas.DataFrame)}

    for data_set_name, data_set in data_sets.items():
        # Extract variable names
        logging.info('Working data_set: {}'.format(data_set_name))

        # One row per column: the column's dtype, tagged with its DataFrame's name
        local_schema_df = pandas.DataFrame(data_set.dtypes, columns=['type'])
        local_schema_df['data_set'] = data_set_name

        schema_agg.append(local_schema_df)

    # Guard: pandas.concat raises ValueError on an empty list; with no
    # DataFrames in scope there is simply nothing to archive.
    if not schema_agg:
        logging.warning('No DataFrames found for step name: {}. No schema archived.'.format(step_name))
        return

    # Aggregate schema list into one data frame
    agg_schema_df = pandas.concat(schema_agg)

    # Write to file
    agg_schema_df.to_csv(schema_output_path, index_label='variable')
31 changes: 26 additions & 5 deletions bin/main.py
Expand Up @@ -7,7 +7,12 @@
"""
import logging

import os

import pandas
import textract

import lib


def main():
Expand All @@ -33,16 +38,32 @@ def extract():
# TODO Docstring
logging.info('Begin extract')

# TODO Create list of candidate files
# Reference variables
candidate_file_agg = list()

# Create list of candidate files
for root, subdirs, files in os.walk(lib.get_conf('resume_directory')):

# TODO Subset candidate files to supported extensions
folder_files = map(lambda x: os.path.join(root, x), files)
candidate_file_agg.extend(folder_files)

# TODO Attempt to extract text from files
# Convert list to a pandas DataFrame
observations = pandas.DataFrame(data=candidate_file_agg, columns=['file_path'])
logging.info('Found {} candidate files'.format(len(observations.index)))

# TODO Archive schema and return
# Subset candidate files to supported extensions
observations['extension'] = observations['file_path'].apply(lambda x: os.path.splitext(x)[1])
observations = observations[observations['extension'].isin(lib.AVAILABLE_EXTENSIONS)]
logging.info('Subset candidate files to extensions w/ available parsers. {} files remain'.
format(len(observations.index)))

# Attempt to extract text from files
observations['text'] = observations['file_path'].apply(textract.process)

# Archive schema and return
lib.archive_dataset_schemas('extract', locals(), globals())
logging.info('End extract')
pass
return observations

def transform():
pass
Expand Down
2 changes: 2 additions & 0 deletions confs/confs.yaml.template
@@ -0,0 +1,2 @@
resume_directory: ../data/input/example_resumes
data_schema_dir: ../data/schema
Binary file not shown.
Binary file not shown.
Binary file added data/input/example_resumes/SGresume-1.pdf
Binary file not shown.
Empty file.
Binary file added data/input/example_resumes/john_smith.docx
Binary file not shown.
Binary file added data/input/example_resumes/resume_Meyer.pdf
Binary file not shown.
4 changes: 4 additions & 0 deletions data/schema/extract.csv
@@ -0,0 +1,4 @@
variable,type,data_set
file_path,object,observations
extension,object,observations
text,object,observations

0 comments on commit fc64cf0

Please sign in to comment.