Skip to content

Commit

Permalink
First pass at extract
Browse files Browse the repository at this point in the history
  • Loading branch information
Brendan J. Herger committed Oct 20, 2017
1 parent 0032f5a commit fc64cf0
Show file tree
Hide file tree
Showing 11 changed files with 117 additions and 6 deletions.
6 changes: 5 additions & 1 deletion .gitignore
Expand Up @@ -5,4 +5,8 @@
.idea/*

# Compiled python files
*.pyc
*.pyc

# Configuration files
confs/*.yaml
confs/*.yml
80 changes: 80 additions & 0 deletions bin/lib.py
@@ -0,0 +1,80 @@
"""
coding=utf-8
"""
import logging

import os
import pandas
import yaml

# Module-level cache for the parsed configuration; populated lazily by load_confs()
# and returned on every subsequent call.
CONFS = None

# File extensions considered candidates for text extraction; used to filter
# discovered files before attempting to parse them (presumably the set of
# formats the extraction backend supports — confirm against the parser library)
AVAILABLE_EXTENSIONS = {'.csv', '.doc', '.docx', '.eml', '.epub', '.gif', '.htm', '.html', '.jpeg', '.jpg', '.json',
                        '.log', '.mp3', '.msg', '.odt', '.ogg', '.pdf', '.png', '.pptx', '.ps', '.psv', '.rtf', '.tff',
                        '.tif', '.tiff', '.tsv', '.txt', '.wav', '.xls', '.xlsx'}


def load_confs(confs_path='../confs/confs.yaml'):
    """
    Load the project configuration, lazily and at most once.

    Parses the YAML file at `confs_path` and caches the result in the
    module-level `CONFS`; subsequent calls return the cached mapping. If the
    file is missing, falls back to the sibling `.template` file
    (e.g. `confs.yaml.template`).

    :param confs_path: Path to the YAML configuration file
    :type confs_path: str
    :return: Parsed configuration mapping
    :rtype: dict
    """
    global CONFS

    if CONFS is None:
        try:
            # `with` guarantees the handle is closed; the original leaked it.
            with open(confs_path) as conf_file:
                # safe_load: plain yaml.load can instantiate arbitrary Python
                # objects from tagged input (a code-execution risk) and
                # requires an explicit Loader on modern PyYAML.
                CONFS = yaml.safe_load(conf_file)
        except IOError:
            confs_template_path = confs_path + '.template'
            # logging.warning: `warn` is a deprecated alias.
            logging.warning(
                'Confs path: {} does not exist. Attempting to load confs template, from path: {}'.format(confs_path,
                                                                                                         confs_template_path))
            with open(confs_template_path) as template_file:
                CONFS = yaml.safe_load(template_file)
    return CONFS


def get_conf(conf_name):
    """
    Look up a single configuration value by key, loading the confs if needed.

    :param conf_name: Key to read from the loaded configuration
    :return: The configuration value stored under `conf_name`
    """
    confs = load_confs()
    return confs[conf_name]

def archive_dataset_schemas(step_name, local_dict, global_dict):
    """
    Archive the schema for all available Pandas DataFrames
     - Determine which objects in namespace are Pandas DataFrames
     - Pull schema for all available Pandas DataFrames
     - Write schemas to file
    :param step_name: The name of the current operation (e.g. `extract`, `transform`, `model` or `load`)
    :param local_dict: A dictionary containing mappings from variable name to objects. This is usually generated by
    calling `locals`
    :type local_dict: dict
    :param global_dict: A dictionary containing mappings from variable name to objects. This is usually generated by
    calling `globals`
    :type global_dict: dict
    :return: None
    :rtype: None
    """
    logging.info('Archiving data set schema(s) for step name: {}'.format(step_name))

    # Reference variables
    data_schema_dir = get_conf('data_schema_dir')
    schema_output_path = os.path.join(data_schema_dir, step_name + '.csv')
    schema_agg = list()

    # Merge namespaces; globals applied second, so they win on name
    # collisions (same precedence as the original update order).
    env_variables = dict()
    env_variables.update(local_dict)
    env_variables.update(global_dict)

    # Filter down to Pandas DataFrames. Uses .items() and explicit two-name
    # unpacking — the original's `iteritems()` and tuple-parameter lambda
    # (`lambda (k, v): ...`) are Python 2-only and a syntax error on Python 3.
    # isinstance also accepts DataFrame subclasses, unlike a type equality check.
    data_sets = {name: value for name, value in env_variables.items()
                 if isinstance(value, pandas.DataFrame)}

    for data_set_name, data_set in data_sets.items():
        # Extract variable names
        logging.info('Working data_set: {}'.format(data_set_name))

        # One row per column: the column's dtype, tagged with its DataFrame's name
        local_schema_df = pandas.DataFrame(data_set.dtypes, columns=['type'])
        local_schema_df['data_set'] = data_set_name

        schema_agg.append(local_schema_df)

    # Guard: pandas.concat raises ValueError on an empty list; with no
    # DataFrames in scope there is simply nothing to archive.
    if not schema_agg:
        logging.warning('No DataFrames found for step name: {}. No schema archived.'.format(step_name))
        return

    # Aggregate schema list into one data frame
    agg_schema_df = pandas.concat(schema_agg)

    # Write to file
    agg_schema_df.to_csv(schema_output_path, index_label='variable')
31 changes: 26 additions & 5 deletions bin/main.py
Expand Up @@ -7,7 +7,12 @@
"""
import logging

import os

import pandas
import textract

import lib


def main():
Expand All @@ -33,16 +38,32 @@ def extract():
# TODO Docstring
logging.info('Begin extract')

# TODO Create list of candidate files
# Reference variables
candidate_file_agg = list()

# Create list of candidate files
for root, subdirs, files in os.walk(lib.get_conf('resume_directory')):

# TODO Subset candidate files to supported extensions
folder_files = map(lambda x: os.path.join(root, x), files)
candidate_file_agg.extend(folder_files)

# TODO Attempt to extract text from files
# Convert list to a pandas DataFrame
observations = pandas.DataFrame(data=candidate_file_agg, columns=['file_path'])
logging.info('Found {} candidate files'.format(len(observations.index)))

# TODO Archive schema and return
# Subset candidate files to supported extensions
observations['extension'] = observations['file_path'].apply(lambda x: os.path.splitext(x)[1])
observations = observations[observations['extension'].isin(lib.AVAILABLE_EXTENSIONS)]
logging.info('Subset candidate files to extensions w/ available parsers. {} files remain'.
format(len(observations.index)))

# Attempt to extract text from files
observations['text'] = observations['file_path'].apply(textract.process)

# Archive schema and return
lib.archive_dataset_schemas('extract', locals(), globals())
logging.info('End extract')
pass
return observations

def transform():
pass
Expand Down
2 changes: 2 additions & 0 deletions confs/confs.yaml.template
@@ -0,0 +1,2 @@
resume_directory: ../data/input/example_resumes
data_schema_dir: ../data/schema
Binary file not shown.
Binary file not shown.
Binary file added data/input/example_resumes/SGresume-1.pdf
Binary file not shown.
Empty file.
Binary file added data/input/example_resumes/john_smith.docx
Binary file not shown.
Binary file added data/input/example_resumes/resume_Meyer.pdf
Binary file not shown.
4 changes: 4 additions & 0 deletions data/schema/extract.csv
@@ -0,0 +1,4 @@
variable,type,data_set
file_path,object,observations
extension,object,observations
text,object,observations

0 comments on commit fc64cf0

Please sign in to comment.