In [1]:
%%writefile config.yaml
#== Tunable input/output parameters for the file-validation notebook
# Incoming file: the notebook builds the source path as "./<file_name>.<file_type>"
file_type: csv
dataset_name: cancerfile
file_name: cancerdata
inbound_delimiter: ","
# NOTE(review): skip_leading_rows is not read by any code visible in this notebook - confirm it is still needed
skip_leading_rows: 1


# Outbound file
outbound_delimiter: "|"
# Expected column names; util.col_header_val lower-cases and sorts them before comparing with the file header.
# NOTE(review): the validation output recorded in this notebook reports 'a10' as missing from the uploaded file - confirm the entry is intended
columns:
    - patient
    - a1
    - a2
    - a3
    - a4
    - a5
    - a6
    - a7
    - a8
    - a9
    - a10
    - diagnosis

Overwriting config.yaml


In [2]:
%%writefile utility.py
#== contains generic helper functions that are used repeatedly
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re

def read_config_file(file_path):
    '''
    Load a YAML configuration file and return its parsed contents.

    Parameters
    ----------
    file_path : str or path-like
        Location of the YAML file to read.

    Returns
    -------
    object or None
        Whatever ``yaml.safe_load`` produces for the document. ``None`` is
        returned when the document is not valid YAML; the parse error is
        logged together with the file path.

    Raises
    ------
    OSError
        If the file cannot be opened (not caught here).
    UnicodeDecodeError
        If the file is not valid UTF-8 (not caught here).
    '''
    # Explicit UTF-8: the notebook writes this file with %%writefile, so do not
    # depend on the platform's default text encoding when reading it back.
    with open(file_path, 'r', encoding='utf-8') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            logging.error("Could not parse YAML config %s: %s", file_path, exc)
            # Best-effort contract kept for existing callers: signal failure with None.
            return None

def replacer(string, char):
    '''
    Collapse every run of two or more consecutive ``char`` into a single one.

    Parameters
    ----------
    string : str
        Text to clean up (e.g. a column name).
    char : str
        The character whose repetitions are collapsed. It is treated
        literally, so regex metacharacters such as ``.``, ``+`` or ``\\`` are safe.

    Returns
    -------
    str
        ``string`` with each run of repeated ``char`` reduced to one occurrence.
        An empty ``char`` leaves ``string`` unchanged.
    '''
    if not char:
        # Nothing to collapse; also avoids building a pattern that repeats an empty match.
        return string
    # Escape so special characters are matched literally rather than interpreted
    # as regex syntax; the group makes the quantifier apply to the whole of ``char``.
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement keeps ``char`` from being parsed as a replacement
    # template (a bare backslash would otherwise be rejected).
    return re.sub(pattern, lambda _match: char, string)
 
def col_header_val(df,table_config):
    '''
    Standardise the column names of ``df`` in place and validate them
    against the column list declared in the configuration.

    Standardisation: names are converted to strings, lower-cased, every
    non-word character is replaced by ``_``, leading/trailing ``_`` are
    stripped and runs of ``_`` are collapsed to a single one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose header is validated. Its ``columns`` attribute is
        overwritten with the standardised names, so the caller sees the
        renamed header after the call.
    table_config : dict
        Parsed configuration; must contain a ``'columns'`` list with the
        expected column names (compared case-insensitively, order ignored).

    Returns
    -------
    int
        1 when the standardised header matches the expected columns,
        0 otherwise (the differences are printed and logged).
    '''
    # Raw-string patterns: '[^\w]' in a non-raw string is an invalid escape
    # sequence that newer Python versions warn about.
    df.columns = (
        df.columns.astype(str)
        .str.lower()
        .str.replace(r'[^\w]', '_', regex=True)
        .str.strip('_')
        .str.replace(r'_{2,}', '_', regex=True)
    )

    # Compare sorted name lists directly instead of reindexing the frame:
    # reindex copies all the data and raises when standardisation has
    # produced duplicate labels.
    actual_col = sorted(df.columns)
    expected_col = sorted(str(name).lower() for name in table_config['columns'])

    if actual_col == expected_col:
        print("column name and column length validation passed")
        return 1

    print("column name and column length validation failed")
    # Sorted so the report is deterministic (plain set order is not).
    mismatched_columns_file = sorted(set(actual_col).difference(expected_col))
    print("Following File columns are not in the YAML file",mismatched_columns_file)
    missing_YAML_file = sorted(set(expected_col).difference(actual_col))
    print("Following YAML columns are not in the file uploaded",missing_YAML_file)
    # Duplicates make the lists differ even when both set differences are empty.
    duplicated_col = sorted(set(df.columns[df.columns.duplicated()]))
    if duplicated_col:
        logging.warning(f'duplicate columns after standardisation: {duplicated_col}')
    logging.info(f'df columns: {actual_col}')
    logging.info(f'expected columns: {expected_col}')
    return 0

Overwriting utility.py


In [3]:
# Read config file
import importlib

import utility as util

# utility.py is rewritten by the %%writefile cell earlier in this notebook; reload so
# a copy already imported by this kernel is not silently reused with stale code.
util = importlib.reload(util)

config_data = util.read_config_file('config.yaml')
# read_config_file returns None when the YAML cannot be parsed; stop here with a
# clear message instead of failing on the first key lookup in a later cell.
if config_data is None:
    raise ValueError("config.yaml is empty or could not be parsed")

In [4]:
# Inspect parameters: the two settings used downstream first, then the full mapping
for setting_name in ('inbound_delimiter', 'columns'):
    print(config_data[setting_name])
print(config_data)

,
['patient', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'a10', 'diagnosis']
{'file_type': 'csv', 'dataset_name': 'cancerfile', 'file_name': 'cancerdata', 'inbound_delimiter': ',', 'skip_leading_rows': 1, 'outbound_delimiter': '|', 'columns': ['patient', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'a10', 'diagnosis']}


In [5]:
# Read the file using config file
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# pandas deprecation notices - consider narrowing it to the specific category needed.
warnings.filterwarnings("ignore")


file_type = config_data['file_type']
source_file = "./" + config_data['file_name'] + '.' + file_type
# The delimiter must be passed by keyword: everything after the path is
# keyword-only in pandas >= 2.0, so a positional delimiter raises TypeError there.
df = pd.read_csv(source_file, sep=config_data['inbound_delimiter'])
df.head()

Unnamed: 0,patient,a1,a2,a3,a4,a5,a6,a7,a8,a9,diagnosis
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2


In [6]:
# Validate the header of the file against the 'columns' list from config.yaml.
# The call returns 1 on a match and 0 otherwise (shown as the cell output);
# it also overwrites df.columns with the standardised column names.
util.col_header_val(df,config_data)

column name and column length validation failed
Following File columns are not in the YAML file []
Following YAML columns are not in the file uploaded ['a10']


0

In [7]:
# Inspect missing columns by printing the file header next to the configured one.
# df.columns has already been standardised by the col_header_val call in the previous cell.
print("columns of files are:" ,df.columns)
print("columns of YAML are:" ,config_data['columns'])

columns of files are: Index(['patient', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9',
       'diagnosis'],
      dtype='object')
columns of YAML are: ['patient', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'a10', 'diagnosis']


In [8]:
# Gate the rest of the pipeline on the header validation result (0 means failure)
header_is_valid = util.col_header_val(df, config_data) != 0
if header_is_valid:
    print("col validation passed")
    # write the code to perform further action
    # in the pipeline
    print("Proceeding to exploratory data analysis")
else:
    print("validation failed")
    # write code to reject the file

column name and column length validation failed
Following File columns are not in the YAML file []
Following YAML columns are not in the file uploaded ['a10']
validation failed
