In [27]:
from pprint import pprint
from frictionless import Schema, fields, describe, validate, Resource
import os
import pandas as pd
from ruamel.yaml import YAML
import re

In [28]:
schema_list=[]
output_dict={}
directory = "acre_plant/all_plant" # Change based on operation

In [29]:
# Function to add missing value section to the generated schema file
# It can be used to modify field names. For example, pandas read duplicate
# field names as Field.1 but frictionless.describe reads  duplicate field
#  as Field2

def append_missing(yaml_file_path):
    yaml = YAML()
    missing_values = ['', 'n/a', 'nan', '<NA>', 'N/A']
    
    # Read the existing YAML file
    with open(yaml_file_path, 'r') as file:
        data = yaml.load(file)

    # Automatically detect duplicate fields
    for field in data.get('fields', []):
        if re.match(r'.*\d$', field['name']):
            # Change the field name to the new format
            base_name = re.sub(r'\d$', '', field['name'])
            field['name'] = f"{base_name}.1"

    # Append the missing values to the 'missingValues' key
    if 'missingValues' in data:
        data['missingValues'].extend(missing_values)
    else:
        data['missingValues'] = missing_values

    # Write the updated data back to the YAML file
    with open(yaml_file_path, 'w') as file:
        yaml.dump(data, file)



In [30]:
def generate_schema(file_path, schema_count):
    schema_name=directory+'.'+str(schema_count)+'.schema'+'.yaml'
    schema = Schema.describe(file_path)
    schema.to_yaml(schema_name) 
    append_missing(schema_name) # Add missing values
    schema_list.append(schema_name) # Add new schema to list of shemas

In [31]:
# Validate file with all schema. 
def loop_schema(df, file_count, file_path, schema_found):
    for schema_file in schema_list:
        schema = Schema.from_descriptor(schema_file)
        try:
            df=df[schema.field_names]
            report = validate(df, schema=schema,limit_errors=2)
            if (report.valid == True):
                
                print("{}. {} = {}".format(file_count+1,file_path,report.valid))

                schema_found=True

                if schema_file not in output_dict:
                    output_dict[schema_file] = [file_path]
                else:
                    output_dict[schema_file].append(file_path)

                break
        except:
            pass
    return schema_found

In [32]:

def loop_through_files(directory, limit=None):

    if not os.path.isdir(directory):
        raise ValueError(f"Error: '{directory}' is not a valid directory.")

    
    file_count = schema_count = 0
    
    file_names = os.listdir(directory)
    if(file_names):
        file_names.sort(reverse=True)

    print('Length of files in directory: ',len(file_names))

    for count, filename in enumerate(file_names):
        if filename.startswith('.'):
            continue  # Skip hidden files
        if filename.endswith('.yaml'):
            continue  # Skip schema
        if filename.endswith('.ipynb'):
            continue  # Skip ipynb files
        schema_found=False

        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path):  
            
            # Generate schema using the first file 
            if count==0:
                generate_schema(file_path, schema_count)
                schema_count+=1

            # Open csv file as dataframe
            df = pd.read_csv(file_path, encoding='mbcs') 

            # loop through all schema and validate file
            schema_found=loop_schema(df, file_count, file_path, schema_found)

            # Create new schema if file fails validation
            if schema_found==False:
                generate_schema(file_path, schema_count)
                schema_count+=1
                # Validate
                loop_schema(df, file_count, file_path, schema_found)

loop_through_files(directory)

Length of files in directory:  201
1. acre_plant/all_plant\Purdue_ACRE_test_2018_NH3_1.csv = True
1. acre_plant/all_plant\Purdue_ACRE_test_2018_Beck5337sx_1.csv = True
1. acre_plant/all_plant\Purdue_ACRE_test_2015_GL5939VT3_1.csv = True
1. acre_plant/all_plant\Purdue_ACRE_WQFS_2018_dkc 63 60_1.csv = True
1. acre_plant/all_plant\Purdue_ACRE_WQFS_2015_B5131AMXT_1.csv = True
1. acre_plant/all_plant\Purdue_ACRE_Trailer ISO_2018_NH3_1.csv = True
1. acre_plant/all_plant\Purdue_ACRE_Trailer ISO_2018_Dkc 53-56_1.csv = True
1. acre_plant/all_plant\Purdue_ACRE_92-94_2016__1.csv = True
1. acre_plant/all_plant\Purdue_ACRE_200_2018_cz3548LL_1.csv = True
1. acre_plant/all_plant\Purdue_ACRE_200_2018_becks 6368_1.csv = True
1. acre_plant/all_plant\Purdue_ACRE_200_2018_b366LL_1.csv = True
1. acre_plant/all_plant\Purdue_ACRE_200_2018_NH3_1.csv = True
1. acre_plant/all_plant\Purdue_ACRE_200_2018_DKC 63-33_1.csv = True
1. acre_plant/all_plant\Purdue_ACRE_200_2018_BECK6368_1.csv = True
1. acre_plant/all_pl

In [33]:
# print(output_dict)
for key, value in output_dict.items():
    print(f"{key}: {len(value)}")

output_fn=directory+'_output.txt'
with open(output_fn, 'w') as f:
    f.write(str(output_dict))

acre_plant/all_plant.0.schema.yaml: 13
acre_plant/all_plant.1.schema.yaml: 188
