In [142]:
from pprint import pprint
from frictionless import Schema, fields, describe, validate, Resource
import os
import pandas as pd
from ruamel.yaml import YAML
import re
import shutil

In [143]:

# Run configuration. The sub-directory names below are joined onto
# `directory` by the functions defined further down in this notebook.
output_dict = {}
operation = "plant"  # Change based on operation
directory = "acre_plant/all_plant"  # Change based on operation
valid_schema = "valid_schema"      # folder name used for reviewed schemas
invalid_schema = "invalid_schema"  # folder name used for newly generated schemas
valid_files = "valid_files"        # folder name validated files are moved into

In [144]:
# Add a missingValues section to a generated schema file.
# It also rewrites field names so they match pandas: for a duplicated
# header pandas names the repeated column Field.1, whereas
# frictionless.describe names it Field2.

def append_missing(yaml_file_path):
    """Post-process a generated schema YAML file in place.

    Two things are done to the file at ``yaml_file_path``:

    1. Field names that look like de-duplicated headers are converted to the
       pandas naming scheme. A name ``<base><N>`` (N >= 2) is rewritten to
       ``<base>.<N-1>`` only when a field called ``<base>`` also exists in the
       schema, so ``Field2`` -> ``Field.1`` and ``Field3`` -> ``Field.2``
       while an unrelated column such as ``Zone1`` is left untouched.
       NOTE(review): a file that genuinely has both ``Field`` and ``Field2``
       headers cannot be told apart from a duplicate using the schema alone.
    2. The standard missing-value markers are added to ``missingValues``
       (values already listed are not added a second time).

    Args:
        yaml_file_path: Path of the schema YAML file to rewrite.
    """
    yaml = YAML()
    missing_values = ['', 'n/a', 'nan', '<NA>', 'N/A']

    # Read the existing YAML file
    with open(yaml_file_path, 'r') as file:
        data = yaml.load(file)

    # Snapshot the names before renaming so the "base field exists" check is
    # not affected by renames performed earlier in the loop.
    fields = data.get('fields', [])
    original_names = {field['name'] for field in fields}

    for field in fields:
        match = re.fullmatch(r'(.*?)(\d+)', field['name'])
        if match is None:
            continue
        base_name, digits = match.group(1), match.group(2)
        suffix = int(digits)
        # Require a canonical suffix (no leading zeros) of at least 2 and an
        # existing base field before treating the name as a duplicate.
        if suffix >= 2 and str(suffix) == digits and base_name in original_names:
            field['name'] = f"{base_name}.{suffix - 1}"

    # Append the missing values to the 'missingValues' key without duplicating
    # entries that are already present (keeps re-runs idempotent).
    existing = data.get('missingValues')
    if existing is None:
        data['missingValues'] = missing_values
    else:
        additions = [value for value in missing_values if value not in existing]
        existing.extend(additions)

    # Write the updated data back to the YAML file
    with open(yaml_file_path, 'w') as file:
        yaml.dump(data, file)



In [145]:
def generate_schema(invalid_schema_dir, file_path, schema_count):
    """Infer a schema from ``file_path`` and store it for manual review.

    The schema is written to
    ``<invalid_schema_dir>/<operation>.<schema_count>.schema.yaml`` (where
    ``operation`` is the notebook-level setting) and then passed through
    ``append_missing``.

    Args:
        invalid_schema_dir: Directory the new schema file is written to.
        file_path: Data file handed to ``Schema.describe``.
        schema_count: Number embedded in the schema file name.
    """
    file_name = '.'.join((operation, str(schema_count), 'schema', 'yaml'))
    target_path = '/'.join((invalid_schema_dir, file_name))

    inferred = Schema.describe(file_path)
    inferred.to_yaml(target_path)

    # Add missing values
    append_missing(target_path)

    
    # schema_list.append(schema_name) # Add new schema to list of shemas

In [146]:
# Validate file with all schema. 
def loop_schema(df, file_count, file_path, schema_found, schema_list):
    """Try each schema in ``schema_list`` against ``df`` until one validates.

    For every schema file the frame is restricted to the schema's field names
    and validated with frictionless. The first schema whose report is valid
    ends the search.

    Args:
        df: DataFrame holding the contents of the file being checked.
        file_count: Zero-based position of the file; printed as
            ``file_count + 1`` when a schema matches.
        file_path: Path of the file, used only in the printed message.
        schema_found: Initial value of the result flag; returned unchanged
            when no schema matches.
        schema_list: Iterable of schema descriptor paths to try in order.

    Returns:
        ``True`` if some schema validated the data, otherwise the
        ``schema_found`` value that was passed in.
    """
    for schema_file in schema_list:
        schema = Schema.from_descriptor(schema_file)

        # Select into a new name: rebinding `df` here would leave only this
        # schema's columns available to every later schema in the list.
        try:
            candidate = df[schema.field_names]
        except KeyError:
            # The file lacks at least one column this schema requires.
            continue

        try:
            report = validate(candidate, schema=schema, limit_errors=2)
        except Exception as exc:
            # Best effort: a schema that cannot be evaluated is skipped, but
            # the reason is reported instead of being silently discarded.
            print("Skipping schema {}: {}".format(schema_file, exc))
            continue

        if report.valid:
            print("{}. {} = {}".format(file_count+1,file_path,report.valid))
            schema_found = True
            break
    return schema_found

In [147]:

def loop_through_files(directory, limit=None):
    """Validate the data files in ``directory`` against the reviewed schemas.

    Entries are visited in reverse-sorted name order. Hidden entries,
    ``.yaml`` / ``.ipynb`` files and sub-directories are skipped. For each
    remaining file:

    * if the invalid-schema folder did not exist yet, or both schema folders
      are empty, a first schema (number 0) is generated from the file and
      the run stops;
    * if the invalid-schema folder contains anything, the user is asked to
      review it and the run stops;
    * otherwise the file is validated with ``loop_schema`` against every
      ``.yaml`` file in the valid-schema folder. A matching file is moved to
      the valid-files folder; a non-matching file gets a new schema generated
      in the invalid-schema folder and the run stops.

    Args:
        directory: Folder containing the data files. The schema and
            valid-file folders are created inside it.
        limit: Accepted for backward compatibility; currently not used.

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Error: '{directory}' is not a valid directory.")

    file_names = sorted(os.listdir(directory), reverse=True)
    print('Length of files in directory: ',len(file_names))

    validated_count = 0   # files validated and moved so far in this run
    files_seen = 0        # regular data files encountered
    valid_schema_list = []

    for filename in file_names:
        # Skip hidden files, schema files and notebooks
        if filename.startswith('.') or filename.endswith(('.yaml', '.ipynb')):
            continue

        file_path = os.path.join(directory, filename)

        # The schema / valid-file folders live inside `directory`; skip them
        # (and any other non-file entry) instead of aborting the whole run.
        if not os.path.isfile(file_path):
            continue
        files_seen += 1

        # Paths of the valid and invalid schema folders
        valid_schema_directory = directory+'/'+valid_schema
        invalid_schema_directory = directory+'/'+invalid_schema

        # NOTE(review): the 'mbcs' codec is only available on Windows.
        df = pd.read_csv(file_path, encoding='mbcs')

        # Remember whether this is a first run before creating the folders;
        # both folders are always created so the listings below cannot fail.
        invalid_dir_was_missing = not os.path.exists(invalid_schema_directory)
        os.makedirs(invalid_schema_directory, exist_ok=True)
        os.makedirs(valid_schema_directory, exist_ok=True)

        if invalid_dir_was_missing:
            if validated_count == 0:
                generate_schema(invalid_schema_directory, file_path, 0)
                print('New schema generated')
            break

        # A schema is waiting for review: ask the user to verify it first
        if os.listdir(invalid_schema_directory):
            print('Verify schema in invalid directory before proceeding')
            break

        # Both schema folders are empty: create the very first schema
        if not os.listdir(valid_schema_directory):
            generate_schema(invalid_schema_directory, file_path, 0)
            print('New schema generated')
            break

        # Collect the reviewed schemas once, when the first file is parsed
        if validated_count == 0:
            valid_schema_list = [
                valid_schema_directory + '/' + item
                for item in sorted(os.listdir(valid_schema_directory))
                if item.endswith('.yaml')
                and os.path.isfile(os.path.join(valid_schema_directory, item))
            ]

        # Pass the running count so the printed file number advances
        schema_found = loop_schema(
            df, validated_count, file_path, False, valid_schema_list
        )

        if not schema_found:
            # No reviewed schema fits: propose a new one and stop for review
            new_schema_no = len(valid_schema_list)
            generate_schema(invalid_schema_directory, file_path, new_schema_no)
            print('New schema generated')
            break

        valid_files_dir = directory+'/'+valid_files
        os.makedirs(valid_files_dir, exist_ok=True)

        # Move the validated file out of the working directory
        destination_file_path = os.path.join(valid_files_dir, filename)
        if os.path.exists(file_path):
            shutil.move(file_path, destination_file_path)
        else:
            print(f"File '{filename}' does not exist in '{directory}'.")

        validated_count += 1

    if files_seen == 0:
        print('No file to validate!')
            
# Run one validation pass over the configured `directory`.
loop_through_files(directory)

Length of files in directory:  3
No file to validate!


In [148]:
# print(output_dict)
# for key, value in output_dict.items():
#     print(f"{key}: {len(value)}")

# output_fn=directory+'_output.txt'
# with open(output_fn, 'w') as f:
#     f.write(str(output_dict))