In [118]:
from pprint import pprint
from frictionless import Schema, fields, describe, validate
import os
import pandas as pd

In [119]:
# Frictionless rejects absolute paths, so every location below is given
# relative to the notebook's working directory.
directory = "all_plant"
schema_file, schema_file_two = (
    "acre_plant2.schema.yaml",
    "acre_plant3.schema.yaml",
)

In [120]:
# Build one Schema object per descriptor file (paths come from the config cell).
schema_one, schema_two = (
    Schema.from_descriptor(descriptor_path)
    for descriptor_path in (schema_file, schema_file_two)
)


In [121]:
# Per-schema bookkeeping filled in by loop_through_files: how many files were
# matched to the schema ('count') and what they are called ('fnames').
schema_one_files = dict(count=0, fnames=[])
schema_two_files = dict(count=0, fnames=[])



In [122]:
def is_integer(value):
    """Report whether ``int(value)`` succeeds.

    Args:
        value: Any object; typically a string fragment taken from a file name.

    Returns:
        bool: True when ``int(value)`` returns without error, False otherwise.
    """
    try:
        int(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text; TypeError: unsupported type such as
        # None or a list; OverflowError: infinite floats.
        return False
    return True

In [123]:
def loop_through_files(directory, limit=None):
    """Validate the data files in a directory against their schema.

    Entries are visited in reverse-sorted name order. Hidden entries, names
    ending in ``.yaml`` or ``.ipynb``, and anything that is not a regular file
    are skipped. The third ``_``-separated part of the file name selects the
    schema: ``schema_one`` when it parses as an integer, ``schema_two``
    otherwise. Names with fewer than three parts are skipped with a message.

    Every selected file is recorded in the module-level ``schema_one_files``
    or ``schema_two_files`` dict (names are appended, so calling this function
    again records them again), then loaded with pandas and validated with
    ``frictionless.validate``. Processing stops at the first invalid report,
    which is pretty-printed.

    Args:
        directory (str): The path to the directory containing the files.
        limit (int, optional): Stop once this many files have validated
            successfully. Defaults to None (no limit).

    Returns:
        tuple: ``(schema_one_count, schema_two_count)``, the number of files
        matched to each schema up to the point where processing stopped.

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Error: '{directory}' is not a valid directory.")

    file_names = sorted(os.listdir(directory), reverse=True)
    print('Length of files in directory: ',len(file_names))

    file_count = 0  # files that validated successfully so far
    schema_one_count = schema_two_count = 0

    for filename in file_names:
        if limit is not None and file_count >= limit:
            break

        # Skip hidden files, schema descriptors and notebooks.
        if filename.startswith('.') or filename.endswith(('.yaml', '.ipynb')):
            continue

        # Only regular files are counted and validated.
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue

        name_parts = filename.split('_')
        if len(name_parts) < 3:
            print(f"Skipping '{filename}': expected at least three '_'-separated name parts")
            continue

        if is_integer(name_parts[2]):
            schema = schema_one
            schema_one_count += 1
            schema_one_files['count'] = schema_one_count
            schema_one_files['fnames'].append(filename)
        else:
            schema = schema_two
            schema_two_count += 1
            schema_two_files['count'] = schema_two_count
            schema_two_files['fnames'].append(filename)

        try:
            # 'mbcs' is a Windows-only codec; on other platforms the lookup
            # fails and is reported by the generic handler below.
            df = pd.read_csv(file_path, encoding='mbcs')

            # Check for file that does not have empty soil type field
            if (
                'Soil Type' in df.columns
                and len(df) > 0
                and not pd.isna(df['Soil Type'].iloc[0])
            ):
                print(f'{filename} does not have empty soil type field')

            file_cols = list(df.columns)
            schema_cols = schema.field_names

            if file_cols != schema_cols:
                print(f'File {filename} has a different column arrangement')

            # Same columns in a different order: reorder to match the schema.
            if set(file_cols) == set(schema_cols):
                df = df[schema_cols]

            report = validate(df, schema=schema, limit_errors=3)
            print("{}. {} = {}".format(file_count + 1, file_path, report.valid))
            if not report.valid:
                pprint(report)
                # Return the tallies rather than None so callers that unpack
                # the result keep working when validation stops early.
                return schema_one_count, schema_two_count
            file_count += 1
        except UnicodeDecodeError as e:
            print(f"Error decoding file '{filename}': {e}")
        except Exception as e:
            print(f"Error reading file '{filename}': {e}")

    return schema_one_count, schema_two_count

In [124]:
# Validate every file in the data directory (no limit) and keep the
# per-schema tallies for the summary written at the end.
schema_one_count, schema_two_count = loop_through_files(directory=directory)

Length of files in directory:  192


In [125]:
# Error directory
directory_path = 'error'

# Names of the regular files found directly inside the error directory
error_files = list(
    filter(
        lambda name: os.path.isfile(os.path.join(directory_path, name)),
        os.listdir(directory_path),
    )
)


In [126]:

# One single-key dict per group: [number of files, list of file names].
all_files = [
    {schema_file: [schema_one_count, schema_one_files['fnames']]},
    {schema_file_two: [schema_two_count, schema_two_files['fnames']]},
    {'error': [len(error_files), error_files]},
]

# Persist the summary as the list's Python repr.
with open('acre_plant_output.txt', 'w') as f:
    f.write(str(all_files))