# Duplicate header transformation challenge

In [None]:
from pprint import pprint
from frictionless import Package, Resource, transform, steps, describe, Schema, validate, Dialect, table
from tabulate import tabulate
import csv
import numpy as np
import pandas as pd

# https://framework.frictionlessdata.io/docs/guides/validating-data.html#validation-report


In [None]:
# Path of the harvest CSV; kept under the name `df` because later cells
# open the file through this variable.
df = 'Purdue_ACRE_DTC_2021_1_1.csv'

# Describe the CSV to obtain a schema, then save that schema as YAML.
schema = Schema.describe(df)
schema.to_yaml("harvest.schema.yaml")

### Using pandas to open file
In pandas, duplicate fields are read as `Field` and `Field.1`. I also noticed that a pandas DataFrame sometimes reads integer fields as numpy integers. I was running into errors with this previously, but I think Frictionless validation does not have a problem with it (it reads the values as integer type).

In [None]:
# Load the CSV into a DataFrame.
# NOTE(review): the 'mbcs' codec is only registered on Windows builds of
# Python, so this cell will not run elsewhere as written.
file = pd.read_csv(df, encoding='mbcs')

# Show whether the first 'Product' value is a numpy integer or a built-in int.
first_product = file['Product'][0]
print(
    'Product field instance of np.int64:', isinstance(first_product, np.int64), '...',
    'Product field instance of int: ', isinstance(first_product, int),
)

# Build the schema to validate against from its descriptor file.
schema_file = "harvest.schema2.yaml"
schema_one = Schema.from_descriptor(schema_file)

# Validate the DataFrame and print the report only when it is not valid.
# NOTE(review): the CSV is decoded by pd.read_csv above, outside this `try`,
# so this UnicodeDecodeError handler probably never fires - confirm.
try:
    report = validate(file, schema=schema_one, limit_errors=1)
    if not report.valid:
        pprint(report)
except UnicodeDecodeError as e:
    print(f"Error decoding file: {e}")

### Using python's csv library
I wrote this custom code to identify duplicate fields using Python's csv library. The end goal is to transform the files that have duplicate fields.

In [None]:
# Module-level list, presumably meant to collect files that contain duplicate
# columns; nothing in the visible code appends to it yet.
duplicate_col_files = []


def check_and_update_duplicate_columns(file_path, destination_folder, encoding=None):
    """Find repeated CSV header names and compute unique replacements.

    Only the header row of the file is read. The first occurrence of a name
    is kept as-is; each later occurrence is renamed to ``<name>_<n>``, where
    ``n`` starts at 1 and is bumped until the result clashes with no other
    header. When duplicates exist, their original names are printed.

    Args:
        file_path: Path of the CSV file to inspect.
        destination_folder: Currently unused; kept so existing callers keep
            working. No file is written by this function.
        encoding: Text encoding used to open the file. When ``None``, 'mbcs'
            is used if that codec is available (Windows); otherwise 'cp1252'
            is used as a stand-in, since 'mbcs' does not exist on other
            platforms.

    Returns:
        The list of header names with duplicates renamed; an empty list when
        the file has no header row.
    """
    if encoding is None:
        import codecs

        try:
            codecs.lookup('mbcs')
            encoding = 'mbcs'
        except LookupError:
            # Non-Windows interpreter: 'mbcs' is not registered.
            encoding = 'cp1252'

    with open(file_path, mode='r', newline='', encoding=encoding) as file:
        # Default of [] avoids StopIteration on an empty file.
        headers = next(csv.reader(file), [])

    taken = set(headers)   # every name already in use, original or generated
    repeats = {}           # header name -> highest suffix handed out so far
    new_headers = []
    for col in headers:
        if col not in repeats:
            repeats[col] = 0
            new_headers.append(col)
            continue

        repeats[col] += 1
        candidate = f"{col}_{repeats[col]}"
        # Skip suffixes that would collide with a column that already exists.
        while candidate in taken:
            repeats[col] += 1
            candidate = f"{col}_{repeats[col]}"
        taken.add(candidate)
        new_headers.append(candidate)

    duplicate_columns = [col for col, count in repeats.items() if count > 0]

    if duplicate_columns:
        print("Duplicate columns found:", duplicate_columns)

    return new_headers
       


# Check the harvest CSV for repeated header names.
check_and_update_duplicate_columns(
    file_path=df,
    destination_folder='duplicate_col_folder',
)

### Duplicate header using Frictionless data transform
Jarod mentioned that he wants minimal custom code so that we can document the pipeline based on the Frictionless glossary, so I tried using Frictionless transform.

In Frictionless, duplicate fields are read as `Field` and `Field2`. The challenge is that I cannot perform any operation (move, pack, merge, etc.) on the duplicate field (`Field2`) except removing it (https://framework.frictionlessdata.io/docs/steps/field.html#remove-field). This means I cannot work directly with `Field2`; in some ways, Frictionless does not 'identify' it.

I used this method on a demo CSV file and it worked, but it fails when I use it on the file from SMS.

In [None]:
source = Resource(path="harvest/all_harvest/Purdue_ACRE_DTC_2021_1_1.csv")

# The steps run in list order. "name4" and "name3" are temporary field names
# used while the two "Diff Status" columns are separated; the per-step notes
# record the author's observations and have not been verified here.
dedupe_steps = [
    # Park the first "Diff Status" under the temporary name "name4".
    # Per the author, the duplicate ("Diff Status2") then becomes addressable
    # as "Diff Status".
    steps.field_merge(name="name4", from_names=["Diff Status"], preserve=False),
    # Copy the duplicate into a new field "name3" so later steps can refer to it.
    # This is the step reported to raise when target.to_view() runs on the SMS
    # file, although it worked on a demo CSV.
    # NOTE(review): the formula references a field name containing a space,
    # which may not parse as an expression - confirm; a `function=` callback
    # might be an alternative.
    steps.field_add(name="name3", formula='Diff Status*1'),
    # Drop the original duplicate column.
    steps.field_remove(names=["Diff Status2"]),
    # Give both temporary fields their final names.
    steps.field_merge(name="Diff Status", from_names=["name4"], preserve=False),
    steps.field_merge(name="Diff Status2", from_names=["name3"], preserve=False),
    # Put both fields back at their original positions.
    steps.field_move(name="Diff Status", position=14),
    steps.field_move(name="Diff Status2", position=20),
]

target = transform(source, steps=dedupe_steps)

print(target.schema)
print(target.to_view())

In [None]:
# Validate the transformed resource against `schema` (limit_errors=2) and
# print the report only when validation did not pass.
report = validate(target, schema=schema, limit_errors=2)
if not report.valid:
    pprint(report)