In [60]:
import pandas as pd
import datetime
import numpy as np
import json


In [61]:
# Parse JSON schema
class FileSchema():
    """CSV reader options and per-column metadata taken from a JSON schema.

    Every attribute is created by ``json_schema_parser``; a freshly
    constructed instance carries none of them.
    """

    def json_schema_parser(self, data):
        """Fill this object from an already-parsed JSON schema and print a summary.

        ``data`` must contain ``data["format"]["csv"]`` (with the keys
        ``delimiter``, ``compression`` and ``header``) and
        ``data["format"]["columns"]``, a mapping of column name to a mapping
        of column properties.

        Attributes set:
            delimiter, compression: copied unchanged from the csv section.
            header: the csv ``header`` entry passed through ``int``.
            col_names: column names in schema order.
            col_types: column -> 'int64', 'float64' or 'object'. Columns
                declared as 'date' get no entry here.
            col_dates: names of the columns declared as 'date'.
            col_null, col_keys, col_dt_format: column -> raw value of the
                'nullable', 'key' and 'date_format' properties.
            date_parser: the last 'date_format' encountered (None if none).
            decimal, thousand: the last 'decimal' / 'thousand' encountered
                (defaults: '.' and None).
        """
        csv_options = data["format"]["csv"]
        self.delimiter = csv_options['delimiter']
        self.compression = csv_options['compression']
        self.header = int(csv_options['header'])

        columns = data["format"]["columns"]

        # Per-column collections.
        self.col_names, self.col_dates = [], []
        self.col_types, self.col_null = {}, {}
        self.col_keys, self.col_dt_format = {}, {}

        # File-wide settings; overwritten by whichever column declares them last.
        self.decimal = '.'
        self.thousand = None
        self.date_parser = None

        for column in columns:
            self.col_names.append(column)
            spec = columns[column]

            for prop in spec:
                if prop == 'type':
                    declared = spec[prop]
                    if declared == 'date':
                        # Date columns are only tracked by name, not by dtype.
                        self.col_dates.append(column)
                    else:
                        # Anything that is not int/float is kept as 'object'.
                        self.col_types[column] = (
                            'int64' if declared == 'int'
                            else 'float64' if declared == 'float'
                            else 'object'
                        )

                elif prop == 'nullable':
                    self.col_null[column] = spec[prop]

                elif prop == 'key':
                    self.col_keys[column] = spec[prop]

                elif prop == 'date_format':
                    date_format = spec[prop]
                    self.col_dt_format[column] = date_format
                    self.date_parser = date_format

                elif prop == 'decimal':
                    self.decimal = spec[prop]

                elif prop == 'thousand':
                    self.thousand = spec[prop]

        # Echo everything that was parsed, one setting per line.
        summary = (
            ('delimiter = ', self.delimiter),
            ('compression = ', self.compression),
            ('header = ', self.header),
            ('col names: ', self.col_names),
            ('col types: ', self.col_types),
            ('col null: ', self.col_null),
            ('col keys: ', self.col_keys),
            ('col date format: ', self.col_dt_format),
            ('col dates:', self.col_dates),
            ('thousand:', self.thousand),
            ('decimal:', self.decimal),
        )
        for label, value in summary:
            print(label, value)


In [62]:
# Load the JSON schema that describes the CSV layout. JSON is UTF-8 by
# specification, so the encoding is stated explicitly instead of relying on
# the platform default.
with open('format_sample_001.json', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# Print the loaded schema. This previously printed `data`, a name this
# notebook never defines, so the cell failed on a fresh kernel.
print(json_data)


{'format': {'format_name': 'csv_sample_001', 'csv': {'delimiter': ';', 'compression': 'infer', 'header': '0', 'validate_keys': 'false'}, 'columns': {'CODIGO': {'type': 'int', 'nullable': 'false', 'key': 'true'}, 'NOME': {'type': 'string', 'nullable': 'false', 'size': '150'}, 'CPF': {'type': 'string', 'size': '14', 'nullable': 'true', 'key': 'false'}, 'NASCIMENTO': {'type': 'date', 'date_format': '%d/%m/%Y', 'min_date': '01/01/1900', 'max_date': '31/12/2050', 'nullable': 'true', 'key': 'false'}, 'SALDO': {'type': 'float', 'nullable': 'yes', 'key': 'false', 'decimal': ',', 'thousand': '.', 'negative': 'true', 'currency_symbol': 'R$'}}}}


In [63]:
# Create the schema holder and populate it from the loaded JSON; the parser
# prints each setting it extracted.
fmt = FileSchema()
fmt.json_schema_parser(json_data)

delimiter =  ;
compression =  infer
header =  0
col names:  ['CODIGO', 'NOME', 'CPF', 'NASCIMENTO', 'SALDO']
col types:  {'CODIGO': 'int64', 'NOME': 'object', 'CPF': 'object', 'SALDO': 'float64'}
col null:  {'CODIGO': 'false', 'NOME': 'false', 'CPF': 'true', 'NASCIMENTO': 'true', 'SALDO': 'yes'}
col keys:  {'CODIGO': 'true', 'CPF': 'false', 'NASCIMENTO': 'false', 'SALDO': 'false'}
col date format:  {'NASCIMENTO': '%d/%m/%Y'}
col dates: ['NASCIMENTO']
thousand: .
decimal: ,


In [64]:
# Read the raw file with the options declared in the schema (delimiter,
# compression and header row were parsed but previously not all applied).
#
# Every column is read as text (dtype=str) so that pandas does not infer
# numeric types on its own: with ',' as decimal mark and '.' as thousand
# separator, a value such as "1.000" would otherwise be inferred as the float
# 1.0 before convert_columns gets a chance to interpret it.
p_csv = 'sample_file_01.csv'
df1 = pd.read_csv(
    p_csv,
    delimiter=fmt.delimiter,
    compression=fmt.compression,
    header=fmt.header,
    dtype=str,
    encoding='UTF-8',
)

# reset_index() materialises the row position as an 'index' column, which
# convert_columns reads to report the line number of a failed conversion.
df1 = df1.reset_index()

In [65]:
# Inspect the column dtypes as read from the file, before any explicit conversion.
df1.dtypes

index          int64
CODIGO        object
NOME          object
CPF           object
NASCIMENTO    object
SALDO         object
dtype: object

In [66]:
def convert_columns(row, col, tipo, fmt, log):
    """Convert ``row[col]`` to the type named by ``tipo``, logging failures.

    Intended to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters:
        row: mapping-like row; must provide ``row[col]`` and ``row['index']``
            (the latter is only read when a conversion fails).
        col: name of the column to convert.
        tipo: target type, one of 'int', 'float' or 'date'.
        fmt: object exposing ``thousand`` and ``decimal`` (used for 'float')
            and ``date_parser`` (``strptime`` format used for 'date').
        log: dict that receives one entry per failure, keyed by
            ``'row:<n>,col:<col>'``.

    Returns:
        The converted value (``int``, ``float`` or ``datetime.datetime``), or
        NaN when the conversion fails. Failures never propagate: any exception
        raised while converting is recorded in ``log`` instead. This includes
        missing values (NaN) and an unsupported ``tipo``.
    """
    # Bound before the try block so the error message below can always refer
    # to it, even when the lookup row[col] itself is what failed.
    original = None
    try:
        original = row[col]

        if tipo == 'int':
            ans = int(original)

        elif tipo == 'float':
            v_float = original
            # Normalise the number to Python notation: drop the thousand
            # separator first, then turn the decimal mark into '.'.
            if v_float and fmt.thousand:
                v_float = v_float.replace(fmt.thousand, '')
            if v_float and fmt.decimal:
                v_float = v_float.replace(fmt.decimal, '.')
            ans = float(v_float)

        elif tipo == 'date':
            ans = datetime.datetime.strptime(original, fmt.date_parser)

        else:
            # Without this branch `ans` would stay unbound and the final
            # return would raise UnboundLocalError.
            raise ValueError('unsupported target type: {!r}'.format(tipo))

    except Exception as erro:
        # NOTE(review): the +2 presumably maps the 0-based row position to a
        # 1-based file line after a header row — confirm against the input files.
        v_row = str(row['index'] + 2)
        v_key = 'row:'+  v_row + ',col:'+col
        error_str = 'Linha {l} , Coluna {c} : Erro ao tentar converter o valor \'{o}\' para o tipo {t}. | Exception: {e}'.format(
                        l = v_row, o = str(original), t= str(tipo), c = col, e = erro)
        log[v_key] = error_str
        # `pd.np` was removed from pandas; use numpy directly.
        ans = np.nan

    return ans


In [67]:
# Collects one message per failed conversion, across all converted columns.
flog = {}

# Column -> target type, converted in this order; each column is replaced
# in place by its converted values.
conversion_plan = (
    ('CODIGO', 'int'),
    ('SALDO', 'float'),
    ('NASCIMENTO', 'date'),
)
for column_name, target_type in conversion_plan:
    df1[column_name] = df1.apply(
        convert_columns, axis=1, col=column_name, tipo=target_type, fmt=fmt, log=flog
    )

In [68]:
# Show the conversion failures collected above, keyed by 'row:<n>,col:<name>'.
flog

{'row:9,col:CODIGO': "Linha 9 , Coluna CODIGO : Erro ao tentar converter o valor 'x' para o tipo int. | Exception: invalid literal for int() with base 10: 'x'",
 'row:15,col:CODIGO': "Linha 15 , Coluna CODIGO : Erro ao tentar converter o valor 'nan' para o tipo int. | Exception: cannot convert float NaN to integer",
 'row:9,col:SALDO': "Linha 9 , Coluna SALDO : Erro ao tentar converter o valor 'x' para o tipo float. | Exception: could not convert string to float: 'x'",
 'row:15,col:SALDO': "Linha 15 , Coluna SALDO : Erro ao tentar converter o valor 'nan' para o tipo float. | Exception: 'float' object has no attribute 'replace'",
 'row:9,col:NASCIMENTO': "Linha 9 , Coluna NASCIMENTO : Erro ao tentar converter o valor 'x' para o tipo date. | Exception: time data 'x' does not match format '%d/%m/%Y'",
 'row:15,col:NASCIMENTO': "Linha 15 , Coluna NASCIMENTO : Erro ao tentar converter o valor 'nan' para o tipo date. | Exception: strptime() argument 1 must be str, not float"}

In [71]:
import csv

# Write the collected error messages, one per line, to '<input file>.err'.
# The `with` block closes the file even if writing fails. newline='' leaves
# line endings to the csv writer, and the encoding matches the one used to
# read the input so that logged values with accented characters are kept.
with open(p_csv + ".err", "w", newline="", encoding="utf-8") as err_file:
    err_writer = csv.writer(err_file, lineterminator='\n')
    # Only the messages are written; the dictionary keys are not needed here.
    err_writer.writerows([message] for message in flog.values())

In [70]:
# Preview the converted frame; values that failed conversion show up as NaN/NaT.
df1.head(15)

Unnamed: 0,index,CODIGO,NOME,CPF,NASCIMENTO,SALDO
0,0,1001.0,Tina Voldaren,100.100.100-10,1980-11-15,459.0
1,1,1002.0,Ana Fier,200.200.200-20,2000-01-01,500.0
2,2,1003.0,Samantha Smallcock,300.200.100-30,1977-07-31,3333.3
3,3,1004.0,Jennifer Ravena (Jinni),400.300.200-10,1944-04-04,112540.5
4,4,1005.0,Barbara Patrick,100.100.100-11,1999-12-25,24117.0
5,5,1006.0,Selena Herrick (Sena),200.200.200-21,1980-11-15,20177.87
6,6,1007.0,Sarah Varney (Sadie),300.200.100-31,2000-01-01,114.57
7,7,,x,x,NaT,
8,8,1008.0,Vera Cassidy,400.300.200-11,1977-07-31,51.27
9,9,1009.0,Genésia Aparecida,100.100.100-12,1944-04-04,-12.03
