In [2]:
import goodtables
import pandas as pd

In [21]:
from goodtables import validate, check

In [11]:
!ls "csvs/west bengal/2016-17/18. Public Works, Roads/demand_no_25_major_head_2049_detailed.csv" 

'csvs/west bengal/2016-17/18. Public Works, Roads/demand_no_25_major_head_2049_detailed.csv'


In [17]:
filename = 'csvs/west bengal/2016-17/18. Public Works, Roads/demand_no_25_major_head_2049_detailed.csv'


In [18]:
data = pd.read_csv(filename)

In [20]:
data

Unnamed: 0,Budget Code,Particulars,Voted/Charged,"Actuals, 2013-2014 Rs","Budget Estimate, 2015-2016 Rs","Revised Estimate, 2015-2016 Rs","Budget Estimate, 2016-2017 Rs"
0,[04],04 - INTEREST ON LOANS AND ADVANCES FROM,,,,,
1,,CENTRAL GOVERNMENT,,,,,
2,[103],103- Interest On Loans For Centrally Sponsored...,,,,,
3,,Schemes,,,,,
4,,Loans for State Roads for Economic or Inter- S...,Charged,,...,...,...
5,,Total - 2049-04-103,,,...,...,...
6,,,Voted,,...,...,...
7,,,Charged,,...,...,...
8,[60],60 - INTEREST ON OTHER OBLIGATIONS,,,,,
9,[701],701- Miscellaneous,,,,,


In [None]:
validation_checks = {''}

In [12]:
report = validate(filename)


In [14]:
report.viewkeys()



In [16]:
report['error-count']

0

In [35]:
import re

In [211]:
re.findall(r'^\d*', '04 - INTEREST ON LOANS AND ADVANCES FROM')

['04']

In [196]:
@check('missing-budget-code', type='custom', context='body')
def missing_budget_code(errors, cells, row_number):
    required_cells = {cell['header']: cell['value'] for cell in cells if cell['header'] in ['Budget Code', 
                                                                                            'Particulars']}
    budget_code = required_cells['Budget Code'].strip('[').strip(']')
    particulars = required_cells['Particulars'].strip()
    # check if particulars start with a number
    if len(re.findall(r'^\d', particulars)) > 0:
        # budget code should not be empty and should be part of particulars
        if len(budget_code) == 0 or budget_code not in particulars:
            errors.append({
                'code': 'Minor - Budget Code',
                'message': 'Budget Code missing for the particulars',
                'row-number': row_number,
                'column-number': 1,
            })

In [197]:
@check('numbers-only', type='custom', context='body')
def numbers_only(errors, cells, row_number):
    number_cells = [cell for cell in cells if 'Rs' in cell['header']]
    for number_cell in number_cells:
        # check if any text value came into the number columns
        if len(re.findall(r'[a-zA-Z]', number_cell['value'])) > 0:
            errors.append({
                'code': 'Major - Number Columns',
                'message': 'Text present in Number Columns',
                'row-number': row_number,
                'column-number': number_cell['number'],
            })

In [198]:
@check('header-count', type='custom', context='head')
def header_count(errors, cells, row_number, sample=None):
    numeric_headers = [cell['header'] for cell in cells if 'Rs' in cell['header']]
    categorical_headers = [cell['header'] for cell in cells if 'Rs' not in cell['header']]
    if len(numeric_headers) != 4 or len(categorical_headers) != 3:
        errors.append({
            'code': 'Major - Missing Headers',
            'message': 'The number of expected columns is less',
            'row-number': row_number,
            'column-number': 'N/A'
        })

In [199]:
@check('categorical-headers', type='custom', context='head')
def categorical_headers(errors, cells, row_number, sample=None):
    categorical_headers = [cell['header'] for cell in cells if 'Rs' not in cell['header']]
    fixed_categorical_headers = ['Budget Code', 'Particulars', 'Voted/Charged']
    diff = set(categorical_headers) - set(fixed_categorical_headers)
    if len(diff) > 0:
        errors.append({
            'code': 'Major - Missing Categorical Columns',
            'message': 'Missing either of the fixed columns: {0}'.format(fixed_categorical_headers),
            'row-number': row_number,
            'column-number': 'N/A'
        })

In [172]:
report = validate(filename, checks=['numbers-only', 'missing-budget-code', 
                                    'missing-header', 'header-count',
                                    'categorical-headers', 'duplicate-header'],
                  preset='table',  infer_schema=1, error_limit=20)

In [173]:
report


{u'error-count': 1,
 u'preset': 'table',
 u'table-count': 1,
 u'tables': [{u'encoding': None,
   u'error-count': 1,
   u'errors': [{u'code': u'format-error',
     u'column-number': None,
     u'message': 'Format "png" is not supported',
     u'row': None,
     u'row-number': None}],
   u'format': None,
   u'headers': [],
   u'row-count': 0,
   u'schema': None,
   u'scheme': None,
   u'source': '109.png',
   u'time': 0.0,
   u'valid': False}],
 u'time': 0.003,
 u'valid': False,

In [151]:
import os

In [200]:
csv_files = []

for source, _, filenames in os.walk('csvs/west bengal/2016-17/'):
    for filename in filenames:
        if 'demand' in filename and '.csv' in filename:
             csv_files.append({'source': os.path.join(source, filename)})

In [201]:
report = validate(csv_files, checks=['numbers-only', 'missing-budget-code', 
                                    'missing-header', 'header-count',
                                    'categorical-headers', 'duplicate-header'],
                  preset='nested', table_limit=len(csv_files))

In [202]:
report['error-count']

1307

In [203]:
tables_with_error = [table for table in report['tables'] if not table['valid'] > 0]

In [189]:
len(tables_with_error)

182

In [193]:
len(tables_with_error) / float(len(csv_files))

0.28888888888888886

In [206]:
len([table for table in tables_with_error if 'Minor - Budget Code' in table['errors'][0]['code']])

148

In [212]:
[table for table in tables_with_error if 'Major' in table['errors'][0]['code']]

[{u'encoding': 'utf-8',
  u'error-count': 1,
  u'errors': [{'code': 'Major - Missing Headers',
    'column-number': 'N/A',
    'message': 'The number of expected columns is less',
    u'row': None,
    'row-number': [[u'[01]', u'01 - INTEREST ON INTERNAL DEBT', u' '],
     [u'[200]', u'200- Interest on Other Intenal Debts', u' '],
     [u'', u'NP-Non Plan', u' '],
     [u'[45]', u'45- Interest/Dividend', u'Charged'],
     [u'', u'Total - 2049-01-200-NP-001', u''],
     [u'[034]', u'034- Loans from HUDCO [RL]', u' '],
     [u'[45]', u'45- Interest/Dividend', u'Charged'],
     [u'', u'Total - 2049-01-200-NP - Non Plan', u''],
     [u'', u'Total - 2049-01-200', u''],
     [u'', u' ', u'Voted'],
     [u'', u' ', u'Charged'],
     [u'[05]', u'05 - INTEREST ON RESERVE FUNDS', u' '],
     [u'[105]', u'105- Interest on General and Other Reserve Funds', u' '],
     [u'', u'NP-Non Plan', u' '],
     [u'[45]', u'45- Interest/Dividend', u'Charged'],
     [u'[002]', u'002- Interest on State Disaste