# Using DataQuality Library

In [1]:
# Load the people dataset into a DataFrame with pandas
import pandas as pd

# Encoding used to decode the file (names contain accented characters)
encoding = 'iso-8859-1'

# File to be analyzed: fields are separated by semicolons
people = pd.read_csv('people.csv', sep=';', encoding=encoding)

# Preview the first rows
people.head()

Unnamed: 0,name,job,sex,age,salary,project,email
0,Inés,Front Developer,F,35,72000,Project B,Maritza@domain.com
1,Toño,Software Developer,M,33,72000,Project A,Deividomain.com
2,Carmiña,Software Developer,M,27,72000,Project B,Manuela@domain.com
3,Antony,Software Developer,M,24,72000,Project A,Antony@domain.com
4,Martha,Front Developer,F,30,74000,Project A,Martha@domain.com


In [2]:
# Summary of the people DataFrame: number of entries, per-column non-null
# counts, dtypes and memory usage
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 7 columns):
name       168 non-null object
job        167 non-null object
sex        168 non-null object
age        167 non-null object
salary     168 non-null object
project    165 non-null object
email      164 non-null object
dtypes: object(7)
memory usage: 9.3+ KB


# Importing library

In [3]:
import Data_Quality as DQ

# Start data quality rules on the semicolon-delimited people file.
# The third argument is False so that the rules retrieve bad registers only.
dataWork = DQ.Rules('people.csv', ';', False)

# Rule for checking length
This rule checks the length of a column. In this case, the rule returns all registers whose `job` value has more than 12 characters.

In [4]:
# Registers whose "job" value is longer than 12 characters
jobs_too_long = dataWork.checkMaxLength('job', 12)
print(jobs_too_long)

     index        name                 job sex  age  salary    project  \
0        0        Inés     Front Developer   F   35   72000  Project B   
1        1        Toño  Software Developer   M   33   72000  Project A   
2        2     Carmiña  Software Developer   M   27   72000  Project B   
3        3      Antony  Software Developer   M   24   72000  Project A   
4        4      Martha     Front Developer   F   30   74000  Project A   
5        5       Deivi  Software Developer   M   24   74000  Project B   
6        6       Deivi  Software Developer   M   40   74000  Project A   
7        7     Marisol     Front Developer   F   22   74000  Project A   
8        8      Martha     Front Developer   F   32   75000  Project A   
9        9      Antony  Software Developer   M   21   77000  Project A   
10      10      Antony  Software Developer   M   40   77000  Project A   
11      11       Lucia  Software Developer   F   32   77000  Project A   
12      12     Melanye  Software Devel

# Rule for list
This rule checks data against a reference list.

In [5]:
# Reference list used by the list rule (one valid job per row)
jobs_reference = pd.read_csv('jobs.csv')
jobs_reference

Unnamed: 0,job
0,Front Developer
1,Software Developer
2,Data Engineer
3,Manager


In [6]:
# Registers whose "job" value does not appear in the reference list.
# NaN and Intern are not in the reference list, so they are reported.
jobs_not_in_reference = dataWork.checkListReference('jobs.csv', 'job', 'job')
print(jobs_not_in_reference[['name', 'job']])

    name     job
0  David     NaN
1    Ana  Intern
2  Johan  Intern


# Rule for email
This rule checks for email structure.

In [7]:
# Bad registers in column "email": rows whose email has a wrong structure
bad_emails = dataWork.checkEmail('email')
print(bad_emails[['name', 'email']])

        name               email
0       Toño     Deividomain.com
1      Lucia                 NaN
2        Ana     Ana@@domain.com
3     Andrew  Andrew@domain..com
4     Andrea                   A
5     Manolo                 NaN
6   Anabelle                 NaN
7    Joaquin                 123
8      Peter                 NaN
9      Lucia                 NaN
10    Manolo                 NaN
11  Anabelle                 NaN
12     Peter                 NaN


# Rule for null values
This rule checks for missing values.

In [8]:
# Registers with a null value in the "project" column
missing_projects = dataWork.checkNull('project')
print(missing_projects[['name', 'project']])

         name project
0      Justin     NaN
1      Stella     NaN
2  Margarette     NaN


# Rule for numbers
This rule checks that values are numbers. Non-numeric values are marked as wrong.

In [9]:
# Registers whose "salary" value is not a number
non_numeric_salaries = dataWork.checkNumber('salary')
print(non_numeric_salaries[['name', 'salary']])


      name  salary
0    Johan   97a00
1  Joaquin  1020o0


# Rule for name
This rule checks the names. Only allows letters and spaces.

In [10]:
# Registers whose "name" contains anything other than letters and spaces
invalid_names = dataWork.checkName('name')
print(invalid_names[['name']])

      name
0  Ant0nio
1    J3nny
2    Ang3l
3   Ne/son
4    Juan0
5   Pedro_


# Closing 
Deletes all files generated during the data quality process.

In [11]:
# Finish the session: deletes the files generated during the data quality process
dataWork.close()