# Using DataQuality Library
This demo shows how to use the DataQuality library.

In [1]:
# Load the sample file with pandas
import pandas as pd

# Text encoding used when reading the file (ISO-8859-1 / Latin-1)
encoding = 'iso-8859-1'

# File to be analyzed: semicolon-delimited CSV; show the first five rows
people = pd.read_csv('people.csv', sep=';', encoding=encoding)
people.head(5)

Unnamed: 0,name,job,sex,age,salary,project,email
0,Inés,Front Developer,F,35,72000,Project B,Maritza@domain.com
1,Toño,Software Developer,M,33,72000,Project A,Deividomain.com
2,Carmiña,Software Developer,M,27,72000,Project B,Manuela@domain.com
3,Antony,Software Developer,M,24,72000,Project A,Antony@domain.com
4,Martha,Front Developer,F,30,74000,Project A,Martha@domain.com


In [2]:
# Summary of the people DataFrame: number of entries, columns,
# non-null counts per column, dtypes and memory usage
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 7 columns):
name       168 non-null object
job        167 non-null object
sex        168 non-null object
age        167 non-null object
salary     168 non-null object
project    165 non-null object
email      164 non-null object
dtypes: object(7)
memory usage: 9.3+ KB


# Importing library
The data quality library receives three parameters:
1. File to be analyzed
2. Delimiter
3. A boolean flag that selects which records are retrieved: records that do not comply with the rule (False) or records that do comply with the rule (True).

For this demo the flag is set to True, so each rule returns the records that comply with it; pass False instead to retrieve only the bad records.

In [18]:
import libraries.Data_Quality as DQ

# Start the data quality rules for people.csv, a ';'-delimited file.
# The third argument selects which records every rule returns:
#   True  -> records that comply with the rule
#   False -> records that do not comply (bad registers only)
dataWork = DQ.Rules('people.csv', ';', True)

# Rule for checking specific words
This rule checks for specific words in the records of a given column. In this case the rule returns all records containing the word "Intern".

In [4]:
# Look for the word 'Intern' in the 'job' column and print the first rows
intern_records = dataWork.checkContains('job', 'Intern')
print(intern_records.head())


   index   name     job sex   age salary    project             email
0      3    Ana  Intern   F  28.0  80000  Project A    Ana@domain.com
1      0  Johan  Intern   M  27.0  80000  Project A  Johan@domain.com


# Rule for checking length
This rule checks the length of the records in a given column. In this case the rule is called with a maximum length of 20 characters on the "job" column, so the records shown are the ones within that limit.

In [5]:
# Apply the maximum-length rule (limit of 20) to the 'job' column and print the first rows
job_length_check = dataWork.checkMaxLength('job', 20)
print(job_length_check.head())

   index     name                 job sex age salary    project  \
0      0     Inés     Front Developer   F  35  72000  Project B   
1      1     Toño  Software Developer   M  33  72000  Project A   
2      2  Carmiña  Software Developer   M  27  72000  Project B   
3      3   Antony  Software Developer   M  24  72000  Project A   
4      4   Martha     Front Developer   F  30  74000  Project A   

                email  
0  Maritza@domain.com  
1     Deividomain.com  
2  Manuela@domain.com  
3   Antony@domain.com  
4   Martha@domain.com  


# Rule for list
This rule checks data against a reference list.

In [6]:
# Load the reference list used by the rule and display it
jobs_reference = pd.read_csv('jobs.csv')
jobs_reference

Unnamed: 0,job
0,Front Developer
1,Software Developer
2,Data Engineer
3,Manager


In [7]:
# Check the "job" column against the reference list stored in jobs.csv.
# NaN and Intern are not in the reference list. dataWork was created with
# True, so the records printed here are the ones whose job is in the list;
# create it with False to get the registers that are not in the list.
job_reference_check = dataWork.checkListReference('jobs.csv', 'job', 'job')
print(job_reference_check[['name', 'job']])

           name                 job
0          Inés     Front Developer
1          Toño  Software Developer
2       Carmiña  Software Developer
3        Antony  Software Developer
4        Martha     Front Developer
5         Deivi  Software Developer
6         Deivi  Software Developer
7       Marisol     Front Developer
8        Martha     Front Developer
9        Antony  Software Developer
10       Antony  Software Developer
11        Lucia  Software Developer
12      Melanye  Software Developer
13          Ana  Software Developer
14          Ana  Software Developer
15      Ant0nio  Software Developer
16        Johan  Software Developer
17        Johan  Software Developer
18        J3nny  Software Developer
19       Andrew     Front Developer
20       Justin       Data Engineer
21       Andrea     Front Developer
22     Angelica     Front Developer
23       Manolo       Data Engineer
24         Bart  Software Developer
25         Bert  Software Developer
26      Belinda  Software De

# Rule for email
This rule checks for email structure.

In [8]:
# Check the structure of the values in column "email".
# dataWork was created with True, so the records printed here are the ones
# that pass the check; create it with False to get the emails with wrong structure.
email_check = dataWork.checkEmail('email')
print(email_check[['name', 'email']].head())

      name               email
0     Inés  Maritza@domain.com
1  Carmiña  Manuela@domain.com
2   Antony   Antony@domain.com
3   Martha   Martha@domain.com
4    Deivi    Deivi@domain.com


# Rule for null values
This rule checks for missing values.

In [9]:
# Run the null-value rule on the "project" column and print name and project
project_null_check = dataWork.checkNull('project')
print(project_null_check[['name', 'project']])

           name    project
0          Inés  Project B
1          Toño  Project A
2       Carmiña  Project B
3        Antony  Project A
4        Martha  Project A
5         Deivi  Project B
6         Deivi  Project A
7       Marisol  Project A
8        Martha  Project A
9        Antony  Project A
10       Antony  Project A
11        Lucia  Project A
12      Melanye  Project B
13          Ana  Project A
14          Ana  Project A
15      Ant0nio  Project A
16        Johan  Project A
17        Johan  Project B
18        David  Project A
19        J3nny  Project A
20       Andrew  Project B
21       Justin  Project A
22          Ana  Project A
23       Andrea  Project C
24     Angelica  Project A
25       Manolo  Project A
26         Bart  Project A
27         Bert  Project A
28      Belinda  Project A
29        Johan  Project A
..          ...        ...
135     Jacinto  Project B
136     Arnaldo  Project B
137     Esteban  Project B
138  Margarette  Project B
139      Nelson  Project B
1

# Rule for numbers
This rule checks for number type. Non-numeric values are marked as wrong.

In [10]:
# Run the number-type rule on the "salary" column and print name and salary
salary_number_check = dataWork.checkNumber('salary')
print(salary_number_check[['name', 'salary']])


           name  salary
0          Inés   72000
1          Toño   72000
2       Carmiña   72000
3        Antony   72000
4        Martha   74000
5         Deivi   74000
6         Deivi   74000
7       Marisol   74000
8        Martha   75000
9          Inés   72000
10         Toño   72000
11      Carmiña   72000
12       Antony   72000
13       Martha   74000
14        Deivi   74000
15        Deivi   74000
16      Marisol   74000
17       Martha   75000
18       Antony   77000
19       Antony   77000
20        Lucia   77000
21      Melanye   77000
22          Ana   77000
23          Ana   77000
24      Ant0nio   77000
25        Johan   77000
26        Johan   77000
27        David   79000
28       Antony   77000
29       Antony   77000
..          ...     ...
304  Margarette  110000
305       Peter  110000
306     Esteban   97000
307       Johan   97000
308      Angela   98000
309         Ned  100000
310       Jenny  102000
311     Joaquin  1020o0
312       Peter  102000
313      Andrea 

# Rule for name
This rule checks the names. Only records containing allowed letters and spaces pass the rule.

In [11]:
# Run the name rule on the "name" column and print the resulting names
name_check = dataWork.checkName('name')
print(name_check[['name']])

           name
0          Inés
1          Toño
2       Carmiña
3        Antony
4        Martha
5         Deivi
6         Deivi
7       Marisol
8        Martha
9          Inés
10         Toño
11      Carmiña
12       Antony
13       Martha
14        Deivi
15        Deivi
16      Marisol
17       Martha
18       Antony
19       Antony
20        Lucia
21      Melanye
22          Ana
23          Ana
24        Johan
25        Johan
26        David
27       Antony
28       Antony
29        Lucia
..          ...
300       Peter
301      Andrea
302  Margarette
303       Peter
304     Esteban
305       Johan
306      Angela
307         Ned
308       Jenny
309     Joaquin
310       Peter
311      Andrea
312  Margarette
313       Peter
314        John
315        John
316       Peter
317       Dayan
318       Pedro
319        Mary
320       Dayan
321        John
322       Juan0
323        John
324      Pedro_
325       Peter
326       Dayan
327       Pedro
328        Mary
329       Dayan

[330 ro

# Checking for Generic Pattern

This rule checks for a specific pattern.

In [12]:
# Check the "name" column against the pattern '^[A-Za-z]+$' and print the names
name_pattern_check = dataWork.checkPattern('name', '^[A-Za-z]+$')
print(name_pattern_check[['name']])

           name
0        Antony
1        Martha
2         Deivi
3         Deivi
4       Marisol
5        Martha
6          Inés
7          Toño
8       Carmiña
9        Antony
10       Martha
11        Deivi
12        Deivi
13      Marisol
14       Martha
15       Antony
16       Antony
17        Lucia
18      Melanye
19          Ana
20          Ana
21        Johan
22        Johan
23        David
24       Antony
25       Antony
26        Lucia
27      Melanye
28          Ana
29          Ana
..          ...
297       Peter
298      Andrea
299  Margarette
300       Peter
301     Esteban
302       Johan
303      Angela
304         Ned
305       Jenny
306     Joaquin
307       Peter
308      Andrea
309  Margarette
310       Peter
311        John
312        John
313       Peter
314       Dayan
315       Pedro
316        Mary
317       Dayan
318        John
319       Juan0
320        John
321      Pedro_
322       Peter
323       Dayan
324       Pedro
325        Mary
326       Dayan

[327 ro

# Closing 
Removes all files generated during the data quality process.

In [13]:
# Clean up: remove the files generated during the data quality process
dataWork.close()