In [1]:
# create dataset

from dataset_creator import FakeDataset, MISSING_SYMBOLS

filename = 'dataset.csv'
OUTLIER_PERCENTAGE = 0.1
DUPLICATE_PERCENTAGE = 0.15
MISSING_PERCENTAGE = 0.1


dataset = FakeDataset(dataset_size = 100)\
        .add_outliers_above(outlier_percentage = OUTLIER_PERCENTAGE)\
        .add_duplicates(duplicate_percentage = DUPLICATE_PERCENTAGE)\
        .add_missing(missing_percentage = MISSING_PERCENTAGE)\
        .to_csv(filename)

  self.data = self.data.append(self.data.iloc[self.data.sample(frac=duplicate_percentage).index, :])


In [2]:
# read data
import numpy as np
import pandas as pd

data = pd.read_csv(filename)
print(data.shape)
data.head()

(115, 7)


Unnamed: 0,name,surname,birthdate,results1,results2,category,email
0,Charlotte,Brady,1937-10-17,555,,B,starkamanda@example.net
1,,Ayala,1950-06-12,88,,none,perkinscynthia@example.com
2,,Richards,2019-08-26,50,-0.023403,C,benjamin97@example.net
3,Maureen,Pierce,1933-09-15,73,,C,juliawarren@example.net
4,Amy,Allen,1943-08-14,42,0.366274,A,pbarrett@example.com


In [3]:
dataset_scores = dict()

## Check if there are any missing values

In [4]:
data.replace(MISSING_SYMBOLS, np.nan, inplace=True)

In [5]:
data.isna().sum()

name         12
surname      12
birthdate    12
results1     12
results2     12
category     12
email        12
dtype: int64

In [6]:
data = data._convert(numeric=True, datetime=True).convert_dtypes()


In [7]:
dataset_scores["missing_percentage"] = data.isna().sum().sum()/data.size
dataset_scores["most_missing_column"] = data.isna().sum().max()/data.shape[0]

In [8]:
assert(round(dataset_scores["missing_percentage"],2) == MISSING_PERCENTAGE)

In [9]:
data.results1.astype("float")

0      555.0
1       88.0
2       50.0
3       73.0
4       42.0
       ...  
110     17.0
111     54.0
112     78.0
113     72.0
114     97.0
Name: results1, Length: 115, dtype: float64

## Check duplicates

In [10]:
dataset_scores["duplication_percentage"] = sum(data.duplicated())/ data.shape[0]

## Check outliers

For numerical values we use the same method as in box plot (outlier is more tham q3 + 1.5 IQR or less than q1 - 1.5 IQR)

In [30]:
numeric_cols = data.select_dtypes(include=['number']).columns
outliers_nums = []

for col in numeric_cols:
    
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    
    iqr = q3-q1
    
    upper_bound = q3 + (1.5*iqr)
    lower_bound = q1 - (1.5*iqr)

    outliers_nums.append(np.sum((data[col] > upper_bound) | (data[col] < lower_bound)))

dataset_scores["outliers_percentage"] = sum(outliers_nums)/data.size
dataset_scores["most_outliers_column"] = max(outliers_nums)/data.shape[0]

For string columns we look for rare values (less than 1% of the observations) or dominant values (more than 90% of the observations)

Other ideas: correlation, not good dates, mishmashed formats, upper and lower cased, duplicates, is it actual, are all values the same...

# Create badges

In [13]:
dataset_scores

{'missing_percentage': 0.10434782608695652,
 'most_missing_column': 0.10434782608695652,
 'duplication_percentage': 0.017391304347826087,
 'outliers_percentage': 0.0,
 'most_outliers_column': 0.0}

In [14]:
# save obtained scores to json
import json
 
filename="./badge_data.json"
json_object = json.dumps(dataset_scores, indent=4)
 
with open(filename, "w") as outfile:
    outfile.write(json_object)

In [15]:
repo_url = "https://github.com/annapanfil/data_quality_labeler" #todo: get dinamically

ownername = repo_url.split("/")[3]
repo_name = repo_url.split("/")[4]
branch = "main"


print("To add badges paste this to your readme.md file:")
for badge in dataset_scores.keys():
    print(f"![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2F{ownername}%2F{repo_name}%2F{branch}%2F{filename}&query=%24.{badge}&label={badge})")


To add badges paste this to your readme.md file:
![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2Fannapanfil%2Fdata_quality_labeler%2Fmain%2F./badge_data.json&query=%24.missing_percentage&label=missing_percentage)
![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2Fannapanfil%2Fdata_quality_labeler%2Fmain%2F./badge_data.json&query=%24.most_missing_column&label=most_missing_column)
![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2Fannapanfil%2Fdata_quality_labeler%2Fmain%2F./badge_data.json&query=%24.duplication_percentage&label=duplication_percentage)
![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2Fannapanfil%2Fdata_quality_labeler%2Fmain%2F./badge_data.json&query=%24.outliers_percentage&label=outliers_percentage)
![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw