## Generate dataset

In [68]:
from faker import Faker
import numpy as np
import pandas as pd
import random

fake = Faker()

In [69]:
dataset_size = 100
missing_percentage = [0.1]*len(columns)

columns=["name", "surname", "birthdate", "results1", "results2", "category", "email"]
missing_symbols = [None, "null", "", "NULL", "N/A", "n/a", "NA", "na", "NaN", "nan", "None", "none", "NONE"]

data = pd.DataFrame(columns=columns)

data["name"],data["surname"] = zip(*[fake.name().split() for _ in range(dataset_size)])
data["birthdate"] = [fake.date_of_birth() for _ in range(dataset_size)]
data["results1"] = [random.randint(0,100) for _ in range(dataset_size)]
data["results2"] = [random.random() for _ in range(dataset_size)]
data["category"] = random.choices(["A", "B", "C"], k=dataset_size)
data["email"] = [fake.email() for _ in range(dataset_size)]

## mishmash a bit
# missing
for column_no, missing_p in enumerate(missing_percentage):
    data.iloc[data.sample(frac=missing_p).index, column_no] = random.choices(missing_symbols, k=int(missing_p*dataset_size))

# future dates in birthdate

# outliers

# lowercase categories

# not int results

# ...

data.head()

Unnamed: 0,name,surname,birthdate,results1,results2,category,email
0,Laura,Robbins,2009-04-07,86,0.909423,B,kyle02@example.com
1,Douglas,Morris,1915-08-08,82,0.689553,A,rhonda83@example.org
2,Jesus,,1936-01-10,55,0.007598,C,jameshartman@example.com
3,Melanie,Morales,1982-09-17,59,,C,ycruz@example.com
4,Sylvia,Miller,1923-06-04,13,0.059989,B,


In [72]:
data.isnull().sum()

name         0
surname      0
birthdate    1
results1     1
results2     1
category     0
email        0
dtype: int64

# Start analysis

## Check if there are any missing values

In [91]:
dataset_scores = dict()

In [92]:
data.replace(missing_symbols, np.nan, inplace=True)

In [93]:
data.isna().sum()

name         10
surname      10
birthdate    10
results1     10
results2     10
category     10
email        10
dtype: int64

In [94]:
dataset_scores["missing_percentage"] = data.isna().sum().sum()/data.size
dataset_scores["most_missing_column"] = data.isna().sum().max()/data.shape[0]

Other ideas: correlation, not good dates, mishmashed formats, upper and lower cased...

# Create a badge

In [98]:
# save obtained scores to json
import json
 
filename="./badge_data.json"
json_object = json.dumps(dataset_scores, indent=4)
 
with open(filename, "w") as outfile:
    outfile.write(json_object)

In [100]:
repo_url = "https://github.com/annapanfil/data_quality_labeler" #todo: get dinamically

print("To add badges paste this to your readme.md file:")
for badge in dataset_scores.keys():
    print(f"![DQ Badge](https://img.shields.io/badge/dynamic/json?url={repo_url}/blob/main/{filename})&query=%24.{badge}&label={badge}")


to add badges paste this to your readme.md file:
![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https://github.com/annapanfil/data_quality_labeler/blob/main/./badge_data.json)&query=%24.missing_percentage&label=missing_percentage
![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https://github.com/annapanfil/data_quality_labeler/blob/main/./badge_data.json)&query=%24.most_missing_column&label=most_missing_column
