## Generate dataset

In [5]:
from faker import Faker
import numpy as np
import pandas as pd
import random

fake = Faker()

In [7]:
columns=["name", "surname", "birthdate", "results1", "results2", "category", "email"]
missing_symbols = [None, "null", "", "NULL", "N/A", "n/a", "NA", "na", "NaN", "nan", "None", "none", "NONE"]

dataset_size = 100
missing_percentage = [0.1]*len(columns)

data = pd.DataFrame(columns=columns)

data["name"],data["surname"] = zip(*[fake.name().split() for _ in range(dataset_size)])
data["birthdate"] = [fake.date_of_birth() for _ in range(dataset_size)]
data["results1"] = [random.randint(0,100) for _ in range(dataset_size)]
data["results2"] = [random.random() for _ in range(dataset_size)]
data["category"] = random.choices(["A", "B", "C"], k=dataset_size)
data["email"] = [fake.email() for _ in range(dataset_size)]

## mishmash a bit
# missing
for column_no, missing_p in enumerate(missing_percentage):
    data.iloc[data.sample(frac=missing_p).index, column_no] = random.choices(missing_symbols, k=int(missing_p*dataset_size))

# future dates in birthdate

# outliers

# lowercase categories

# not int results

# ...

data.head()

Unnamed: 0,name,surname,birthdate,results1,results2,category,email
0,Joshua,Frank,2003-10-08,,0.690497,B,xmcdonald@example.net
1,Nathan,Morris,1979-03-15,83.0,0.406892,B,
2,James,Santiago,1962-10-16,27.0,0.3637,B,qwright@example.org
3,Keith,,1936-02-14,82.0,0.140616,B,
4,Karen,Krueger,1955-05-14,22.0,0.579501,,none


In [8]:
data.isnull().sum()

name         0
surname      0
birthdate    0
results1     0
results2     0
category     1
email        2
dtype: int64

# Start analysis

## Check if there are any missing values

In [9]:
dataset_scores = dict()

In [10]:
data.replace(missing_symbols, np.nan, inplace=True)

In [11]:
data.isna().sum()

name         10
surname      10
birthdate    10
results1     10
results2     10
category     10
email        10
dtype: int64

In [12]:
dataset_scores["missing_percentage"] = data.isna().sum().sum()/data.size
dataset_scores["most_missing_column"] = data.isna().sum().max()/data.shape[0]

Other ideas: correlation, not good dates, mishmashed formats, upper and lower cased...

# Create a badge

In [13]:
# save obtained scores to json
import json
 
filename="./badge_data.json"
json_object = json.dumps(dataset_scores, indent=4)
 
with open(filename, "w") as outfile:
    outfile.write(json_object)

In [15]:
repo_url = "https://github.com/annapanfil/data_quality_labeler" #todo: get dinamically

ownername = repo_url.split("/")[3]
repo_name = repo_url.split("/")[4]
branch = "main"


print("To add badges paste this to your readme.md file:")
for badge in dataset_scores.keys():
    print(f"![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2F{ownername}%2F{repo_name}%2F{branch}%2F{filename}&query=%24.{badge}&label={badge})")


To add badges paste this to your readme.md file:
![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2Fannapanfil%2Fdata_quality_labeler%2Fmain%2F./badge_data.json&query=%24.missing_percentage&label=missing_percentage)
![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2Fannapanfil%2Fdata_quality_labeler%2Fmain%2F./badge_data.json&query=%24.most_missing_column&label=most_missing_column)
