## Data Quality labelling
#### BDA project report

Report completed by:
* Anna Panfil
* Igor Czudy
* Juras Lukaševičius

In [1]:
# Setting directory

import os
os.chdir('D:/Users/Vartotojas/Documents/GitHub/data_quality_labeler')

In [2]:
# Creating dataset

from dataset_creator import create_dataset, MISSING_SYMBOLS
filename = create_dataset(dataset_size = 100, missing_percentage = 0.1, output_file="dataset.csv")

In [3]:
# Reading data
import numpy as np
import pandas as pd

data = pd.read_csv(filename)
data.head()

Unnamed: 0,name,surname,birthdate,results1,results2,category,email
0,Anthony,Hampton,1970-04-13,23,0.2189765068820564,C,rodriguezjonathan@example.com
1,Brian,Odom,1991-11-22,11,0.38665489707246736,C,rhondamccoy@example.org
2,Cameron,Meadows,1950-05-13,3,0.6289607608312131,,
3,Jack,Johnson,1974-12-21,13,none,C,zgarner@example.net
4,Veronica,Sullivan,1974-09-24,100,0.2368428069939834,B,ashelton@example.net


In [4]:
dataset_scores = dict()

### Checking if there are any missing values

In [8]:
data.replace(MISSING_SYMBOLS, np.nan, inplace=True)

In [9]:
data.isna().sum()

name         10
surname      10
birthdate    10
results1     10
results2     10
category     10
email        10
dtype: int64

In [10]:
dataset_scores["missing_percentage"] = data.isna().sum().sum()/data.size
dataset_scores["most_missing_column"] = data.isna().sum().max()/data.shape[0]

Other ideas: correlation, not good dates, mishmashed formats, upper and lower cased...

### Checking for formatting

In [11]:
# Here we check if columns only with numbers. If so, they are formatted as float64
# IMPORTANT! If the column is only numerical and an identifier, its name must be
# listed below. Otherwise, it will be added into outlier calculation.

categorical_variables = ['name']

In [12]:
string_columns = data.select_dtypes(include='object').columns

# Convert string columns to numeric if they contain only numbers or 'NaN'
for col in string_columns:
    try:
        # Skip conversion for columns in list A
        if col in categorical_variables:
            continue
            
        # Check if there are any numbers in the column
        if pd.to_numeric(data[col], errors='coerce').notna().any():
            data[col] = pd.to_numeric(data[col], errors='coerce')
        else:
            # If no numbers found, keep the column as an object
            data[col] = data[col].astype('object')
    except ValueError:
        print(f"Unable to convert column '{col}' to numeric.")

# Check the result
print("Updated DataFrame:")
print(data)

Updated DataFrame:
        name   surname   birthdate  results1  results2 category  \
0    Anthony   Hampton  1970-04-13      23.0  0.218977        C   
1      Brian      Odom  1991-11-22      11.0  0.386655        C   
2    Cameron   Meadows  1950-05-13       3.0  0.628961      NaN   
3       Jack   Johnson  1974-12-21      13.0       NaN        C   
4   Veronica  Sullivan  1974-09-24     100.0  0.236843        B   
..       ...       ...         ...       ...       ...      ...   
95      Anne   Andrews  1925-01-28      36.0  0.014766        A   
96   William     Brown  1964-08-11       6.0  0.445439        C   
97   Kenneth     Baker  1998-07-05      78.0  0.919524        B   
98      Carl  Fletcher  2012-09-09       5.0  0.657009        A   
99     Tracy       NaN         NaN       7.0       NaN        C   

                            email  
0   rodriguezjonathan@example.com  
1         rhondamccoy@example.org  
2                             NaN  
3             zgarner@example.ne

### Checking for outliers

In [13]:
# First we select the numeric columns in the data frame

numeric_columns = data.select_dtypes(include=['number']).columns
numeric_columns

Index(['results1', 'results2'], dtype='object')

In [14]:
# Function for outlier detection using descriptive statistics (quantiles)

def identify_outliers(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return column[(column < lower_bound) | (column > upper_bound)]

In [15]:
# Identify outliers in each numeric column

outliers_dict = {col: identify_outliers(data[col]) for col in numeric_columns}

In [16]:
numerical_values = len(data[list(outliers_dict.keys())])
outlier_count = 0

for col, outliers in outliers_dict.items():
        print(f"Potential number of outliers in {col}:")
        print(len(outliers))
        print("\n")
        outlier_count = len(outliers) + outlier_count

if numerical_values == 0:
    dataset_scores["outlier_percentage"] = 0
else:
    dataset_scores["outlier_percentage"] = outlier_count/numerical_values

Potential number of outliers in results1:
0


Potential number of outliers in results2:
0




### Duplicate detection (Lower case/higher case)

In [115]:
# Example data
info = {'Name': ['John', 'Mary', 'Bob', 'Bob', 'john', 'mary', 'BOB', 'Thomas', 'thomas', 'THOMAS', 'Thomas', 'Bob'], 
       'Day': ['noOn', 'nOon', 'Noon', 'noon', 'NOon', 'noon', 'noON', 'noon', 'nOON', 'NOON', 'noon', 'NoOn']}

# Create DataFrame
df = pd.DataFrame(info)

# Display the DataFrame
print("Original DataFrame:")
print(df)

Original DataFrame:
      Name   Day
0     John  noOn
1     Mary  nOon
2      Bob  Noon
3      Bob  noon
4     john  NOon
5     mary  noon
6      BOB  noON
7   Thomas  noon
8   thomas  nOON
9   THOMAS  NOON
10  Thomas  noon
11     Bob  NoOn


In [123]:
n_unique_val = 0
n_variants = 0

def get_case_duplicates(column):
    """
    Get the list of similar values (ignoring NaN) that only differ by capitalization in a column.
    
    Parameters:
    - column: pandas Series, the column to check
    
    Returns:
    - has_duplicates: True if case-insensitive duplicates are found, False otherwise
    - count_duplicates: count of similar values differing only by capitalization
    - duplicate_values: list of similar values differing only by capitalization
    """
    lowercased_values = column.dropna().astype(str).str.lower()
    
    def are_truly_case_duplicates(val1, val2):
        return val1 != val2 and val1.lower() == val2.lower()
    
    duplicated_mask = lowercased_values.duplicated(keep=False)
    has_duplicates = duplicated_mask.any()
    
    count_duplicates = 0
    duplicate_values = []
    
    for val in lowercased_values[duplicated_mask].unique():
        similar_values = lowercased_values[lowercased_values == val].index.tolist()
        if len(similar_values) > 1 and not any(are_truly_case_duplicates(lowercased_values[i], lowercased_values[j]) for i in similar_values for j in similar_values if i != j):
            count_duplicates += 1
            duplicate_values.extend(similar_values)
    
    return has_duplicates, count_duplicates, column[duplicate_values].tolist()

object_columns = data.select_dtypes(include='object').columns

for column_name in object_columns:
    has_duplicates, count_duplicates, duplicate_values = get_case_duplicates(data[column_name])

    if has_duplicates:
        print(f"The column '{column_name}' has {count_duplicates} truly similar values differing only by capitalization.")
        print(f"List of case-sensitive duplicate values: {duplicate_values}")
        unique_values = set(duplicate_values)
        unique_list = list(unique_values)
        print('\n')
        print(f"Unique values: {unique_list}")
        
        n_unique_val = n_unique_val + count_duplicates
        n_variants = n_variants + len(unique_list)
        
    else:
        print(f"The column '{column_name}' does not have truly similar values differing only by capitalization.")
    print("\n")  # Add a separator for better readability

The column 'name' has 10 truly similar values differing only by capitalization.
List of case-sensitive duplicate values: ['Anthony', 'Anthony', 'Cameron', 'Cameron', 'Kimberly', 'Kimberly', 'Kyle', 'Kyle', 'John', 'John', 'James', 'James', 'Matthew', 'Matthew', 'Matthew', 'Matthew', 'Anne', 'Anne', 'Bradley', 'Bradley', 'Victoria', 'Victoria']


Unique values: ['Kimberly', 'James', 'John', 'Anthony', 'Victoria', 'Cameron', 'Matthew', 'Anne', 'Bradley', 'Kyle']


The column 'surname' has 7 truly similar values differing only by capitalization.
List of case-sensitive duplicate values: ['Morris', 'Morris', 'Williams', 'Williams', 'Williams', 'Gonzalez', 'Gonzalez', 'Jones', 'Jones', 'Jones', 'Jones', 'Allen', 'Allen', 'Lewis', 'Lewis', 'Brown', 'Brown', 'Brown']


Unique values: ['Gonzalez', 'Jones', 'Williams', 'Morris', 'Brown', 'Allen', 'Lewis']


The column 'birthdate' does not have truly similar values differing only by capitalization.


The column 'category' has 3 truly similar valu

In [124]:
if n_variants == 0:
    dataset_scores["duplicate_proportion"] = 0
else:
    dataset_scores["duplicate_proportion"] = 1 - n_unique_val/n_variants

In [122]:
dataset_scores["duplicate_proportion"]

0.7222222222222222

### Creating badges

In [17]:
# save obtained scores to json
import json
 
filename="./badge_data.json"
json_object = json.dumps(dataset_scores, indent=4)
 
with open(filename, "w") as outfile:
    outfile.write(json_object)

In [18]:
repo_url = "https://github.com/annapanfil/data_quality_labeler" #todo: get dinamically

ownername = repo_url.split("/")[3]
repo_name = repo_url.split("/")[4]
branch = "main"


print("To add badges paste this to your readme.md file:")
for badge in dataset_scores.keys():
    print(f"![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2F{ownername}%2F{repo_name}%2F{branch}%2F{filename}&query=%24.{badge}&label={badge})")


To add badges paste this to your readme.md file:
![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2Fannapanfil%2Fdata_quality_labeler%2Fmain%2F./badge_data.json&query=%24.missing_percentage&label=missing_percentage)
![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2Fannapanfil%2Fdata_quality_labeler%2Fmain%2F./badge_data.json&query=%24.most_missing_column&label=most_missing_column)
![DQ Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2Fannapanfil%2Fdata_quality_labeler%2Fmain%2F./badge_data.json&query=%24.outlier_percentage&label=outlier_percentage)


In [None]:
# Example data
info = {'Name': ['John', 'Mary', 'Bob', 'Bob', 'john', 'mary', 'BOB', 'Thomas', 'thomas', 'THOMAS', 'Thomas', 'Bob'], 
       'Day': ['noOn', 'nOon', 'Noon', 'noon', 'NOon', 'noon', 'noON', 'noon', 'nOON', 'NOON', 'noon', 'NoOn']}

# Create DataFrame
df = pd.DataFrame(info)

# Display the DataFrame
print("Original DataFrame:")
print(df)