In [1]:
##import data_quality as dq
import pandas as pd
import urllib.request
from zipfile import ZipFile
from importlib import reload

from Framework.data_quality import DataQualityChecker
from Visualization.dashboard import Dashboard

In [2]:
# Download the UCI Air Quality archive into the current working directory.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip"
filename = "AirQualityUCI.zip"
urllib.request.urlretrieve(url, filename)

# Read the CSV directly out of the archive. Both the archive and the member
# are opened as context managers so the member handle is closed as soon as
# pandas has consumed it, instead of staying open (bound to the notebook
# global `csv_file`) for the lifetime of the kernel.
with ZipFile(filename, "r") as zip_file:
    with zip_file.open("AirQualityUCI.csv") as csv_file:
        # The file is parsed with ';' as field separator and ',' as decimal mark.
        data = pd.read_csv(csv_file, sep=";", decimal=",")

# NOTE(review): the raw CSV may contain empty trailing rows/columns and
# sentinel values for missing readings -- confirm whether these should be
# cleaned before the quality checks are computed.

# Wrap the raw frame in the project's data-quality checker.
checker = DataQualityChecker(data)
columns_of_interest = ["CO(GT)", "PT08.S1(CO)", "NMHC(GT)", "C6H6(GT)", "PT08.S2(NMHC)", "NOx(GT)",
                       "PT08.S3(NOx)", "NO2(GT)", "PT08.S4(NO2)", "PT08.S5(O3)", "T", "RH", "AH"]

# The three calls below keep their return values for later inspection.
# Presumably each one also registers a check on `checker` that run_checks()
# executes -- TODO confirm against Framework.data_quality.

# Consistency scores for every column of interest.
Consistency_score = checker.calculate_consistency_scores(columns_of_interest)

# Expect the minimum of the "PT08.S1(CO)" column to fall between 800 and 1000.
between_expectation = checker.expect_column_min_to_be_between("PT08.S1(CO)", 800, 1000)

# Relevancy scores for every column of interest; the meaning of the second
# argument (3) is defined by DataQualityChecker -- TODO confirm and name it.
Relevancy_score = checker.calculate_relevancy_scores(columns_of_interest, 3)

# Run the checker and show the resulting report.
report = checker.run_checks()
print(report)

[{'Column': 'CO(GT)', 'RelevancyScore': 9357}, {'Column': 'PT08.S1(CO)', 'RelevancyScore': 8990}, {'Column': 'NMHC(GT)', 'RelevancyScore': 9087}, {'Column': 'C6H6(GT)', 'RelevancyScore': 8991}, {'Column': 'PT08.S2(NMHC)', 'RelevancyScore': 8982}, {'Column': 'NOx(GT)', 'RelevancyScore': 9247}, {'Column': 'PT08.S3(NOx)', 'RelevancyScore': 8935}, {'Column': 'NO2(GT)', 'RelevancyScore': 9357}, {'Column': 'PT08.S4(NO2)', 'RelevancyScore': 8991}, {'Column': 'PT08.S5(O3)', 'RelevancyScore': 9338}, {'Column': 'T', 'RelevancyScore': 8991}, {'Column': 'RH', 'RelevancyScore': 8991}, {'Column': 'AH', 'RelevancyScore': 8991}]
[]
