In [1]:
import logging
from typing import Any, Dict, Tuple
import numpy as np
import pandas as pd
import great_expectations as gx
import mlflow

import os
from pathlib import Path
# Directory that holds the raw CSV splits.
# The default is the original author's local checkout; set the
# HOTEL_RAW_DATA_DIR environment variable to run the notebook against a
# different location without editing this cell.
raw_data_dir = Path(
    os.environ.get(
        "HOTEL_RAW_DATA_DIR",
        "/Users/antoniooliveira/Documents/GitHub/mlops_project_v2/hotel-california/data/01_raw",
    )
)
# Later cells read the CSV files by bare file name, so they depend on this
# directory being the process working directory.
os.chdir(raw_data_dir)

In [2]:
# Feature splits, read relative to the current working directory.
# Note: `df_test` is loaded from the validation file ('X_val.csv').
df = pd.read_csv(filepath_or_buffer='X_train.csv')
df_test = pd.read_csv(filepath_or_buffer='X_val.csv')

In [3]:
# Summary statistics of the training features, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BookingID,11150.0,25040.999552,8659.122219,10000.0,17532.5,25073.0,32573.5,39997.0
ArrivalYear,11150.0,2016.0,0.0,2016.0,2016.0,2016.0,2016.0,2016.0
ArrivalMonth,11150.0,6.865471,3.18182,1.0,4.0,7.0,10.0,12.0
ArrivalWeekNumber,11150.0,28.893812,13.927088,1.0,17.0,29.0,41.0,53.0
ArrivalDayOfMonth,11150.0,15.843498,8.872837,1.0,8.0,16.0,24.0,31.0
ArrivalHour,11150.0,18.869215,2.884477,14.0,16.5,19.0,21.25,23.75
WeekendStays,11150.0,0.879283,0.938771,0.0,0.0,1.0,2.0,14.0
WeekdayStays,11150.0,2.292466,1.731177,0.0,1.0,2.0,3.0,34.0
Adults,11150.0,1.886996,0.549596,0.0,2.0,2.0,2.0,4.0
Children,11150.0,0.145112,0.458836,0.0,0.0,0.0,0.0,3.0


In [4]:
import great_expectations as ge
import mlflow
import pandas as pd

# Names of the expectation methods that are looked up on the Great Expectations dataset.
_BETWEEN = "expect_column_values_to_be_between"
_IN_SET = "expect_column_values_to_be_in_set"
_OF_TYPE = "expect_column_values_to_be_of_type"
_UNIQUE = "expect_column_values_to_be_unique"

# Ordered data contract: (expectation method, column, keyword arguments).
# A `max_value` of None leaves the range open-ended (only the lower bound is checked).
_DATA_QUALITY_CHECKS: Tuple[Tuple[str, str, Dict[str, Any]], ...] = (
    # Identifier: int64 and unique
    (_OF_TYPE, "BookingID", {"type_": "int64"}),
    (_UNIQUE, "BookingID", {}),
    # Arrival date parts
    (_OF_TYPE, "ArrivalYear", {"type_": "int64"}),
    (_BETWEEN, "ArrivalYear", {"min_value": 2016, "max_value": 2016}),
    (_BETWEEN, "ArrivalMonth", {"min_value": 1, "max_value": 12}),
    (_BETWEEN, "ArrivalWeekNumber", {"min_value": 1, "max_value": 53}),
    (_BETWEEN, "ArrivalDayOfMonth", {"min_value": 1, "max_value": 31}),
    (_BETWEEN, "ArrivalHour", {"min_value": 14, "max_value": 24}),
    # Stay length and party size: non-negative
    (_BETWEEN, "WeekendStays", {"min_value": 0, "max_value": None}),
    (_BETWEEN, "WeekdayStays", {"min_value": 0, "max_value": None}),
    (_BETWEEN, "Adults", {"min_value": 0, "max_value": None}),
    (_BETWEEN, "Children", {"min_value": 0, "max_value": None}),
    (_BETWEEN, "Babies", {"min_value": 0, "max_value": None}),
    # Customer flags (0/1) and booking history (non-negative)
    (_IN_SET, "FirstTimeGuest", {"value_set": [0, 1]}),
    (_IN_SET, "AffiliatedCustomer", {"value_set": [0, 1]}),
    (_BETWEEN, "PreviousReservations", {"min_value": 0, "max_value": None}),
    (_BETWEEN, "PreviousStays", {"min_value": 0, "max_value": None}),
    (_BETWEEN, "PreviousCancellations", {"min_value": 0, "max_value": None}),
    (_BETWEEN, "DaysUntilConfirmation", {"min_value": 0, "max_value": None}),
    # Reservation details
    (_IN_SET, "OnlineReservation", {"value_set": [0, 1]}),
    (_BETWEEN, "BookingChanges", {"min_value": 0, "max_value": None}),
    (_BETWEEN, "BookingToArrivalDays", {"min_value": 0, "max_value": 365}),
    (_IN_SET, "ParkingSpacesBooked", {"value_set": [0, 1]}),
    (_BETWEEN, "SpecialRequests", {"min_value": 0, "max_value": 5}),
    (_IN_SET, "PartOfGroup", {"value_set": [0, 1]}),
    (_BETWEEN, "OrderedMealsPerDay", {"min_value": 0, "max_value": None}),
    # Floors: -1 is an accepted value for FloorAssigned only
    (_BETWEEN, "FloorReserved", {"min_value": 0, "max_value": 6}),
    (_BETWEEN, "FloorAssigned", {"min_value": -1, "max_value": 6}),
    # Prices and payment share
    (_BETWEEN, "DailyRateEuros", {"min_value": 0, "max_value": None}),
    (_BETWEEN, "DailyRateUSD", {"min_value": 0, "max_value": None}),
    (_BETWEEN, "%PaidinAdvance", {"min_value": 0, "max_value": 1}),
    # Country-of-origin indicators
    (_BETWEEN, "CountryofOriginAvgIncomeEuros (Year-2)", {"min_value": 0, "max_value": None}),
    (_BETWEEN, "CountryofOriginAvgIncomeEuros (Year-1)", {"min_value": 0, "max_value": None}),
    (_BETWEEN, "CountryofOriginHDI (Year-1)", {"min_value": 0, "max_value": 1}),
)


def _describe_failure(expectation: str, column: str, kwargs: Dict[str, Any], outcome: Any) -> str:
    """Build a one-line, human-readable description of a failed expectation."""
    arguments = ", ".join(f"{key}={value!r}" for key, value in kwargs.items())
    message = f"{column}: {expectation}({arguments}) failed"
    # The validation result may carry a `result` mapping with an unexpected-value
    # count; include it when present, but never fail because it is missing.
    details = getattr(outcome, "result", None)
    if isinstance(details, dict) and details.get("unexpected_count") is not None:
        message += f" ({details['unexpected_count']} unexpected values)"
    return message


def unit_test(
    df: pd.DataFrame,
    mlruns_path: str = '/Users/antoniooliveira/Documents/GitHub/mlops_project_v2/hotel-california/mlruns',
) -> str:
    """Validate a booking feature frame against the data contract and log the run to MLflow.

    Every entry of ``_DATA_QUALITY_CHECKS`` is evaluated through a Great
    Expectations ``PandasDataset`` built from a deep copy of ``df``. All checks
    are run before the outcome is decided, so one call reports every violated
    expectation rather than only the first.

    Args:
        df: Frame to validate. It is copied, so the caller's object is not modified.
        mlruns_path: Filesystem location of the MLflow tracking store. It is
            turned into a ``file://`` tracking URI. Defaults to the path that
            was previously hard-coded in the function body.

    Returns:
        The string ``"All data quality tests passed successfully."`` when every
        expectation succeeds.

    Raises:
        AssertionError: If one or more expectations fail. The message lists each
            failing column and expectation; the same list is logged to the run
            as ``failed_expectations.json``. It is raised explicitly rather than
            through ``assert`` so the checks still run under ``python -O``.
    """
    mlflow.set_tracking_uri(f'file://{mlruns_path}')

    # Close a run left open by an earlier cell so start_run below does not nest or fail.
    if mlflow.active_run():
        mlflow.end_run()

    df = df.copy(deep=True)
    mlflow.set_experiment("data_unit_tests")

    # The context manager ends the run on exit, including when an exception
    # propagates, so no explicit end_run() is needed afterwards.
    with mlflow.start_run(run_name="data_unit_tests_run_"):
        mlflow.set_tag("mlflow.runName", "verify_data_quality")

        # Statistics of the incoming frame (all dtypes)
        mlflow.log_dict(df.describe(include='all').to_dict(), "describe_data_raw.json")

        pd_df_gx = gx.dataset.PandasDataset(df)

        failures = []
        for expectation, column, kwargs in _DATA_QUALITY_CHECKS:
            outcome = getattr(pd_df_gx, expectation)(column, **kwargs)
            if not outcome.success:
                failures.append(_describe_failure(expectation, column, kwargs, outcome))

        if failures:
            mlflow.log_dict({"failed_expectations": failures}, "failed_expectations.json")
            raise AssertionError(
                f"{len(failures)} data quality check(s) failed:\n  - " + "\n  - ".join(failures)
            )

        # No cleaning happens in this function: these are the numeric statistics
        # of the validated frame, logged under the original artifact name.
        mlflow.log_dict(df.describe().to_dict(), "stats_data_cleaned.json")

    logging.getLogger(__name__).info("Success")

    return "All data quality tests passed successfully."
        
    

In [5]:
# Validate the training features (df is read from 'X_train.csv').
unit_test(df)

'All data quality tests passed successfully.'

In [7]:
# Run the same checks on the validation features (df_test is read from 'X_val.csv').
unit_test(df_test)

AssertionError: 

In [35]:
# Show the current working directory, which is where relative file names resolve.
os.getcwd()

'/Users/antoniooliveira/Documents/GitHub/mlops_project_v2/hotel-california/data/01_raw'

In [24]:
import mlflow
import pandas as pd
import great_expectations as gx  # make sure this matches usage


In [25]:
# Load the y_* CSV files for the train and validation splits,
# resolved relative to the current working directory.
y_train = pd.read_csv(filepath_or_buffer='y_train.csv')
y_val = pd.read_csv(filepath_or_buffer='y_val.csv')

In [26]:
# NOTE(review): `unit_test_y` is not defined in any of the cells shown here, so
# this call depends on a definition elsewhere (or on leftover kernel state).
# The recorded NameError suggests it reads `mlruns_path` outside the scope
# where that name is assigned — confirm against the function's definition.
unit_test_y(y_val)

NameError: name 'mlruns_path' is not defined