# Projeto: Engenharia de Dados e Garantia de Qualidade no Conjunto de Dados do Airbnb no Rio de Janeiro

## Aquisição de Dados e Armazenamento de Dados em PostgreSQL - Camada Bronze 

Baixe o conjunto de dados "Inside Airbnb" do Rio de Janeiro da fonte oficial (http://insideairbnb.com/) e promova uma estruturação simples nos dados. Crie um banco de dados PostgreSQL para armazenar os dados brutos das 3 tabelas ("Listing", "Reviews" e Calendar") na camada "bronze".

In [91]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [92]:
from sqlalchemy import create_engine, text as sql_text
import pandas as pd
import datetime

In [93]:
%sql postgresql://postgres:ada1011@localhost/db_bronze

In [94]:
engine_bronze = create_engine('postgresql://postgres:ada1011@localhost/db_bronze')
engine_silver = create_engine('postgresql://postgres:ada1011@localhost/db_silver')
engine_gold = create_engine('postgresql://postgres:ada1011@localhost/db_gold')

In [5]:
df_listings = pd.read_csv("./data/listings.csv")
df_calendar = pd.read_csv("./data/calendar.csv")
df_reviews = pd.read_csv("./data/reviews.csv")

In [6]:
df_listings.to_sql('listings', engine_bronze, if_exists='replace', index=False)

136

In [7]:
df_calendar.to_sql('calendar', engine_bronze, if_exists='replace', index=False)

976

In [8]:
df_reviews.to_sql('reviews', engine_bronze, if_exists='replace', index=False)

307

## Data Clean - Camada Silver

In [45]:
query = """
SELECT *
FROM listings
"""

df_silver_listing = pd.read_sql(sql=sql_text(query), con=engine_bronze.connect())

In [46]:
# Trocando 't' e 'f' p/ True e False nas colunas booleanas 
df_silver_listing['host_is_superhost'] = df_silver_listing['host_is_superhost'].map({'t': True, 'f': False})
df_silver_listing['host_has_profile_pic'] = df_silver_listing['host_has_profile_pic'].map({'t': True, 'f': False})
df_silver_listing['host_identity_verified'] = df_silver_listing['host_identity_verified'].map({'t': True, 'f': False})
df_silver_listing['has_availability'] = df_silver_listing['has_availability'].map({'t': True, 'f': False})
df_silver_listing['instant_bookable'] = df_silver_listing['instant_bookable'].map({'t': True, 'f': False})
df_silver_listing['price'] = df_silver_listing['price'].str.replace('$', '').str.replace(',', '').astype(float)
df_silver_listing['reviews_per_month'] = df_silver_listing['reviews_per_month'].astype(float).fillna(0)

In [47]:
import sqlalchemy as sqlal

dict_dtype_listing = {'id': sqlal.types.BIGINT(),
'listing_url': sqlal.types.TEXT(),
'scrape_id': sqlal.types.BIGINT(),
'last_scraped': sqlal.types.DATE(),
'source': sqlal.types.TEXT(),
'name': sqlal.types.TEXT(),
'description': sqlal.types.TEXT(),
'neighborhood_overview': sqlal.types.TEXT(),
'picture_url': sqlal.types.TEXT(),
'host_id': sqlal.types.INTEGER(),
'host_url': sqlal.types.TEXT(),
'host_name': sqlal.types.TEXT(),
'host_since': sqlal.types.DATE(),
'host_location': sqlal.types.TEXT(),
'host_about': sqlal.types.TEXT(),
'host_response_time': sqlal.types.TEXT(),
'host_response_rate': sqlal.types.TEXT(),
'host_acceptance_rate': sqlal.types.TEXT(),
'host_is_superhost':sqlal.types.Boolean(),
'host_thumbnail_url': sqlal.types.TEXT(),
'host_picture_url': sqlal.types.TEXT(),
'host_neighbourhood': sqlal.types.TEXT(),
'host_listings_count': sqlal.types.TEXT(),
'host_total_listings_count': sqlal.types.TEXT(),
'host_verifications': sqlal.types.TEXT(),
'host_has_profile_pic':sqlal.types.Boolean(),
'host_identity_verified':sqlal.types.Boolean(),
'neighbourhood': sqlal.types.TEXT(),
'neighbourhood_cleansed': sqlal.types.TEXT(),
'neighbourhood_group_cleansed': sqlal.types.TEXT(),
'latitude': sqlal.types.FLOAT(),
'longitude': sqlal.types.FLOAT(),
'property_type': sqlal.types.TEXT(),
'room_type': sqlal.types.TEXT(),
'accommodates': sqlal.types.INTEGER(),
'bathrooms': sqlal.types.NUMERIC(),
'bathrooms_text': sqlal.types.TEXT(),
'bedrooms': sqlal.types.INTEGER(),
'beds': sqlal.types.INTEGER(),
'amenities': sqlal.types.JSON(),
'price': sqlal.types.TEXT(),
'minimum_nights': sqlal.types.INTEGER(),
'maximum_nights': sqlal.types.INTEGER(),
'minimum_minimum_nights': sqlal.types.INTEGER(),
'maximum_minimum_nights': sqlal.types.INTEGER(),
'minimum_maximum_nights': sqlal.types.INTEGER(),
'maximum_maximum_nights': sqlal.types.INTEGER(),
'minimum_nights_avg_ntm': sqlal.types.Numeric(),
'maximum_nights_avg_ntm': sqlal.types.Numeric(),
'calendar_updated': sqlal.types.DATE(),
'has_availability':sqlal.types.Boolean(),
'availability_30': sqlal.types.INTEGER(),
'availability_60': sqlal.types.INTEGER(),
'availability_90': sqlal.types.INTEGER(),
'availability_365': sqlal.types.INTEGER(),
'calendar_last_scraped': sqlal.types.DATE(),
'number_of_reviews': sqlal.types.INTEGER(),
'number_of_reviews_ltm': sqlal.types.INTEGER(),
'number_of_reviews_l30d': sqlal.types.INTEGER(),
'first_review': sqlal.types.DATE(),
'last_review': sqlal.types.DATE(),
'review_scores_rating': sqlal.types.TEXT(),
'review_scores_accuracy': sqlal.types.TEXT(),
'review_scores_cleanliness': sqlal.types.TEXT(),
'review_scores_checkin': sqlal.types.TEXT(),
'review_scores_communication': sqlal.types.TEXT(),
'review_scores_location': sqlal.types.TEXT(),
'review_scores_value': sqlal.types.TEXT(),
'license': sqlal.types.TEXT(),
'instant_bookable':sqlal.types.Boolean(),
'calculated_host_listings_count': sqlal.types.INTEGER(),
'calculated_host_listings_count_entire_homes': sqlal.types.INTEGER(),
'calculated_host_listings_count_private_rooms': sqlal.types.INTEGER(),
'calculated_host_listings_count_shared_rooms': sqlal.types.INTEGER(),
'reviews_per_month': sqlal.types.Numeric()
             }

In [48]:
df_silver_listing.to_sql('listing', engine_silver, if_exists='replace', index=False, dtype = dict_dtype_listing)

136

In [49]:
query = """
SELECT *
FROM reviews
"""

df_silver_reviews = pd.read_sql(sql=sql_text(query), con=engine_bronze.connect())

In [50]:
dict_dtype_reviews={
'listing_id': sqlal.types.BIGINT(),
'id':sqlal.types.BIGINT(),
'date':sqlal.types.DATE(),
'reviewer_id':sqlal.types.BIGINT(),
'reviewer_name':sqlal.types.TEXT(),
'comments':sqlal.types.TEXT()
}

In [52]:
df_silver_reviews['listing_id']=df_silver_reviews['listing_id'].astype(int).dropna()
df_silver_reviews['id']=df_silver_reviews['id'].astype(int).dropna()
df_silver_reviews['reviewer_id']=df_silver_reviews['reviewer_id'].astype(int).dropna()
df_silver_reviews['reviewer_name']=df_silver_reviews['reviewer_name'].astype(str).dropna()
df_silver_reviews['comments']=df_silver_reviews['comments'].astype(str).dropna()


In [53]:
df_silver_reviews.to_sql('review', engine_silver, if_exists='replace', index=False, dtype = dict_dtype_reviews)

307

In [54]:
query = """
SELECT *
FROM calendar
WHERE date >= '2023-07-25' AND date <= '2023-10-25'
"""

df_silver_calendar = pd.read_sql(sql=sql_text(query), con=engine_bronze.connect())

In [55]:
df_silver_calendar['available'] = df_silver_calendar['available'].map({'t': True, 'f': False})
df_silver_calendar['adjusted_price'] = df_silver_calendar['adjusted_price'].str.replace('$', '')
df_silver_calendar['price'] = df_silver_calendar['price'].str.replace('$', '')
df_silver_calendar['adjusted_price'] = df_silver_calendar['adjusted_price'].str.replace(',', '')
df_silver_calendar['price'] = df_silver_calendar['price'].str.replace(',', '')

df_silver_calendar.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,17878,2023-09-23,False,265.0,265.0,5.0,28.0
1,17878,2023-09-24,True,265.0,265.0,5.0,28.0
2,17878,2023-09-25,False,290.0,290.0,5.0,28.0
3,17878,2023-09-26,False,290.0,290.0,5.0,28.0
4,17878,2023-09-27,False,290.0,290.0,5.0,28.0


In [56]:
dict_dtype_calendar={
'listing_id': sqlal.types.BIGINT(),
'date':sqlal.types.DATE(),
'available':sqlal.types.BOOLEAN(),
'price':sqlal.types.FLOAT(),
'adjusted_price':sqlal.types.FLOAT(),
'minimum_nights':sqlal.types.INTEGER(),
'maximum_nights':sqlal.types.INTEGER()
}

In [57]:
df_silver_calendar.to_sql('calendar', engine_silver, if_exists='replace', index=False, dtype = dict_dtype_calendar)

715

In [59]:
import great_expectations as gx

context = gx.get_context()

gx_silver_listing = gx.from_pandas(df_silver_listing) #de um dataframe pandas para o formato do "great_expectations"

In [73]:
engine_silver_string = 'postgresql://postgres:ada1011@localhost/db_silver'
engine_gold_string = 'postgresql://postgres:ada1011@localhost/db_gold'

datasource = context.sources.add_postgres(
    name="ge_datasource", connection_string=engine_silver_string
)

DataContextError: Can not write the fluent datasource ge_datasource because a datasource of that name already exists in the data context.

In [95]:
context.list_datasources()

datasource = context.get_datasource("ge_datasource")

In [96]:
asset_name = "silver"
asset_table_name = "calendar"

table_asset = datasource.add_table_asset(name=asset_name, table_name=asset_table_name)

ValueError: "silver" already exists (all existing assets are silver)

In [97]:
context.list_datasources()

[{'type': 'postgres',
  'name': 'ge_datasource',
  'assets': [{'name': 'silver',
    'type': 'table',
    'order_by': [],
    'batch_metadata': {},
    'splitter': {'column_name': 'datetime',
     'method_name': 'split_on_year_and_month_and_day'},
    'table_name': 'calendar',
    'schema_name': None}],
  'connection_string': PostgresDsn('postgresql://postgres:ada1011@localhost/db_silver', )},
 {'type': 'postgres',
  'name': 'ge_datasource_silver',
  'connection_string': PostgresDsn('postgresql://postgres:ada1011@localhost/db_silver', )},
 {'type': 'postgres',
  'name': 'ge_datasource_gold',
  'connection_string': PostgresDsn('postgresql://postgres:ada1011@localhost/db_gold', )}]

In [98]:
my_datasource = context.get_datasource("ge_datasource") #Fonte de dados Postgres
my_table_asset = my_datasource.get_asset(asset_name="silver") #Asset da tabela silver
batch_request = my_table_asset.build_batch_request() #Resgata os dados do asset

In [99]:
context.add_or_update_expectation_suite("suite_silver")

{
  "expectation_suite_name": "suite_silver",
  "ge_cloud_id": null,
  "expectations": [],
  "data_asset_type": null,
  "meta": {
    "great_expectations_version": "0.17.23"
  }
}

In [83]:
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="suite_silver",
)
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,17878,2023-09-23,False,265.0,265.0,5,28
1,17878,2023-09-24,True,265.0,265.0,5,28
2,17878,2023-09-25,False,290.0,290.0,5,28
3,17878,2023-09-26,False,290.0,290.0,5,28
4,17878,2023-09-27,False,290.0,290.0,5,28


In [85]:
#expect id to be unique
validator.expect_column_values_to_be_unique(column="listing_id")
#avalability to be boolean
validator.expect_column_values_to_be_of_type(column="available",type_="boolean")
#price to be float and not null
validator.expect_column_values_to_be_of_type(column="price",type_="float")
validator.expect_column_values_to_not_be_null(column="price")
#adjusted_price to be float and not null
validator.expect_column_values_to_be_of_type(column="adjusted_price",type_="float")
validator.expect_column_values_to_not_be_null(column="adjusted_price")
#minimum_nights to be integer and not null
validator.expect_column_values_to_be_of_type(column="minimum_nights",type_="int")
validator.expect_column_values_to_not_be_null(column="minimum_nights")
#maximum_nights to be integer and not null
validator.expect_column_values_to_be_of_type(column="maximum_nights",type_="int")
validator.expect_column_values_to_not_be_null(column="maximum_nights")


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

No recognized sqlalchemy types in type_list for current dialect.


Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

No recognized sqlalchemy types in type_list for current dialect.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

No recognized sqlalchemy types in type_list for current dialect.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

No recognized sqlalchemy types in type_list for current dialect.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

No recognized sqlalchemy types in type_list for current dialect.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 1077715,
    "unexpected_count": 15,
    "unexpected_percent": 0.001391833648042386,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [86]:
validator.save_expectation_suite(discard_failed_expectations=False)

In [88]:
checkpoint = context.add_or_update_checkpoint(
    name="checkpoint_gold_filter",
    validator=validator
)

In [89]:
checkpoint_result = checkpoint.run()

Calculating Metrics:   0%|          | 0/36 [00:00<?, ?it/s]

No recognized sqlalchemy types in type_list for current dialect.
No recognized sqlalchemy types in type_list for current dialect.
No recognized sqlalchemy types in type_list for current dialect.
No recognized sqlalchemy types in type_list for current dialect.
No recognized sqlalchemy types in type_list for current dialect.


In [100]:
silver_asset = my_datasource.get_asset(asset_name="silver")
silver_asset.add_splitter_year_and_month_and_day(column_name="date") #Divide os dados em vários "batchs" ou amostras, por ano, mês e dia

TableAsset(name='silver', type='table', id=None, order_by=[], batch_metadata={}, splitter=SplitterYearAndMonthAndDay(column_name='date', method_name='split_on_year_and_month_and_day'), table_name='calendar', schema_name=None)

In [101]:
my_batch_request = silver_asset.build_batch_request()
batches = my_table_asset.get_batch_list_from_batch_request(my_batch_request) #Retorna uma lista com todos as amostras criadas com o divisor criado anteriormente
batches

[Batch(datasource=PostgresDatasource(type='postgres', name='ge_datasource', id=None, assets=[TableAsset(name='silver', type='table', id=None, order_by=[], batch_metadata={}, splitter=SplitterYearAndMonthAndDay(column_name='date', method_name='split_on_year_and_month_and_day'), table_name='calendar', schema_name=None)], connection_string=PostgresDsn('postgresql://postgres:ada1011@localhost/db_silver', ), create_temp_table=True, kwargs={}), data_asset=TableAsset(name='silver', type='table', id=None, order_by=[], batch_metadata={}, splitter=SplitterYearAndMonthAndDay(column_name='date', method_name='split_on_year_and_month_and_day'), table_name='calendar', schema_name=None), batch_request=BatchRequest(datasource_name='ge_datasource', data_asset_name='silver', options={'year': 2023, 'month': 10, 'day': 1}), data=<great_expectations.execution_engine.sqlalchemy_batch_data.SqlAlchemyBatchData object at 0x7fcb2a0fdc60>, id='ge_datasource-silver-year_2023-month_10-day_1', metadata={'year': 2023

In [102]:
data_assistant_result = context.assistants.onboarding.run(
    batch_request=my_batch_request)




Generating Expectations:   0%|          | 0/8 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/68 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/68 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/68 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/68 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/544 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/0 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/68 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/748 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/0 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/68 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/748 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/68 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/68 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/68 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/68 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/170 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/306 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/374 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/170 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/306 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/374 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/170 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/306 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/374 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/170 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/306 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/374 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/0 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/476 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/3 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/170 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/170 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/340 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/170 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/170 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/340 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/204 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/170 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/170 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/340 [00:00<?, ?it/s]

In [103]:
data_assistant_result.plot_metrics()

84 Metrics calculated, 29 Metric plots implemented
Use DataAssistantResult.metrics_by_domain to show all calculated Metrics


interactive(children=(Dropdown(description='Select Plot Type: ', layout=Layout(margin='0px', width='max-conten…



In [104]:
silver_asset = my_datasource.get_asset(asset_name="silver")
batch_request_silver = silver_asset.build_batch_request()

context.add_or_update_expectation_suite("suite_silver_multiple")

validator = context.get_validator(
    batch_request=batch_request_silver,
    expectation_suite_name="suite_silver_multiple",
)
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,17878,2023-09-30,False,290.0,290.0,5,28
1,25026,2023-09-30,True,293.0,293.0,2,60
2,35764,2023-09-30,False,192.0,192.0,3,15
3,48305,2023-09-30,False,3657.0,3657.0,2,89
4,48901,2023-09-30,True,788.0,788.0,3,750


In [105]:
validator.save_expectation_suite(discard_failed_expectations=False)

In [106]:

checkpoint = context.add_or_update_checkpoint(
    name="checkpoint_silver_multiple",
    validator=validator
)

In [107]:
checkpoint_result = checkpoint.run()

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]

Calculating Metrics: 0it [00:00, ?it/s]