# Data Quality con Great Expectations

In [None]:
#! pip install great_expectations

In [None]:
import os
import sys
import pandas as pd
import datetime
from datetime import date
import re
from ruamel import yaml
from ruamel.yaml import YAML
sys.path.append('../../')

## Ge
import great_expectations as gx
import great_expectations as gx
import great_expectations.jupyter_ux
from great_expectations.cli.datasource import sanitize_yaml_and_save_datasource, check_if_datasource_name_exists
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.data_context.types.resource_identifiers import ExpectationSuiteIdentifier
from great_expectations.exceptions import DataContextError
from great_expectations.checkpoint.types.checkpoint_result import CheckpointResult


from pprint import pprint




## Data Quality 

Dopo aver completato la fase di Data Ingestion tramite lo scraping, il secondo step è quello di valutare la qualità dei dati raccolti.  
Il dataframe studiato presenta 18 colonne. Ogni colonna presenta delle differenze dalle altre e ci si aspetta determinati valori da queste.

Dopo aver settato:
1. la datasource
2. suite (e le relative expectations)
3. il checkpoint

Si condurrà una validazione del sample

In [None]:
# Run parameters: the data asset to validate and the suite to use.
data_asset_name = 'scraping_all_20230216'
expectation_suite_name = 'Main'

# Folder holding the scraped CSV files, and the file derived from the asset name.
path = '../data'
file_name = data_asset_name + '.csv'
print(file_name)

# Load the sample; its column names feed the not-null expectations later on.
df_append = pd.read_csv('/'.join([path, file_name]))

## Istruzioni operative
È necessario dichiarare il data_asset_name e la expectation_suite_name che si vogliono utilizzare

In [None]:
# Obtain the Great Expectations Data Context used by the datasource cells that follow.
context = gx.get_context()

## Configurazione della datasource 

In [None]:
# Datasource definition: a pandas execution engine plus a single data
# connector named "all". The connector's regex captures the part of each
# file name before ".csv" into the group "data_asset_name".
# (A batch_spec_passthrough block with reader options was left disabled by
# the author and is not part of the configuration.)
datasource_config: dict = dict(
    name="glassdoor_scraping",
    class_name="Datasource",
    module_name="great_expectations.datasource",
    execution_engine=dict(
        class_name="PandasExecutionEngine",
        module_name="great_expectations.execution_engine",
    ),
    data_connectors=dict(
        all=dict(
            class_name="InferredAssetFilesystemDataConnector",
            base_directory="../data",
            default_regex=dict(
                pattern="(.*)\\.csv",
                group_names=["data_asset_name"],
            ),
        ),
    ),
)

In [None]:
#check configurazione
context.test_yaml_config(yaml.dump(datasource_config))

In [None]:
## try to add
try:
    context.get_datasource(datasource_config["name"])
except ValueError:
    context.add_datasource(**datasource_config)
else:
    print(
        f"The datasource {datasource_config['name']} already exists in your Data Context!"
    )

In [None]:
# context.add_datasource(**datasource_config) # usa questo comando per sovrascrive la data source 

In [None]:
# Show the datasources currently configured in the context (displayed as the cell output).
context.list_datasources()

## Creazione della Suite
Dopo aver configurato la data source creiamo la suite e le aspettative.  
[Galleria delle expectations](https://greatexpectations.io/expectations/)

In [None]:
# Fetch the Data Context again for the suite section.
context = gx.get_context()

# Reuse the suite called `expectation_suite_name` when the context already
# has it; a DataContextError from the lookup triggers creation instead.
try:
    suite = context.get_expectation_suite(expectation_suite_name=expectation_suite_name)
except DataContextError:
    suite = context.create_expectation_suite(expectation_suite_name=expectation_suite_name)
    print(f'Created ExpectationSuite "{suite.expectation_suite_name}".')
else:
    print(f'Loaded ExpectationSuite "{suite.expectation_suite_name}" containing {len(suite.expectations)} expectations.')

## 1 Table Expectation

#### Questa Expectation verifica che le colonne nella tabella abbiano questo ordine
- company
- job_title
- location
- company_rating
- job_age
- job_link
- run_date
- oppurtunita_carriera
- stipendio_e_benefit
- cultura_e_valori
- dirigenti_senior
- equilibrio_lavoro_vita_privata
- sede_centrale
- dimensioni
- tipo
- entrate
- fondata_nel
- settore
- segmento
- scraping_date

In [None]:
# Create an Expectation
expectation_configuration = ExpectationConfiguration(
   # Name of expectation type being added
   expectation_type="expect_table_columns_to_match_ordered_list",
   # These are the arguments of the expectation
   # The keys allowed in the dictionary are Parameters and
   # Keyword Arguments of this Expectation Type
   kwargs={
      "column_list": ['company',
                     'job_title',
                     'location',
                     'company_rating',
                     'job_age',
                     'job_link',
                     'run_date',
                     'oppurtunita_carriera',
                     'stipendio_e_benefit',
                     'cultura_e_valori',
                     'dirigenti_senior',
                     'equilibrio_lavoro_vita_privata',
                     'sede_centrale',
                     'dimensioni',
                     'tipo',
                     'entrate',
                     'fondata_nel',
                     'settore',
                     'segmento',
                     'scraping_date']
   },
   # This is how you can optionally add a comment about this expectation.
   # It will be rendered in Data Docs.
   # See this guide for details:
   # `How to add comments to Expectations and display them in Data Docs`.
   meta={
      "notes": {
         "format": "markdown",
         "content": ""
      }
   }
)
# Add the Expectation to the suite
suite.add_expectation(expectation_configuration=expectation_configuration)

## 2. Column Expectation

### Company

##### company  
    Questa Expectation verifica che la colonna sia di tipo stringa

In [None]:
# Column `company`: values must be of type 'str'.
expectation_configuration = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_in_type_list",
    kwargs={"column": "company", "type_list": ['str']},
)
suite.add_expectation(expectation_configuration=expectation_configuration)

### Job Title

##### job_title  
    Questa Expectation verifica che la colonna sia di tipo stringa

In [None]:
# Column `job_title`: values must be of type 'str'.
expectation_configuration = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_in_type_list",
    kwargs={"column": "job_title", "type_list": ['str']},
)
suite.add_expectation(expectation_configuration=expectation_configuration)

### Tutte le colonne

##### All_columns
    Questa Expectation verifica che le colonne non siano mai nulle

In [None]:
# One not-null expectation for every column found in the loaded sample.
columns = [*df_append.columns]
for column_name in columns:
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": column_name},
    )
    suite.add_expectation(expectation_configuration=expectation_configuration)

### Colonne di tipo stringa

##### string_columns
    Questa Expectation verifica che le colonne di tipo stringa siano stringhe

In [None]:
# Columns whose values are expected to be of type 'str'.
string_columns = [
    'company', 'job_title', 'location', 'job_age', 'job_link',
    'sede_centrale', 'dimensioni', 'tipo', 'settore', 'segmento', 'entrate',
]

# Add the same type expectation for each of them.
for column_name in string_columns:
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_type_list",
        kwargs={"column": column_name, "type_list": ['str']},
    )
    suite.add_expectation(expectation_configuration=expectation_configuration)

### Colonne di tipo float

##### float_columns
    Questa Expectation verifica che le colonne di tipo float siano float

In [None]:
# Columns whose values are expected to be of type 'float'.
columns_float = [
    'company_rating', 'oppurtunita_carriera', 'stipendio_e_benefit',
    'cultura_e_valori', 'dirigenti_senior', 'equilibrio_lavoro_vita_privata',
]

# Add the same type expectation for each of them.
for column_name in columns_float:
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_type_list",
        kwargs={"column": column_name, "type_list": ['float']},
    )
    suite.add_expectation(expectation_configuration=expectation_configuration)

##### float_columns
    Questa Expectation verifica che il valore minimo di ciascuna colonna di tipo float sia maggiore o uguale a zero

In [None]:
# For each float column (list defined in the previous code cell), expect the
# column minimum to have a lower bound of 0; no upper bound is set.
for column_name in columns_float:
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={"column": column_name, "min_value": 0},
    )
    suite.add_expectation(expectation_configuration=expectation_configuration)

### Colonne dei rating
Le colonne "rating" sono le seguenti:
- company_rating
- oppurtunita_carriera 
- stipendio_e_benefit
- cultura_e_valori 
- dirigenti_senior
- equilibrio_lavoro_vita_privata

##### rating_columns
    Questa Expectation verifica che i valori delle colonne rating siano compresi tra 0 e 5

In [None]:
# Rating columns: every value is expected to lie between 0.0 and 5.0.
rating_columns = [
            'company_rating', 'oppurtunita_carriera', 'stipendio_e_benefit',
           'cultura_e_valori', 'dirigenti_senior',
           'equilibrio_lavoro_vita_privata']
# Iterate the list defined in this cell. The loop previously ran over
# `columns_float` from an earlier cell, which left `rating_columns` unused and
# made this cell depend on that other cell having been executed.
for e in rating_columns:

    expectation_configuration = ExpectationConfiguration(
       expectation_type="expect_column_values_to_be_between",
       kwargs={
            "column": e,
            "min_value":0.0,
            "max_value":5.0
       },
    )
    suite.add_expectation(expectation_configuration=expectation_configuration)

### fondata_nel

##### fondata_nel
    Questa Expectation verifica che la colonna fondata_nel rispetti la regex [0-9]{4}

In [None]:
# Column `fondata_nel`: values must match the regex [0-9]{4}.
# NOTE(review): the pattern is not anchored, so it may also accept longer
# values that merely contain four digits -- confirm whether "^[0-9]{4}$" is intended.
expectation_configuration = ExpectationConfiguration(
    expectation_type="expect_column_values_to_match_regex",
    kwargs={"column": 'fondata_nel', "regex": "[0-9]{4}"},
)
suite.add_expectation(expectation_configuration=expectation_configuration)

### scraping_date

##### scraping_date
    Questa Expectation verifica che i valori della colonna scraping_date siano di tipo stringa

In [None]:
# Column `scraping_date`: values must be of type 'str'.
expectation_configuration = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_in_type_list",
    kwargs={"column": 'scraping_date', "type_list": ['str']},
)
suite.add_expectation(expectation_configuration=expectation_configuration)

##### scraping_date
    Questa Expectation verifica che la colonna scraping_date rispetti il formato %Y-%m-%d



In [None]:
# Column `scraping_date`: values must follow the strftime format %Y-%m-%d.
expectation_configuration = ExpectationConfiguration(
    expectation_type="expect_column_values_to_match_strftime_format",
    kwargs={"column": 'scraping_date', "strftime_format": "%Y-%m-%d"},
)
suite.add_expectation(expectation_configuration=expectation_configuration)

### Salvataggio delle suite e creazione della documentazione
Rimuovere il commento dalla prima riga per stampare la suite letta dal context prima del salvataggio.

In [None]:
# Optional: uncomment to print the suite as currently returned by the context.
# print(context.get_expectation_suite(expectation_suite_name=expectation_suite_name))
# Persist the in-memory suite under `expectation_suite_name`.
context.save_expectation_suite(expectation_suite=suite, expectation_suite_name=expectation_suite_name)

# Build the Data Docs for this suite only, then open them.
suite_identifier = ExpectationSuiteIdentifier(expectation_suite_name=expectation_suite_name)
context.build_data_docs(resource_identifiers=[suite_identifier])
context.open_data_docs(resource_identifier=suite_identifier)

### Apri questo link per vedere le expectations create

In [None]:
# Print the URL of the suite's expectations page.
print(
    'http://localhost:9000/view/great_expectations/uncommitted/data_docs/local_site/expectations/',
    expectation_suite_name,
    '.html',
    sep='',
)

Se non si sta utilizzando il notebook dal docker si può arrivare al sito tramite il percorso 'great_expectations\uncommitted\data_docs\local_site'

## Creazione di un checkpoint

In [None]:
## checkpoint
yaml = YAML()

In [None]:
# Name under which the checkpoint is registered and later run.
my_checkpoint_name = "main_checkpoint" # This was populated from your CLI command.

# Checkpoint definition. The datasource and suite names are taken from
# `datasource_config` and `expectation_suite_name` instead of being
# hard-coded, so changing them in the configuration cells is enough.
# `index: -1` under data_connector_query is kept as in the original.
yaml_config = f"""
name: {my_checkpoint_name}
config_version: 1.0
class_name: SimpleCheckpoint
run_name_template: "%Y%m%d-%H%M%S-{data_asset_name}"
validations:
  - batch_request:
      datasource_name: {datasource_config['name']}
      data_connector_name: all
      data_asset_name: {data_asset_name}
      data_connector_query:
        index: -1
    expectation_suite_name: {expectation_suite_name}
"""

In [None]:
# Run this cell to print out the names of your Datasources, Data Connectors and Data Assets
pprint(context.get_available_data_asset_names())

In [None]:
# Show the expectation suite names known to the context (displayed as the cell output).
context.list_expectation_suite_names()

In [None]:
# Test the checkpoint YAML against the context and keep whatever test_yaml_config returns.
main_checkpoint = context.test_yaml_config(yaml_config=yaml_config)

In [None]:
#print(main_checkpoint.get_config(mode="yaml"))


In [None]:
# Parse the YAML text with the `YAML()` instance bound to `yaml` and register
# (or update) the checkpoint from the resulting mapping.
context.add_or_update_checkpoint(**yaml.load(yaml_config))
# Show the checkpoints now known to the context (displayed as the cell output).
context.list_checkpoints()

[Come aggiungere dati o suite di convalida a un Checkpoint](https://docs.greatexpectations.io/docs/guides/validation/checkpoints/how_to_add_validations_data_or_suites_to_a_checkpoint)

In [None]:
# Run the checkpoint without aborting the notebook if it fails.
# The names come from the variables set earlier (`my_checkpoint_name`,
# `expectation_suite_name`) rather than repeated literals.
try:
    results = context.run_checkpoint(
        checkpoint_name=my_checkpoint_name,
        expectation_suite_name=expectation_suite_name,
    )
except Exception as exc:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt
    # still stops the cell, and report the cause instead of hiding it.
    print('ko')
    print(f'{type(exc).__name__}: {exc}')

In [None]:
# Run the checkpoint (unguarded: any error surfaces here) and keep the result.
# Uses `my_checkpoint_name` and `expectation_suite_name` so the run stays
# consistent with the paths later built from `expectation_suite_name`.
results = context.run_checkpoint(
    checkpoint_name=my_checkpoint_name,
    expectation_suite_name=expectation_suite_name,
)


In [None]:
# A questo punto la validazione è completata, puoi consultare i risultati

In [None]:
# Extract "<run name>/<run time>/<32-char id>" from the text form of the first
# key in results['run_results'].
# Raw strings avoid the invalid "\S" escape, the dot before the microseconds
# is now literal, and the asset name is escaped so it is matched verbatim.
_run_path_pattern = (
    r'[0-9]{8}-[0-9]{6}-'
    + re.escape(data_asset_name)
    + r'/[0-9]{8}T[0-9]{6}\.[0-9]{6}Z/\S{32}'
)
_run_path_match = re.search(_run_path_pattern, str(list(results['run_results'])[0]))
if _run_path_match is None:
    # Fail with an explicit message instead of an IndexError on an empty list.
    raise ValueError(
        f'No validation run path for data asset {data_asset_name!r} found in the checkpoint results'
    )
a = _run_path_match.group(0)
# Relative location of the validation result, prefixed by the suite name.
b = expectation_suite_name + '/' + a
# NOTE: `path` is rebound here; earlier in the notebook it held the data folder.
path = r"../great_expectations/uncommitted/validations/" + b + '.json'

### Apri questo link per vedere le expectations create

In [None]:
# Print the URL of the suite's expectations page.
print(
    'http://localhost:9000/view/great_expectations/uncommitted/data_docs/local_site/expectations/',
    expectation_suite_name,
    '.html',
    sep='',
)

### Se non stai usando docker vai in questo path 

In [None]:
# Print the relative file path of the suite's expectations page.
print(
    "great_expectations/uncommitted/data_docs/local_site/expectations/",
    expectation_suite_name,
    '.html',
    sep='',
)

### Apri questo link per vedere i risultati


In [None]:
# Print the URL of the validation result page for the run identified by `b`.
print(
    "http://localhost:9000/view/great_expectations/uncommitted/data_docs/local_site/validations/",
    b,
    '.html',
    sep='',
)

### Se non stai usando docker vai in questo path 

In [None]:
# Print the relative file path of the validation result page for the run identified by `b`.
print(
    "great_expectations/uncommitted/data_docs/local_site/validations/",
    b,
    '.html',
    sep='',
)