# Purpose

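Explore the CMS 2010 Basic Stand Alone (BSA) Carrier Line Items Public Use File with Great Expectations: load the CSV, inspect a few categorical columns, generate a basic profiler expectation suite, and validate a batch of the data against it.

- Dataset: https://www.cms.gov/Research-Statistics-Data-and-Systems/Downloadable-Public-Use-Files/BSAPUFS/Carrier_Line_Items
- Data dictionary: https://www.cms.gov/Research-Statistics-Data-and-Systems/Downloadable-Public-Use-Files/BSAPUFS/Downloads/2010_Carrier_Data_Dictionary.pdf

- Great Expectations how-to guide (quickly exploring data using expectations in a notebook): https://docs.greatexpectations.io/en/latest/how_to_guides/creating_and_editing_expectations/how_to_quickly_explore_data_using_expectations_in_a_notebook.html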

In [1]:
# Initialize

In [20]:
%run jpinit.ipynb

import great_expectations as ge
import datetime

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load Data

In [23]:
# Load the 2010 carrier line-item CSV through Great Expectations' CSV reader.
# NOTE(review): the path is absolute and machine-specific; adjust it when running elsewhere.
carrier_puf_csv = '/Users/bkraft/projects/data_integrity/data/medicare/bsa_carrier_line/2010_BSA_Carrier_PUF.csv'
df_raw = ge.read_csv(carrier_puf_csv)

In [49]:
# Share of rows per distinct CAR_LINE_HCPCS_CD value (normalize=True gives proportions, not counts).
df_raw['CAR_LINE_HCPCS_CD'].value_counts(normalize=True)

99214    3.769765e-02
99213    3.721579e-02
A0425    1.762883e-02
88305    1.628285e-02
99203    1.618076e-02
             ...     
45020    3.569312e-07
27403    3.569312e-07
51992    3.569312e-07
50785    3.569312e-07
93024    3.569312e-07
Name: CAR_LINE_HCPCS_CD, Length: 4900, dtype: float64

In [47]:
# Share of rows per distinct CAR_LINE_BETOS_CD value (normalize=True gives proportions, not counts).
df_raw['CAR_LINE_BETOS_CD'].value_counts(normalize=True)

M1B    1.050716e-01
T1H    9.279213e-02
P6C    4.518821e-02
M2B    4.292527e-02
M1A    4.194799e-02
           ...     
P5D    9.280212e-05
P2C    7.674022e-05
P9B    1.177873e-05
O1F    6.424762e-06
M5A    3.569312e-07
Name: CAR_LINE_BETOS_CD, Length: 98, dtype: float64

In [48]:
# Share of rows per distinct CAR_LINE_PRVDR_TYPE_CD value (normalize=True gives proportions, not counts).
df_raw['CAR_LINE_PRVDR_TYPE_CD'].value_counts(normalize=True)

1    0.721387
0    0.107709
5    0.104827
7    0.038281
3    0.027411
8    0.000385
Name: CAR_LINE_PRVDR_TYPE_CD, dtype: float64

# Pandas Profiler

In [5]:
from great_expectations import profile as p

In [24]:
# Create the Great Expectations DataContext that later cells use to fetch suites,
# build batches, and run the validation operator.
# NOTE(review): no project directory is passed — presumably it is discovered from the
# working directory; confirm before running from another location.
context = ge.data_context.DataContext()

In [26]:
# Look up the '2010_BSA_Carrier_PUF.warning' suite. The return value is discarded and the
# trailing ';' suppresses the cell output.
# NOTE(review): presumably just a check that the suite can be loaded — confirm it is still needed.
context.get_expectation_suite('2010_BSA_Carrier_PUF.warning');

### Generate Basic Profile Suite

In [27]:
# Profile a copy so the profiler is handed `dataset` rather than df_raw itself.
dataset = df_raw.copy()

# profile() returns the generated expectation suite together with a validation result.
suite, validation_result = p.BasicSuiteBuilderProfiler().profile(dataset)
# NOTE(review): the suite is named 'basic_profiler' here, but a later cell fetches
# '2010_BSA_Carrier_PUF.basic_profiler' from the context — confirm the two names are
# meant to differ.
suite.expectation_suite_name = 'basic_profiler'

In [28]:
# Validate the profiled dataset against the generated suite; the trailing ';' suppresses
# the cell output.
dataset.validate(expectation_suite=suite);

In [31]:
# Write the dataset's expectation suite to basic_profiler.json under the project's
# expectations/2010_BSA_Carrier_PUF directory.
# NOTE(review): the path is absolute and machine-specific; adjust it when running elsewhere.
dataset.save_expectation_suite(filepath='/Users/bkraft/projects/data_integrity/git/great_expectations/expectations/2010_BSA_Carrier_PUF/basic_profiler.json')

# Create batch_kwargs

In [32]:
# Batch kwargs identifying the carrier line-item CSV: the asset name, the datasource it
# belongs to, and the file path.
# NOTE(review): the path is absolute and machine-specific; adjust it when running elsewhere.
batch_kwargs = dict(
    data_asset_name="2010_BSA_Carrier_PUF.csv",
    datasource="bsa_carrier_line__dir",
    path="/Users/bkraft/projects/data_integrity/data/medicare/bsa_carrier_line/2010_BSA_Carrier_PUF.csv",
)

In [36]:
# Show the expectation suite names available from the context.
context.list_expectation_suite_names()



In [37]:
# Fetch the '2010_BSA_Carrier_PUF.basic_profiler' suite from the context.
# NOTE(review): presumably this is the suite saved to basic_profiler.json earlier — confirm.
suite_basic_profile = context.get_expectation_suite('2010_BSA_Carrier_PUF.basic_profiler')

In [38]:
# Build a batch from the CSV described by batch_kwargs, paired with the basic-profile suite.
batch = context.get_batch(batch_kwargs, suite_basic_profile)

In [39]:
# Validate the batch against the suite it was created with, requesting the 'COMPLETE'
# result format. This rebinds `validation_result` from the profiling cell.
validation_result = batch.validate(result_format='COMPLETE')

# Take the UTC timestamp once and reuse it, so `now` and the recorded run_time are the
# same instant (previously `now` was computed but never used).
now = datetime.datetime.now(datetime.timezone.utc)

# Identifier passed to the validation operator for this run.
run_id = {
  "run_name": "basic profile",  # insert your own run_name here
  "run_time": now
}

In [40]:
# Run the "action_list_operator" validation operator on the batch under the given run_id.
results = context.run_validation_operator(
    "action_list_operator",
    assets_to_validate=[batch],
    run_id=run_id,
)

# Keep the identifier of the first validation result the operator produced.
validation_result_identifier = results.list_validation_result_identifiers()[0]