## KNA1 Data Quality

**Created By**: Maddie Johnson<br>
 **Edited By**: Maddie Johnson<br>
 **Created on**: July 17, 2023<br>
 **Edited on**: July 19, 2023<br>

## Notebook Config

In [1]:
# Display settings
## Auto reload modules & inline plots
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Package Import and Initialization

In [2]:
import os                            #For work with native operating system and directories
from pathlib import Path             #For working with file paths and directories
from google.cloud import bigquery          #The BigQuery API
import datetime as dt #For inputing any date or time information
from datetime import timezone #For inputing timezones
import matplotlib.pyplot as plt      #For plotting
import numpy as np                         #For scientific computation
import pydata_google_auth                  #For authentication against Google 
import pandas as pd                        #For data manipulation and bgq --> pandas conversion
import pandas_gbq as pd_gbq #Import BigQuery data to create a pandas dataframe
import plotly.graph_objects as go # interactive plots
import pytz #For timezone calculations
import seaborn as sns                #For plotting
import warnings


import os
import sys
PROJECT_ROOT = os.path.abspath(os.path.join(
    os.getcwd(),
    os.pardir)
)
if (PROJECT_ROOT not in sys.path):
    sys.path.append(PROJECT_ROOT)

In [3]:
import utils as ut

In [4]:
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.4f}'.format
warnings.filterwarnings('ignore')

## Initialize BigQuery Connection

In [5]:
## Actual authentication step - Authentication token is saved on server running Jupyter (pydata_google_auth may not work in Vertex AI unless there is a token generated)
credentials = pydata_google_auth.get_user_credentials(['https://www.googleapis.com/auth/bigquery'])

In [6]:
## Update project ID depending on which datasets you are needing to access
project_id = 'edr-gfssdm-pr-cah'

## Set Data Query Logic

In the SQL query make sure to only include columns that need to be analyzed from the table where data quality is being checked.

In [7]:
table_query = """
SELECT DISTINCT
    KUNNR,
    NAME1,
    NAME2,
    STRAS,
    ORT01,
    PSTLZ,
    REGIO,
    LAND1,
    KTOKD,
    BRSCH,
    LOEVM
    
    
    
FROM
  `edna-data-pr-cah.VI0_PHM_ORP_PE1_PH1_NP.KNA1_CV`
LIMIT 
    100
    
    """

## Load Data

In [8]:
## Pandas read_gbq to read SQL query above and put into a DataFrame
table_df = pd.read_gbq(table_query, project_id=project_id, dialect='standard', credentials=credentials)

In [9]:
table_df

Unnamed: 0,KUNNR,NAME1,NAME2,STRAS,ORT01,PSTLZ,REGIO,LAND1,KTOKD,BRSCH,LOEVM
0,2150417241,MISSION DRUG LLC 340B,,110 N MAIN ST,SAINT IGNATIUS,59865-9031,MT,US,ZPSH,3401,
1,3000045942,APPLETON HEART INSTITUTE,,1818 N MEADE ST,APPLETON,54911,WI,US,Z004,,
2,2100061672,BAMF HEALTH INC,,109 MICHIGAN ST NW,GRAND RAPIDS,49503-3790,MI,US,Z002,,
3,3000178072,LEE COMMUNITY HEALTHCARE INC,,13279 N CLEVELAND AVE,NORTH FORT MYERS,33903,FL,US,Z004,,
4,4000138780,OHIO DEPARTMENT OF MENTAL HEALTH,,4000 SURFACE RD,COLUMBUS,43228,OH,US,Z003,,
...,...,...,...,...,...,...,...,...,...,...,...
95,7007308103,KAUFMAN PHARMACY,LOTUS RX INC,70000 CARDINAL PL,DUBLIN,43017,OH,US,ZRNI,OWN,
96,4000106028,SOUTHERN CALIFORNIA HOSPITAL AT VAN,,14433 EMELITA ST,VAN NUYS,91401,CA,US,Z003,,
97,2100057682,DIGIRAD IMAG SOL COLONIA HEIGHTS,,445 CHARLES H DIMMOCK PKWY,COLONIAL HEIGHTS,23834-2990,VA,US,Z002,,
98,H00095463,CREEK VALLEY HLTH CLNC,,20 N COLVIN ST 0418,COLORADO CITY,86021-6155,AZ,US,ZP12,OWN,


## Data Analysis

### Helper Functions

Do not change helper functions, these can be run regardless of table

#### Completeness function

#### Uniqueness function

### Critical Data Elements

In [10]:
## list of critical data elements
crt_dt_elem = ['KUNNR','NAME1','STRAS','ORT01','PSTLZ','REGIO','LAND1','KTOKD']

In [11]:
## list of data elements that need to undergoe uniqueness check
nqnss_dt_elem = ['KUNNR']

### Data Metrics

#### Matrix creation

default base matrix to use for data quality scoring equation

In [12]:
child = ut.MatrixConsolidation(table_df, crt_dt_elem, nqnss_dt_elem,"KNA1)

SyntaxError: unterminated string literal (detected at line 1) (1580207853.py, line 1)

##### Missing Data Matrix

In [None]:
child.def_cmplt_df()

##### Uniqueness Data Matrix

In [None]:
child.def_uniq_df()

#### Matrix Consolidation

In [None]:
child.dq_df()

In [None]:
child.dataset_score()

### Data Visualization

In [None]:
## dataset score card
import plotly.graph_objects as go

fig_card = go.Figure(go.Indicator(
    mode = "number",
    value = dataset_score.KNA1_SCORE.iloc[0],
    number = {'suffix': "%"},
    title = {"text": "Dataset Score<br><span style='font-size:0.8em;color:gray'>KNA1</span>"},
    ))

fig_card.update_layout(paper_bgcolor = "lightgray")

fig_card.show()

In [None]:
import plotly.express as px

fig = px.imshow(dq_df,
                labels=dict(x="Column", y="Record Number", color="Score"),
                text_auto=True, 
                zmin=0, 
                zmax=1, 
                color_continuous_scale=["red", "yellow", "green"])

fig.update_layout(
    title='KNA1 Heatmap',
    title_x=0.5)

fig.show()