In [0]:
-- =========================================================
-- This script performs the data-quality checks and previews the transformations that make the data more suitable for use.
-- =========================================================

-- ====================
-- crm_cust_info Table
-- ====================
-- Primary-key check on cst_id: list ids that occur more than once, plus the
-- NULL group (a single NULL id is not a duplicate, so it needs its own test).
-- Expectation: No Result
select
    cst_id,
    count(1) as cnt
from datawarehouse.silver.crm_cust_info
group by cst_id
having count(1) > 1 or cst_id is null;


-- Find first or last names that carry leading or trailing whitespace.
select
    cst_firstname,
    cst_lastname
from datawarehouse.silver.crm_cust_info
where cst_firstname <> trim(cst_firstname)
   or cst_lastname <> trim(cst_lastname);


-- List every distinct gender value currently stored.
select cst_gndr
from datawarehouse.silver.crm_cust_info
group by cst_gndr;

-- List every distinct marital-status value currently stored.
select cst_marital_status
from datawarehouse.silver.crm_cust_info
group by cst_marital_status;




-- =========================================
-- crm_prd_info Table
-- =========================================
-- Primary-key check on prd_id: duplicated ids and the NULL id group.
-- Expectation: No Result
select
    prd_id,
    count(1)
from datawarehouse.silver.crm_prd_info
group by prd_id
having prd_id is null
    or count(1) > 1;


-- Product names with leading or trailing whitespace.
select prd_nm
from datawarehouse.silver.crm_prd_info
where prd_nm <> trim(prd_nm);


-- Product costs that are missing or negative.
select prd_cost
from datawarehouse.silver.crm_prd_info
where prd_cost < 0
   or prd_cost is null;


-- Data standardization and consistency: list every distinct product line value.
select prd_line
from datawarehouse.silver.crm_prd_info
group by prd_line;

-- Date ranges where the end date precedes the start date.
select
    prd_end_dt,
    prd_start_dt
from datawarehouse.silver.crm_prd_info
where prd_start_dt > prd_end_dt;




-- =========================================
-- crm_sales_details Table
-- =========================================
-- Order numbers that are NULL or padded with whitespace.
select sls_ord_num
from datawarehouse.silver.crm_sales_details
where sls_ord_num is null
   or trim(sls_ord_num) <> sls_ord_num;

-- Customer ids that are NULL or differ from their trimmed form.
select sls_cust_id
from datawarehouse.silver.crm_sales_details
where sls_cust_id is null
   or trim(sls_cust_id) <> sls_cust_id;

-- Order dates that are NULL or whose text form is not 10 characters long.
-- NOTE(review): the 10-character test presumes a 'yyyy-MM-dd' rendering, while
-- other checks in this script parse the same column with 'yyyyMMdd' -- confirm
-- which representation the silver table actually stores.
select sls_order_dt
from datawarehouse.silver.crm_sales_details
where sls_order_dt is null
   or len(sls_order_dt) <> 10;

-- Orders whose dates are out of sequence: order after ship, order after due,
-- or ship after due. Each column is parsed once with the 'yyyyMMdd' pattern.
with parsed_dates as (
    select
        sls_order_dt,
        sls_ship_dt,
        try_to_date(sls_order_dt, 'yyyyMMdd') as order_dt,
        try_to_date(sls_ship_dt, 'yyyyMMdd') as ship_dt,
        try_to_date(sls_due_dt, 'yyyyMMdd') as due_dt
    from datawarehouse.silver.crm_sales_details
)
select
    sls_order_dt,
    sls_ship_dt
from parsed_dates
where order_dt > ship_dt
   or order_dt > due_dt
   or ship_dt > due_dt;


-- Sales, quantity, and price must all be present and strictly positive;
-- return the rows where any of the three is NULL, zero, or negative.
select
    sls_sales,
    sls_quantity,
    sls_price
from datawarehouse.silver.crm_sales_details
where (sls_sales is null or sls_sales <= 0)
   or (sls_quantity is null or sls_quantity <= 0)
   or (sls_price is null or sls_price <= 0)
order by
    sls_sales,
    sls_quantity,
    sls_price;




-- =========================================
-- erp_cust_az12 Table
-- =========================================
-- ERP customers whose cid has no matching cst_key in crm_cust_info (for
-- example because of a leftover 'NAS' prefix -- TODO confirm prefix handling).
-- NOT EXISTS is used instead of NOT IN: with NOT IN, a single NULL cst_key
-- makes the predicate unknown for every row and the check silently returns
-- nothing. Rows with a NULL cid are also surfaced here.
select distinct e.*
from datawarehouse.silver.erp_cust_az12 as e
where not exists (
    select 1
    from datawarehouse.silver.crm_cust_info as c
    where c.cst_key = e.cid
);


-- Birth dates that are NULL, whitespace-padded, not 10 characters long,
-- or later than today.
select bdate
from datawarehouse.silver.erp_cust_az12
where bdate is null
   or trim(bdate) <> bdate
   or len(bdate) <> 10
   or bdate > current_date()
group by bdate;

-- Preview the gender normalization: each raw gen value next to its
-- standardized label ('Female', 'Male', or 'n/a' for anything else).
select distinct
    gen as gen_old,
    case upper(trim(gen))
        when 'F' then 'Female'
        when 'FEMALE' then 'Female'
        when 'M' then 'Male'
        when 'MALE' then 'Male'
        else 'n/a'
    end as gen
from datawarehouse.silver.erp_cust_az12;




-- =========================================
-- erp_loc_a101 Table
-- =========================================
-- Location rows whose cid, once dashes are stripped, has no matching cst_key
-- in crm_cust_info. Shows the raw value (cid_old) beside the cleaned one (cid).
-- NOT EXISTS is used instead of NOT IN: with NOT IN, a single NULL cst_key
-- makes the predicate unknown for every row and the check silently returns
-- nothing. Rows with a NULL cid are also surfaced here.
select distinct
    l.cid as cid_old,
    replace(l.cid, '-', '') as cid
from datawarehouse.silver.erp_loc_a101 as l
where not exists (
    select 1
    from datawarehouse.silver.crm_cust_info as c
    where c.cst_key = replace(l.cid, '-', '')
);



-- Preview the country normalization: each raw cntry value next to the
-- standardized name it maps to ('n/a' when no mapping applies).
select distinct
    cntry as cntry_old,
    case upper(trim(cntry))
        when 'US' then 'USA'
        when 'USA' then 'USA'
        when 'UNITED STATES' then 'USA'
        when 'CA' then 'Canada'
        when 'CANADA' then 'Canada'
        when 'DE' then 'Germany'
        when 'GERMANY' then 'Germany'
        when 'AU' then 'Australia'
        when 'AUSTRALIA' then 'Australia'
        when 'FR' then 'France'
        when 'FRANCE' then 'France'
        when 'UK' then 'United Kingdom'
        when 'UNITED KINGDOM' then 'United Kingdom'
        else 'n/a'
    end as cntry
from datawarehouse.silver.erp_loc_a101;
