# Step 2: Cleaning

In [4]:
SELECT TOP 10 * FROM stock_data;

company,date,close,volume,open,high,low
AAPL,2013-07-18,15.42,218632537,15.48,15.53,15.38
AAPL,2013-07-19,15.18,268548901,15.47,15.5,15.16
AAPL,2013-07-22,15.23,207648981,15.34,15.35,15.2
AAPL,2013-07-23,14.96,354477618,15.21,15.25,14.95
AAPL,2013-07-24,15.73,591624923,15.68,15.88,15.55
AAPL,2013-07-25,15.66,229432412,15.74,15.76,15.56
AAPL,2013-07-26,15.75,200082264,15.55,15.75,15.51
AAPL,2013-07-29,15.99,248025441,15.74,16.07,15.72
AAPL,2013-07-30,16.19,308960556,16.07,16.33,16.04
AAPL,2013-07-31,16.16,322499391,16.25,16.33,16.05


## A. Check for NULL values

In [5]:
-- Count the number of NULL values in each column
SELECT 
    SUM(CASE WHEN company IS NULL THEN 1 ELSE 0 END) AS company_nulls,
    SUM(CASE WHEN [date] IS NULL THEN 1 ELSE 0 END) AS date_nulls,
    SUM(CASE WHEN [close] IS NULL THEN 1 ELSE 0 END) AS close_nulls,
    SUM(CASE WHEN volume IS NULL THEN 1 ELSE 0 END) AS volume_nulls,
    SUM(CASE WHEN [open] IS NULL THEN 1 ELSE 0 END) AS open_nulls,
    SUM(CASE WHEN high IS NULL THEN 1 ELSE 0 END) AS high_nulls,
    SUM(CASE WHEN low IS NULL THEN 1 ELSE 0 END) AS low_nulls
FROM stock_data;

-- No NULL values! 

company_nulls,date_nulls,close_nulls,volume_nulls,open_nulls,high_nulls,low_nulls
0,0,0,0,0,0,0


## B. Check for data type consistency

In [6]:
-- Get the data type for each column
SELECT COLUMN_NAME, DATA_TYPE
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = 'stock_data';

COLUMN_NAME,DATA_TYPE
company,varchar
date,date
close,float
volume,int
open,float
high,float
low,float


In [7]:
-- Ensure all values in date column are in DATE format
SELECT *
FROM stock_data
WHERE TRY_CAST(date AS DATE) IS NULL;

-- Looks good! 

company,date,close,volume,open,high,low


In [9]:
-- Ensure all values in close, open, high, low columsn are in numeric format
SELECT *
FROM stock_data
WHERE TRY_CAST([close] AS FLOAT) IS NULL
   OR TRY_CAST(volume AS FLOAT) IS NULL
   OR TRY_CAST([open] AS FLOAT) IS NULL
   OR TRY_CAST(high AS FLOAT) IS NULL
   OR TRY_CAST(low AS FLOAT) IS NULL;

-- Looks good! 

company,date,close,volume,open,high,low


## C. Check for outliers

### i. In the close column

In [13]:
-- Return all rows with outliers

WITH Percentiles AS (
    SELECT
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [close]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [close]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT *
FROM stock_data, Bounds
WHERE [close] < Lower_Bound OR [close] > Upper_Bound;

company,date,close,volume,open,high,low,IQR,Lower_Bound,Upper_Bound
META,2020-08-25,280.82,42127240,272.41,283.09,270.26,97.6675,-109.93374999999996,280.7362499999999
META,2020-08-26,303.91,69015150,284.0,304.67,284.0,97.6675,-109.93374999999996,280.7362499999999
META,2020-08-27,293.22,30301310,300.16,301.23,292.02,97.6675,-109.93374999999996,280.7362499999999
META,2020-08-28,293.66,17172420,295.0,297.23,290.98,97.6675,-109.93374999999996,280.7362499999999
META,2020-08-31,293.2,17345130,293.95,296.88,291.55,97.6675,-109.93374999999996,280.7362499999999
META,2020-09-01,295.44,17320870,294.71,301.49,292.71,97.6675,-109.93374999999996,280.7362499999999
META,2020-09-02,302.5,24341370,298.88,303.6,293.05,97.6675,-109.93374999999996,280.7362499999999
META,2020-09-03,291.12,32294090,295.99,297.6,283.63,97.6675,-109.93374999999996,280.7362499999999
META,2020-09-04,282.73,30333670,287.25,289.0,271.14,97.6675,-109.93374999999996,280.7362499999999
META,2020-10-23,284.79,17535160,278.8,285.24,276.82,97.6675,-109.93374999999996,280.7362499999999


In [14]:
-- Count the number of outliers

WITH Percentiles AS (
    SELECT
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [close]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [close]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT COUNT(*)
FROM stock_data, Bounds
WHERE [close] < Lower_Bound OR [close] > Upper_Bound;

(No column name)
47854320
