# Step 2: Cleaning

In [4]:
SELECT TOP 10 * FROM stock_data;

company,date,close,volume,open,high,low
AAPL,2013-07-18,15.42,218632537,15.48,15.53,15.38
AAPL,2013-07-19,15.18,268548901,15.47,15.5,15.16
AAPL,2013-07-22,15.23,207648981,15.34,15.35,15.2
AAPL,2013-07-23,14.96,354477618,15.21,15.25,14.95
AAPL,2013-07-24,15.73,591624923,15.68,15.88,15.55
AAPL,2013-07-25,15.66,229432412,15.74,15.76,15.56
AAPL,2013-07-26,15.75,200082264,15.55,15.75,15.51
AAPL,2013-07-29,15.99,248025441,15.74,16.07,15.72
AAPL,2013-07-30,16.19,308960556,16.07,16.33,16.04
AAPL,2013-07-31,16.16,322499391,16.25,16.33,16.05


## A. Check for NULL values

In [5]:
-- Count the number of NULL values in each column
SELECT 
    SUM(CASE WHEN company IS NULL THEN 1 ELSE 0 END) AS company_nulls,
    SUM(CASE WHEN [date] IS NULL THEN 1 ELSE 0 END) AS date_nulls,
    SUM(CASE WHEN [close] IS NULL THEN 1 ELSE 0 END) AS close_nulls,
    SUM(CASE WHEN volume IS NULL THEN 1 ELSE 0 END) AS volume_nulls,
    SUM(CASE WHEN [open] IS NULL THEN 1 ELSE 0 END) AS open_nulls,
    SUM(CASE WHEN high IS NULL THEN 1 ELSE 0 END) AS high_nulls,
    SUM(CASE WHEN low IS NULL THEN 1 ELSE 0 END) AS low_nulls
FROM stock_data;

-- No NULL values! 

company_nulls,date_nulls,close_nulls,volume_nulls,open_nulls,high_nulls,low_nulls
0,0,0,0,0,0,0


## B. Check for data type consistency

In [6]:
-- Get the data type for each column
SELECT COLUMN_NAME, DATA_TYPE
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = 'stock_data';

COLUMN_NAME,DATA_TYPE
company,varchar
date,date
close,float
volume,int
open,float
high,float
low,float


In [7]:
-- Ensure all values in date column are in DATE format
SELECT *
FROM stock_data
WHERE TRY_CAST(date AS DATE) IS NULL;

-- Looks good! 

company,date,close,volume,open,high,low


In [9]:
-- Ensure all values in close, open, high, low columsn are in numeric format
SELECT *
FROM stock_data
WHERE TRY_CAST([close] AS FLOAT) IS NULL
   OR TRY_CAST(volume AS FLOAT) IS NULL
   OR TRY_CAST([open] AS FLOAT) IS NULL
   OR TRY_CAST(high AS FLOAT) IS NULL
   OR TRY_CAST(low AS FLOAT) IS NULL;

-- Looks good! 

company,date,close,volume,open,high,low


## C. Check for outliers

### i. The close column

In [18]:
-- Return all rows with outliers

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [close]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [close]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT *
FROM stock_data
WHERE [close] < (SELECT TOP 1 Lower_Bound FROM Bounds)
   OR [close] > (SELECT TOP 1 Upper_Bound FROM Bounds);


company,date,close,volume,open,high,low
META,2020-08-25,280.82,42127240,272.41,283.09,270.26
META,2020-08-26,303.91,69015150,284.0,304.67,284.0
META,2020-08-27,293.22,30301310,300.16,301.23,292.02
META,2020-08-28,293.66,17172420,295.0,297.23,290.98
META,2020-08-31,293.2,17345130,293.95,296.88,291.55
META,2020-09-01,295.44,17320870,294.71,301.49,292.71
META,2020-09-02,302.5,24341370,298.88,303.6,293.05
META,2020-09-03,291.12,32294090,295.99,297.6,283.63
META,2020-09-04,282.73,30333670,287.25,289.0,271.14
META,2020-10-23,284.79,17535160,278.8,285.24,276.82


In [17]:
-- Count the number of outliers

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [close]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [close]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT COUNT(*)
FROM stock_data
WHERE [close] < (SELECT TOP 1 Lower_Bound FROM Bounds)
   OR [close] > (SELECT TOP 1 Upper_Bound FROM Bounds);

(No column name)
1902


In [20]:
-- List the companies with close values outside the IQR

WITH Percentiles AS (
    SELECT
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [close]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [close]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT DISTINCT s.company
FROM stock_data s
CROSS JOIN Bounds
WHERE s.[close] < Bounds.Lower_Bound 
   OR s.[close] > Bounds.Upper_Bound;

-- Interesting.  Given that I believe this data to be accurate, I don't believe the values that fell outside the IQR
-- in the close column to be true "outliers" in the sense that they are in any way incorrect or inaccurate.  Instead,
-- the companies that experiecned these "outliers" likely had extreme fluctuations in their stock prices.

company
MSFT
NFLX
TSLA
META


### ii. The volume column

In [21]:
-- Return all rows with outliers

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY volume) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY volume) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT *
FROM stock_data
WHERE volume < (SELECT TOP 1 Lower_Bound FROM Bounds)
   OR volume > (SELECT TOP 1 Upper_Bound FROM Bounds);

company,date,close,volume,open,high,low
AAPL,2016-05-31,24.97,169072160,24.9,25.1,24.71
AAPL,2016-06-02,24.43,160680800,24.4,24.46,24.16
AAPL,2016-06-17,23.83,243173040,24.16,24.16,23.83
AAPL,2016-06-24,23.35,300878840,23.23,23.66,23.16
AAPL,2016-06-27,23.01,182382720,23.25,23.26,22.88
AAPL,2016-06-28,23.4,161551840,23.23,23.42,23.04
AAPL,2016-07-14,24.7,155543920,24.35,24.75,24.33
AAPL,2016-07-25,24.34,161162320,24.56,24.71,24.23
AAPL,2016-07-26,24.17,213818480,24.21,24.49,24.11
AAPL,2016-07-27,25.74,368575680,26.07,26.09,25.69


In [23]:
-- Count the number of outliers

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY volume) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY volume) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT COUNT(*)
FROM stock_data
WHERE volume < (SELECT TOP 1 Lower_Bound FROM Bounds)
   OR volume > (SELECT TOP 1 Upper_Bound FROM Bounds);

(No column name)
1549


In [25]:
-- List the companies with close values outside the IQR

WITH Percentiles AS (
    SELECT
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY volume) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY volume) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT DISTINCT s.company
FROM stock_data s
CROSS JOIN Bounds
WHERE s.volume < Bounds.Lower_Bound 
   OR s.volume > Bounds.Upper_Bound;

company
MSFT
AAPL
NFLX
QCOM
TSLA
AMD
AMZN
CSCO
META


### iii. The open column

In [26]:
-- Return all rows with outliers

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [open]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [open]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT *
FROM stock_data
WHERE [open] < (SELECT TOP 1 Lower_Bound FROM Bounds)
   OR [open] > (SELECT TOP 1 Upper_Bound FROM Bounds);

company,date,close,volume,open,high,low
META,2020-08-26,303.91,69015150,284.0,304.67,284.0
META,2020-08-27,293.22,30301310,300.16,301.23,292.02
META,2020-08-28,293.66,17172420,295.0,297.23,290.98
META,2020-08-31,293.2,17345130,293.95,296.88,291.55
META,2020-09-01,295.44,17320870,294.71,301.49,292.71
META,2020-09-02,302.5,24341370,298.88,303.6,293.05
META,2021-04-01,298.66,17615980,298.4,302.4,296.6
META,2021-04-05,308.91,28237010,300.89,310.77,300.68
META,2021-04-06,306.26,17335250,308.84,311.35,305.25
META,2021-04-07,313.09,22855240,306.34,314.25,305.5


In [27]:
-- Count the number of outliers 

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [open]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [open]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT COUNT(*)
FROM stock_data
WHERE [open] < (SELECT TOP 1 Lower_Bound FROM Bounds)
   OR [open] > (SELECT TOP 1 Upper_Bound FROM Bounds);

(No column name)
1892


In [28]:
-- List the companies with volume values outside the IQR

WITH Percentiles AS (
    SELECT
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [open]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [open]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT DISTINCT s.company
FROM stock_data s
CROSS JOIN Bounds
WHERE s.[open] < Bounds.Lower_Bound 
   OR s.[open] > Bounds.Upper_Bound;

company
MSFT
NFLX
TSLA
META


### iv. The high column

In [29]:
-- Return all rows with outliers

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [high]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [high]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT *
FROM stock_data
WHERE [high] < (SELECT TOP 1 Lower_Bound FROM Bounds)
   OR [high] > (SELECT TOP 1 Upper_Bound FROM Bounds);

company,date,close,volume,open,high,low
META,2020-08-26,303.91,69015150,284.0,304.67,284.0
META,2020-08-27,293.22,30301310,300.16,301.23,292.02
META,2020-08-28,293.66,17172420,295.0,297.23,290.98
META,2020-08-31,293.2,17345130,293.95,296.88,291.55
META,2020-09-01,295.44,17320870,294.71,301.49,292.71
META,2020-09-02,302.5,24341370,298.88,303.6,293.05
META,2021-04-01,298.66,17615980,298.4,302.4,296.6
META,2021-04-05,308.91,28237010,300.89,310.77,300.68
META,2021-04-06,306.26,17335250,308.84,311.35,305.25
META,2021-04-07,313.09,22855240,306.34,314.25,305.5


In [30]:
-- Count the number of outliers 

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [high]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [high]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT COUNT(*)
FROM stock_data
WHERE [high] < (SELECT TOP 1 Lower_Bound FROM Bounds)
   OR [open] > (SELECT TOP 1 Upper_Bound FROM Bounds);

(No column name)
1895


In [31]:
-- List the companies with high values outside the IQR

WITH Percentiles AS (
    SELECT
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [high]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [high]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT DISTINCT s.company
FROM stock_data s
CROSS JOIN Bounds
WHERE s.[high] < Bounds.Lower_Bound 
   OR s.[high] > Bounds.Upper_Bound;

company
MSFT
NFLX
TSLA
META


### v. The low column

In [32]:
-- Return all rows with outliers

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [low]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [low]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT *
FROM stock_data
WHERE [low] < (SELECT TOP 1 Lower_Bound FROM Bounds)
   OR [low] > (SELECT TOP 1 Upper_Bound FROM Bounds);

company,date,close,volume,open,high,low
META,2020-08-26,303.91,69015150,284.0,304.67,284.0
META,2020-08-27,293.22,30301310,300.16,301.23,292.02
META,2020-08-28,293.66,17172420,295.0,297.23,290.98
META,2020-08-31,293.2,17345130,293.95,296.88,291.55
META,2020-09-01,295.44,17320870,294.71,301.49,292.71
META,2020-09-02,302.5,24341370,298.88,303.6,293.05
META,2021-04-01,298.66,17615980,298.4,302.4,296.6
META,2021-04-05,308.91,28237010,300.89,310.77,300.68
META,2021-04-06,306.26,17335250,308.84,311.35,305.25
META,2021-04-07,313.09,22855240,306.34,314.25,305.5


In [33]:
-- Count the number of outliers 

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [low]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [low]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT COUNT(*)
FROM stock_data
WHERE [low] < (SELECT TOP 1 Lower_Bound FROM Bounds)
   OR [low] > (SELECT TOP 1 Upper_Bound FROM Bounds);

(No column name)
1873


In [34]:
-- List the companies with low values outside the IQR

WITH Percentiles AS (
    SELECT
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [low]) OVER () AS Q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [low]) OVER () AS Q3
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 - Q1) AS IQR,
        (Q1 - 1.5 * (Q3 - Q1)) AS Lower_Bound,
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT DISTINCT s.company
FROM stock_data s
CROSS JOIN Bounds
WHERE s.[low] < Bounds.Lower_Bound 
   OR s.[low] > Bounds.Upper_Bound;

company
MSFT
NFLX
TSLA
META


So, it appears that Microsoft (MSFT), Netflix (NFLX), Tesla (TSLA), and Meta (META) had the most outliers in open, close, high, and low.   I just realized something, though: What is my outlier calculation actually telling me about high and low... it's not telling me just if companies had a higher high or a lower low than average... it's telling me if they had a higher OR lower high than expected or a lower OR higher low than expected, right?

### vi. Revisting the close column

In [5]:
-- Return all rows with outliers above the upper bound

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [close]) OVER () AS Q3,
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [close]) OVER () AS Q1
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT *
FROM stock_data
WHERE [close] > (SELECT Upper_Bound FROM Bounds);

company,date,close,volume,open,high,low
META,2020-08-25,280.82,42127240,272.41,283.09,270.26
META,2020-08-26,303.91,69015150,284.0,304.67,284.0
META,2020-08-27,293.22,30301310,300.16,301.23,292.02
META,2020-08-28,293.66,17172420,295.0,297.23,290.98
META,2020-08-31,293.2,17345130,293.95,296.88,291.55
META,2020-09-01,295.44,17320870,294.71,301.49,292.71
META,2020-09-02,302.5,24341370,298.88,303.6,293.05
META,2020-09-03,291.12,32294090,295.99,297.6,283.63
META,2020-09-04,282.73,30333670,287.25,289.0,271.14
META,2020-10-23,284.79,17535160,278.8,285.24,276.82


In [6]:
-- Count the number of outliers above the upper bound

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [close]) OVER () AS Q3,
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [close]) OVER () AS Q1
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT COUNT(*)
FROM stock_data
WHERE [close] > (SELECT Upper_Bound FROM Bounds);

(No column name)
1902


In [8]:
-- List the companies with close values outside the upper bound

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [close]) OVER () AS Q3,
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [close]) OVER () AS Q1
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT DISTINCT s.company
FROM stock_data s
CROSS JOIN Bounds
WHERE s.[close] > Bounds.Upper_Bound;

 -- Interestingly, there was no change when calculating only outliers above the upper bound vs. outliers both 
 -- above the upper bound and below the lower bound of the close column.  I imagine this might be true of the 
 -- volume, open, high, and low columns, as well, but perhaps I should check.

company
META
MSFT
NFLX
TSLA


### vii. Revisiting the volume column

In [9]:
-- Return all rows with outliers above the upper bound

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [volume]) OVER () AS Q3,
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [volume]) OVER () AS Q1
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT *
FROM stock_data
WHERE [volume] > (SELECT Upper_Bound FROM Bounds);

company,date,close,volume,open,high,low
AAPL,2013-07-18,15.42,218632537,15.48,15.53,15.38
AAPL,2013-07-19,15.18,268548901,15.47,15.5,15.16
AAPL,2013-07-22,15.23,207648981,15.34,15.35,15.2
AAPL,2013-07-23,14.96,354477618,15.21,15.25,14.95
AAPL,2013-07-24,15.73,591624923,15.68,15.88,15.55
AAPL,2013-07-25,15.66,229432412,15.74,15.76,15.56
AAPL,2013-07-26,15.75,200082264,15.55,15.75,15.51
AAPL,2013-07-29,15.99,248025441,15.74,16.07,15.72
AAPL,2013-07-30,16.19,308960556,16.07,16.33,16.04
AAPL,2013-07-31,16.16,322499391,16.25,16.33,16.05


In [10]:
-- Count the number of outliers above the upper bound

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [volume]) OVER () AS Q3,
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [volume]) OVER () AS Q1
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT COUNT(*)
FROM stock_data
WHERE [volume] > (SELECT Upper_Bound FROM Bounds);

(No column name)
1549


In [11]:
-- List the companies with volume values outside the upper bound

WITH Percentiles AS (
    SELECT TOP 1
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY [volume]) OVER () AS Q3,
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY [volume]) OVER () AS Q1
    FROM stock_data
),
Bounds AS (
    SELECT 
        (Q3 + 1.5 * (Q3 - Q1)) AS Upper_Bound
    FROM Percentiles
)
SELECT DISTINCT s.company
FROM stock_data s
CROSS JOIN Bounds
WHERE s.[volume] > Bounds.Upper_Bound;

-- Same results for volume above 3rd quartile as outside IQR (above 3rd quarttile + below 1st quartile).
-- I gues all the volume value "outliers" were high outliers, just like close value outliers. 

company
AAPL
AMD
AMZN
CSCO
META
MSFT
NFLX
QCOM
TSLA
