# Step 1: Preparation

## For this analysis, I am using the [Stock Market: Historical Data of Top 10 Companies](https://www.kaggle.com/datasets/khushipitroda/stock-market-historical-data-of-top-10-companies) dataset from Kaggle.

In [4]:
-- Staging table for the daily stock CSV: one row per company per trading day.
CREATE TABLE stock_data (
    -- Company identifier as it appears in the CSV (e.g. AAPL)
    [company] VARCHAR(10),
    -- Trading day the row describes
    [date]    DATE,
    -- Closing (last traded) price for the day
    [close]   FLOAT,
    -- Total number of shares traded that day
    [volume]  INT,
    -- Opening price for the day
    [open]    FLOAT,
    -- Highest price reached during the session
    [high]    FLOAT,
    -- Lowest price reached during the session
    [low]     FLOAT
);

In [5]:
-- Load the CSV file from the server's data directory into stock_data.
BULK INSERT stock_data
FROM '/var/opt/mssql/data/data.csv'
WITH (
    FIRSTROW = 2,            -- Start at line 2 so the CSV header is not loaded
    FIELDTERMINATOR = ',',   -- Columns are comma-separated
    ROWTERMINATOR = '\n'     -- Each row ends with a newline
);

In [8]:
-- Sanity check: preview ten rows to confirm the load worked.
-- TOP without ORDER BY returns an arbitrary set of rows, so sort explicitly
-- (first company alphabetically, most recent trading days first) to make
-- the preview reproducible. Columns are listed explicitly so the preview
-- does not silently change shape if the table definition changes.
SELECT TOP 10
    company,
    [date],
    [close],
    volume,
    [open],
    [high],
    [low]
FROM stock_data
ORDER BY
    company ASC,
    [date] DESC; -- Check to make sure everything's in place

-- Looks good! 

company,date,close,volume,open,high,low
AAPL,2023-07-17,193.99,50520160,191.9,194.32,191.81
AAPL,2023-07-14,190.69,41616240,190.23,191.18,189.63
AAPL,2023-07-13,190.54,41342340,190.5,191.19,189.78
AAPL,2023-07-12,189.77,60750250,189.68,191.7,188.47
AAPL,2023-07-11,188.08,46638120,189.16,189.3,186.6
AAPL,2023-07-10,188.61,59922160,189.26,189.99,187.04
AAPL,2023-07-07,190.68,46815000,191.41,192.67,190.24
AAPL,2023-07-06,191.81,45156010,189.84,192.02,189.2
AAPL,2023-07-05,191.33,46920260,191.57,192.98,190.62
AAPL,2023-07-03,192.46,31346600,193.78,193.88,191.76


In [9]:
-- Inspect the table's metadata: column data types, nullability, identity, and filegroup.
EXECUTE sp_help @objname = N'stock_data'; -- Check data types of each column

-- It appears I neglected to set a PRIMARY KEY in my CREATE TABLE statement.

Name,Owner,Type,Created_datetime
stock_data,dbo,user table,2025-02-22 00:35:27.807


Column_name,Type,Computed,Length,Prec,Scale,Nullable,TrimTrailingBlanks,FixedLenNullInSource,Collation
company,varchar,no,10,,,yes,no,yes,SQL_Latin1_General_CP1_CI_AS
date,date,no,3,10.0,0.0,yes,(n/a),(n/a),
close,float,no,8,53.0,,yes,(n/a),(n/a),
volume,int,no,4,10.0,0.0,yes,(n/a),(n/a),
open,float,no,8,53.0,,yes,(n/a),(n/a),
high,float,no,8,53.0,,yes,(n/a),(n/a),
low,float,no,8,53.0,,yes,(n/a),(n/a),


Identity,Seed,Increment,Not For Replication
No identity column defined.,,,


RowGuidCol
No rowguidcol column defined.


Data_located_on_filegroup
PRIMARY


In [10]:
-- Duplicate check: list every (company, date) pair that appears more than once.
-- An empty result means each company has at most one row per trading day.
SELECT
    company,
    [date],
    COUNT(*) AS duplicate_count
FROM stock_data
GROUP BY
    company,
    [date]
HAVING COUNT(*) > 1;

-- No duplicates.  Nice! 

company,date,duplicate_count


In [19]:
-- Disallow NULLs in company and date; primary key columns must be NOT NULL.
-- NOTE: the company statement also widens the column from VARCHAR(10)
-- (as originally created) to VARCHAR(50).
ALTER TABLE stock_data
    ALTER COLUMN [company] VARCHAR(50) NOT NULL;

ALTER TABLE stock_data
    ALTER COLUMN [date] DATE NOT NULL;

In [17]:
-- Add a composite PRIMARY KEY so each (company, date) pair can appear only once.
ALTER TABLE stock_data
    ADD CONSTRAINT pk_stock_data
    PRIMARY KEY ([company] ASC, [date] ASC);