# Assignment 6

First we have to create some SQL tables.

In [2]:
CREATE DATABASE MouseHypothalamus;

: Msg 1801, Level 16, State 3, Line 1
Database 'MouseHypothalamus' already exists. Choose a different database name.

In [1]:
USE MouseHypothalamus;
DROP TABLE IF EXISTS #TempMolecules;
CREATE TABLE #TempMolecules(
    Gene_name NVARCHAR(10) NOT NULL,
    Cell_name NVARCHAR(40),
    Animal_ID int,
    Bregma float,
    Animal_sex nvarchar(6) NOT NULL,
    Behavior nvarchar(20),
    Centroid_X float NOT NULL,
    Centroid_Y float NOT NULL,
    Centroid_Z float NOT NULL,
    Total_brightness float,
    Area int,
    Error_bit tinyint,
    Error_direction tinyint
);

BULK INSERT #TempMolecules FROM '/var/data/merfish_barcodes.csv'
WITH ( 
    FIRSTROW = 2, -- skip the column headers
    --ROWS_PER_BATCH = 467052741, -- however many total rows the data has
    FIELDTERMINATOR = ',', 
    ROWTERMINATOR = '0x0a',
    KEEPNULLS
);

SELECT IDENTITY(int,1,1) as id,* INTO Molecules FROM #TempMolecules;

In [3]:
Use MouseHypothalamus;
SELECT IDENTITY(int,1,1) as id,* INTO Molecules FROM #TempMolecules;

: Msg 208, Level 16, State 0, Line 2
Invalid object name '#TempMolecules'.

# Geometries

Let's start by creating the geometry objects for the molecules.

In [9]:
Use MouseHypothalamus;

DROP TABLE IF EXISTS MoleculesWithPoints;
SELECT 
        id, 
        Gene_name, 
        Cell_name, 
        Animal_ID, 
        Bregma, 
        Centroid_Z,
        geometry::STGeomFromText('POINT(' + CONVERT(VARCHAR(20), Centroid_X) + ' ' + CONVERT(VARCHAR(20), Centroid_Y) + ')', 0) AS point
    INTO MoleculesWithPoints 
    FROM Molecules;


We also need to add a primary key constraint.

In [None]:
ALTER TABLE MoleculesWithPoints
ADD CONSTRAINT moleculesWithPoints_id_PK PRIMARY KEY(id);

Now let's add a spatial index. To do so, we will first find the minimum and maximum coordinate in order to set a proper bounding box for our spatial index.

In [20]:
Use MouseHypothalamus;
SELECT Max(Centroid_X) as maxx, Max(Centroid_Y) as maxy,  Min(Centroid_X) as minx, Min(Centroid_Y) as miny FROM Molecules;

maxx,maxy,minx,miny
4203.399,4968.7,-4289.699,-5009.986


In [3]:
Use MouseHypothalamus;
CREATE SPATIAL INDEX MoleculesWithPointsInd ON
   [MouseHypothalamus].[dbo].[MoleculesWithPoints](point)
   WITH (GRIDS = (HIGH, HIGH, HIGH, HIGH), 
        BOUNDING_BOX = (XMIN = -4300,YMIN = -5100, XMAX = 4300, YMAX = 5000)); -- approx based on the actual min/max values in the data

Now we'll create POLYGON spatial types using the CellBoundariesWithGeometryStrings data.

In [6]:
Use MouseHypothalamus;

DROP TABLE IF EXISTS CellBoundariesWithPolygons;
SELECT 
        id, 
        layer,
        feature_uid,
        feature_id,
        geometry::STGeomFromText(REPLACE(geometry_string, '"', ''),0) AS polygon
    INTO CellBoundariesWithPolygons
    FROM CellBoundariesWithGeometryStrings; 

Like before, this table also needs a primary key constraint and a spatial index.

In [None]:
ALTER TABLE CellBoundariesWithPolygons
ADD CONSTRAINT cellBoundariesWithPolygons_id_PK PRIMARY KEY(id);

In [4]:
Use MouseHypothalamus;
CREATE SPATIAL INDEX CellBoundariesWithPolygonsHeadInd ON
   [MouseHypothalamus].[dbo].[CellBoundariesWithPolygonsHead](polygon)
   WITH (GRIDS = (HIGH, HIGH, HIGH, HIGH), 
        BOUNDING_BOX = (XMIN = -6000,YMIN = -6000, XMAX = 6000, YMAX = 6000)); -- rounding up to +- 6000

# Molecule count table

Now, we can count how many molecules there are for each molecule type. We'll start by fixing our cell polygons using ST.MakeValid():

In [5]:
USE MouseHypothalamus;
UPDATE CellBoundariesWithPolygonsHead
    SET polygon = polygon.MakeValid();

Now everything is ready to go for spatial queries on each layer!

In [None]:
DROP TABLE IF EXISTS querytime;
CREATE TABLE querytime (time DATETIME);

In [3]:
-- Set variables to track the time 
DECLARE @startTime DATETIME
declare @endTime DATETIME
declare @diff DATETIME


Set @starttime = getdate()

-- Find molecules in each cell for one layer
-- Similar to ileum, except different column names and added animal_id, bregma checks
USE MouseHypothalamus;
DROP TABLE IF EXISTS MoleculeCountsLayer3;
SELECT mol.Gene_name, COUNT(mol.id) as molecule_count, poly.id, 3 as z_layer INTO MoleculeCountsLayer3 FROM (
    SELECT * FROM [MouseHypothalamus].[dbo].[MoleculesWithPoints] 
        WHERE Centroid_Z=3 and Animal_ID=1 and Bregma=0.21) as mol -- looking for layer 0, animal 4, bregma -0.14 -- also layer 3, animal 1, bregma 0.21
    INNER JOIN (   
        SELECT * FROM [MouseHypothalamus].[dbo].[CellBoundariesWithPolygons] 
        WHERE layer=2 ) as poly   -- layer = 0 for Layer1, layer = 2 for Layer3
    ON poly.polygon.STIntersects(mol.point) = 1 -- =1 is needed bc output is 0 or 1 instead of T/F
    GROUP BY mol.Gene_name , poly.id;

-- Set the ending time
Set @endTime = GETDATE()
Set @diff = @endTime - @startTime

INSERT INTO querytime VALUES (@diff)
GO 100

SELECT *
FROM querytime


: Msg 207, Level 16, State 1, Line 11
Invalid column name 'z_layer'.