# What size and shape are the statistics.gov.scot datasets?


In [3]:
; Add code libraries. 

%classpath add mvn org.clojure data.csv 1.0.0
(require '[clojure.data.csv :as csv])

%classpath add mvn clj-http clj-http 3.10.1
(require '[clj-http.client :as http])

(require '[clojure.string :as str])

(import 'java.net.URLEncoder)
(import 'java.time.LocalDate)
(import 'com.twosigma.beakerx.chart.xychart.TimePlot
        'com.twosigma.beakerx.chart.xychart.plotitem.Line)

class com.twosigma.beakerx.chart.xychart.plotitem.Line

In [4]:
; Define convenience functions.

; Convert the CSV structure to a list-of-maps structure.
(defn to-maps
    "Turn parsed CSV data (a sequence of rows, the first row being the header)
    into a sequence of maps, one per data row, keyed by the header cells
    converted to keywords."
    [csv-data]
    (let [column-keys (map keyword (first csv-data))
          rows        (rest csv-data)]
        ; Pair every data row's cells with the header keywords.
        (map (fn [row] (zipmap column-keys row)) rows)))

; Ask statistics.gov.scot to execute the given SPARQL query.
(defn exec-query
    "Ask statistics.gov.scot to execute the given SPARQL query.

    POSTs `sparql` as the form-encoded `query` parameter to the site's SPARQL
    endpoint, asks for a CSV response, parses the response body as CSV and
    returns the rows as a sequence of maps keyed by the keywordised column
    names (see `to-maps`)."
    [sparql]
    (->> (http/post "http://statistics.gov.scot/sparql"
                    ; Encode explicitly as UTF-8: the one-argument URLEncoder/encode is
                    ; deprecated and uses the platform's default charset, so any non-ASCII
                    ; character in the query could be encoded differently per machine.
                    {:body (str "query=" (URLEncoder/encode sparql "UTF-8"))
                     :headers {"Accept" "text/csv"
                               "Content-Type" "application/x-www-form-urlencoded"}
                     :debug false})
         :body
         csv/read-csv
         to-maps))

#'beaker_clojure_shell_8e6f19ea-d4d9-4642-b450-d99bf934d4bf/exec-query

In [5]:
; Query for the size (i.e. number of observations) per dataset of interest.

; SPARQL template that counts the distinct observations belonging to one dataset.
; The literal text "dataset-name" in the dataset URI is a placeholder: the doseq below
; substitutes each real dataset name for it (via str/replace) before running the query.
(def sparql "

PREFIX qb: <http://purl.org/linked-data/cube#>

SELECT (COUNT(DISTINCT ?obs) AS ?obs_count) WHERE {
  
    BIND(<http://statistics.gov.scot/data/dataset-name> AS ?dataset)
  
    ?obs qb:dataSet ?dataset .
}
")

; For each dataset of interest: fill its name into the query template, run the
; query, and print the observation count taken from the first result row.
(doseq [dataset-name ["household-waste"
                      ; Disabled to reduce the number of times we count its ~11 million observations.
                      #_"population-estimates-current-geographic-boundaries"]]
    (let [query     (str/replace sparql "dataset-name" dataset-name)
          first-row (first (exec-query query))
          obs-count (:obs_count first-row)]
        (println obs-count dataset-name "observations")))

37752 household-waste observations


null

In [10]:
; Query for the shape (i.e. dimensions with their cardinalities, and measures) of the household-waste dataset.

; SPARQL query describing the structure of the household-waste dataset.
; For every component attached to the dataset's structure via qb:dimension or qb:measure,
; it selects the component's label, the label of the component type, the component itself,
; and a count of the members of the component specification's code list (?cardinality).
; The code list lookup is OPTIONAL so that components without one still appear in the results.
; Rows are ordered by descending cardinality.
; NOTE: this rebinds `sparql`, replacing the observation-count template defined in the earlier cell.
(def sparql "

PREFIX qb: <http://purl.org/linked-data/cube#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?label ?type_label ?component (COUNT(?compvalue) AS ?cardinality) WHERE {
  
    BIND(<http://statistics.gov.scot/data/household-waste> AS ?dataset)
  
    ?dataset qb:structure/qb:component ?compspec .

    VALUES ?comptype { qb:dimension qb:measure }
    ?compspec ?comptype ?component .
    OPTIONAL { ?compspec qb:codeList/skos:member ?compvalue }
  
    ?component rdfs:label ?label .
    ?comptype rdfs:label ?type_label .
} GROUP BY ?label ?type_label ?component ORDER BY DESC(?cardinality)
")

; Run the query; as the last form in the cell, its result (a sequence of row maps) is the cell's value.
(exec-query sparql)