In [25]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *
from pyspark.ml.feature import SQLTransformer

In [2]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'sqlOptions'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [27]:
# load file
fil = '../data/rec-crime-pfa.csv'
schem = StructType([StructField('12 months ending', DateType()),
                     StructField('PFA', StringType()),
                     StructField('Region', StringType()),
                     StructField('Offence', StringType()),
                     StructField('Rolling year total number of offences', IntegerType())])

crime = spark.read.format('csv').options(header=True, dateFormat='dd/MM/yyyy').schema(schem).load(fil)\
    .withColumnRenamed('12 months ending', 'YearEnding').withColumnRenamed('Rolling year total number of offences', 'OffenceCount')\
    .cache()

display(crime.limit(5).toPandas())

Unnamed: 0,YearEnding,PFA,Region,Offence,OffenceCount
0,2003-03-31,Avon and Somerset,South West,All other theft offences,25959
1,2003-03-31,Avon and Somerset,South West,Bicycle theft,3090
2,2003-03-31,Avon and Somerset,South West,Criminal damage and arson,26202
3,2003-03-31,Avon and Somerset,South West,Death or serious injury caused by illegal driving,2
4,2003-03-31,Avon and Somerset,South West,Domestic burglary,14561


In [17]:
# some counts by year
cnts = crime.select(year('YearEnding').alias('year'), 'OffenceCount')\
    .groupby('year').agg(count('year').alias('offence_type_count'), sum('OffenceCount').alias('offence_count')).orderBy('year')
cnts.show()

+----+------------------+-------------+
|year|offence_type_count|offence_count|
+----+------------------+-------------+
|2003|               880|      5974960|
|2004|               880|      6013759|
|2005|               880|      5637511|
|2006|               880|      5555172|
|2007|              3520|     21025549|
|2008|              3520|     19449338|
|2009|              3520|     18359035|
|2010|              3520|     16989262|
|2011|              3529|     16932850|
|2012|              3532|     16985977|
|2013|              3532|     16128181|
|2014|              3444|     16310143|
|2015|              3708|     17169617|
|2016|              3708|     18577143|
|2017|              3708|     20927910|
|2018|              3708|     22684521|
+----+------------------+-------------+



In [18]:
# make a temp view from this data
crime.createOrReplaceTempView('tempCrime')

In [21]:
# attempt to replicate above query
res = spark.sql("select year(YearEnding) as YR, count(*) as CNT, sum(OffenceCount) as TOT from tempCrime group by year(YearEnding) order by year(YearEnding);")
res.show()

+----+----+--------+
|  YR| CNT|     TOT|
+----+----+--------+
|2003| 880| 5974960|
|2004| 880| 6013759|
|2005| 880| 5637511|
|2006| 880| 5555172|
|2007|3520|21025549|
|2008|3520|19449338|
|2009|3520|18359035|
|2010|3520|16989262|
|2011|3529|16932850|
|2012|3532|16985977|
|2013|3532|16128181|
|2014|3444|16310143|
|2015|3708|17169617|
|2016|3708|18577143|
|2017|3708|20927910|
|2018|3708|22684521|
+----+----+--------+



In [24]:
spark.sql('select distinct Region from tempCrime order by Region;').show(truncate=False)

+------------------------+
|Region                  |
+------------------------+
|British Transport Police|
|East                    |
|East Midlands           |
|Fraud: Action Fraud     |
|Fraud: CIFAS            |
|Fraud: UK Finance       |
|London                  |
|North East              |
|North West              |
|South East              |
|South West              |
|Wales                   |
|West Midlands           |
|Yorkshire and The Humber|
+------------------------+



In [28]:
# try using sqltransformer
sqlTrans = SQLTransformer(statement='select distinct Region from __THIS__ order by Region;')
sqlTrans.transform(crime).show()

+--------------------+
|              Region|
+--------------------+
|British Transport...|
|                East|
|       East Midlands|
| Fraud: Action Fraud|
|        Fraud: CIFAS|
|   Fraud: UK Finance|
|              London|
|          North East|
|          North West|
|          South East|
|          South West|
|               Wales|
|       West Midlands|
|Yorkshire and The...|
+--------------------+



In [29]:
# use spark sql expr - this seems almost useless
crime.withColumn('DailyRate', expr('OffenceCount/365')).select('YearEnding', 'Region', 'OffenceCount', 'DailyRate').show(20, truncate=False)
#crime.selectExpr # could also use this

+----------+----------+------------+--------------------+
|YearEnding|Region    |OffenceCount|DailyRate           |
+----------+----------+------------+--------------------+
|2003-03-31|South West|25959       |71.12054794520547   |
|2003-03-31|South West|3090        |8.465753424657533   |
|2003-03-31|South West|26202       |71.78630136986301   |
|2003-03-31|South West|2           |0.005479452054794521|
|2003-03-31|South West|14561       |39.893150684931506  |
|2003-03-31|South West|2308        |6.323287671232877   |
|2003-03-31|South West|5339        |14.627397260273973  |
|2003-03-31|South West|19          |0.052054794520547946|
|2003-03-31|South West|1597        |4.375342465753425   |
|2003-03-31|South West|15621       |42.797260273972604  |
|2003-03-31|South West|735         |2.0136986301369864  |
|2003-03-31|South West|4025        |11.027397260273972  |
|2003-03-31|South West|3504        |9.6                 |
|2003-03-31|South West|1737        |4.758904109589041   |
|2003-03-31|So

In [30]:
# same as above but without the implied string parsing
crime.select('YearEnding', 'Region', 'OffenceCount').withColumn('DailyRate', col('OffenceCount')/365).show(truncate=False)

+----------+----------+------------+--------------------+
|YearEnding|Region    |OffenceCount|DailyRate           |
+----------+----------+------------+--------------------+
|2003-03-31|South West|25959       |71.12054794520547   |
|2003-03-31|South West|3090        |8.465753424657533   |
|2003-03-31|South West|26202       |71.78630136986301   |
|2003-03-31|South West|2           |0.005479452054794521|
|2003-03-31|South West|14561       |39.893150684931506  |
|2003-03-31|South West|2308        |6.323287671232877   |
|2003-03-31|South West|5339        |14.627397260273973  |
|2003-03-31|South West|19          |0.052054794520547946|
|2003-03-31|South West|1597        |4.375342465753425   |
|2003-03-31|South West|15621       |42.797260273972604  |
|2003-03-31|South West|735         |2.0136986301369864  |
|2003-03-31|South West|4025        |11.027397260273972  |
|2003-03-31|South West|3504        |9.6                 |
|2003-03-31|South West|1737        |4.758904109589041   |
|2003-03-31|So

In [None]:
sc.stop()