In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName("Lodon Crime dataset samples")\
                    .getOrCreate()

In [5]:
data = spark.read\
            .format("csv")\
            .option("header", "true")\
            .load("datasets/london_crime_by_lsoa-100k.csv")

In [6]:
data.printSchema()

root
 |-- lsoa_code: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- major_category: string (nullable = true)
 |-- minor_category: string (nullable = true)
 |-- value: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)



In [51]:
# Clean the data frame to improve performance later
data = data.dropna()
data = data.drop("lsoa_code")

In [52]:
data.count()

99999

In [8]:
data.limit(5).show()

+---------+----------+--------------------+--------------------+-----+----+-----+
|lsoa_code|   borough|      major_category|      minor_category|value|year|month|
+---------+----------+--------------------+--------------------+-----+----+-----+
|E01001116|   Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
|E01001646| Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|E01000677|   Bromley|Violence Against ...|      Other violence|    0|2015|    5|
|E01003774| Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|E01004563|Wandsworth|             Robbery|   Personal Property|    0|2008|    6|
+---------+----------+--------------------+--------------------+-----+----+-----+



In [9]:
data.select("borough").distinct().count()

33

In [10]:
data.filter(data["borough"] == "Hackney").show()

+---------+-------+--------------------+--------------------+-----+----+-----+
|lsoa_code|borough|      major_category|      minor_category|value|year|month|
+---------+-------+--------------------+--------------------+-----+----+-----+
|E01001786|Hackney|     Criminal Damage|Criminal Damage T...|    0|2011|    6|
|E01001794|Hackney|Violence Against ...|          Harassment|    1|2013|    2|
|E01001787|Hackney|     Criminal Damage|Other Criminal Da...|    0|2011|    7|
|E01001738|Hackney|Violence Against ...|        Wounding/GBH|    0|2013|   12|
|E01001807|Hackney|  Theft and Handling|  Other Theft Person|    0|2016|    8|
|E01001733|Hackney|            Burglary|Burglary in a Dwe...|    2|2008|    5|
|E01001806|Hackney|             Robbery|   Business Property|    0|2016|    7|
|E01001734|Hackney|  Theft and Handling|Theft/Taking of P...|    0|2009|   12|
|E01001750|Hackney|               Drugs|    Drug Trafficking|    0|2014|    4|
|E01001828|Hackney|  Theft and Handling|Handling Sto

In [12]:
data.filter(data["year"].isin(["2015", "2016"])).show(5)

+---------+---------+--------------------+--------------------+-----+----+-----+
|lsoa_code|  borough|      major_category|      minor_category|value|year|month|
+---------+---------+--------------------+--------------------+-----+----+-----+
|E01001116|  Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
|E01001646|Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|E01000677|  Bromley|Violence Against ...|      Other violence|    0|2015|    5|
|E01003774|Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|E01004177|   Sutton|  Theft and Handling|Theft/Taking of P...|    1|2016|    8|
+---------+---------+--------------------+--------------------+-----+----+-----+
only showing top 5 rows



In [13]:
data.filter(data["year"].isin(["2015", "2016"])).sample(fraction=0.001).count()

15

In [14]:
data.filter(data["year"] >= 2014).show(5)

+---------+---------+--------------------+--------------------+-----+----+-----+
|lsoa_code|  borough|      major_category|      minor_category|value|year|month|
+---------+---------+--------------------+--------------------+-----+----+-----+
|E01001116|  Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
|E01001646|Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|E01000677|  Bromley|Violence Against ...|      Other violence|    0|2015|    5|
|E01003774|Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|E01004177|   Sutton|  Theft and Handling|Theft/Taking of P...|    1|2016|    8|
+---------+---------+--------------------+--------------------+-----+----+-----+
only showing top 5 rows



In [15]:
data.filter(data["year"] >= 2018).show(5)

+---------+-------+--------------+--------------+-----+----+-----+
|lsoa_code|borough|major_category|minor_category|value|year|month|
+---------+-------+--------------+--------------+-----+----+-----+
+---------+-------+--------------+--------------+-----+----+-----+



In [16]:
data.groupBy("borough").count().count()

33

In [18]:
data.groupBy("borough").count().show(5)

+--------------------+-----+
|             borough|count|
+--------------------+-----+
|             Croydon| 4344|
|          Wandsworth| 3789|
|              Bexley| 2832|
|             Lambeth| 3841|
|Barking and Dagenham| 2396|
+--------------------+-----+
only showing top 5 rows



In [22]:
data.groupBy("borough").count().show(5)

+--------------------+-----+
|             borough|count|
+--------------------+-----+
|             Croydon| 4344|
|          Wandsworth| 3789|
|              Bexley| 2832|
|             Lambeth| 3841|
|Barking and Dagenham| 2396|
+--------------------+-----+
only showing top 5 rows



In [24]:
data.groupBy("borough").agg({"value":"sum"}).show(5)

+--------------------+----------+
|             borough|sum(value)|
+--------------------+----------+
|             Croydon|    2062.0|
|          Wandsworth|    1711.0|
|              Bexley|     880.0|
|             Lambeth|    2164.0|
|Barking and Dagenham|    1109.0|
+--------------------+----------+
only showing top 5 rows



In [26]:
data.groupBy("borough").agg({"value":"sum"}).agg({"sum(value)":"sum"}).show()

+---------------+
|sum(sum(value))|
+---------------+
|        47721.0|
+---------------+



In [27]:
data.groupBy("borough").agg({"value":"sum"}).withColumnRenamed("sum(value)", "convictions").show(5)

+--------------------+-----------+
|             borough|convictions|
+--------------------+-----------+
|             Croydon|     2062.0|
|          Wandsworth|     1711.0|
|              Bexley|      880.0|
|             Lambeth|     2164.0|
|Barking and Dagenham|     1109.0|
+--------------------+-----------+
only showing top 5 rows



In [30]:
data.groupBy("borough").agg({"value":"sum"}).withColumnRenamed("sum(value)", "convictions").show(5)

+--------------------+-----------+
|             borough|convictions|
+--------------------+-----------+
|             Croydon|     2062.0|
|          Wandsworth|     1711.0|
|              Bexley|      880.0|
|             Lambeth|     2164.0|
|Barking and Dagenham|     1109.0|
+--------------------+-----------+
only showing top 5 rows



In [48]:
data.groupBy("borough").agg({"value":"sum"}).withColumnRenamed("sum(value)", "convictions").agg({"convictions":"sum"})

DataFrame[sum(convictions): double]

In [32]:
data.groupBy("borough").agg({"value":"sum"}).withColumnRenamed("sum(value)", "convictions")

DataFrame[borough: string, convictions: double]

In [33]:
borough_convictions = data.groupBy("borough").agg({"value":"sum"}).withColumnRenamed("sum(value)", "convictions")

In [47]:
borough_convictions.agg({"convictions":"sum"}).show()

+----------------+
|sum(convictions)|
+----------------+
|         47721.0|
+----------------+



In [49]:
borough_convictions.agg({"convictions":"sum"}).collect()[0][0]

47721.0

In [35]:
borough_convictions.show(5)

+--------------------+-----------+
|             borough|convictions|
+--------------------+-----------+
|             Croydon|     2062.0|
|          Wandsworth|     1711.0|
|              Bexley|      880.0|
|             Lambeth|     2164.0|
|Barking and Dagenham|     1109.0|
+--------------------+-----------+
only showing top 5 rows



In [36]:
borough_convictions.columns

['borough', 'convictions']

In [37]:
borough_convictions.collect()[0][0]

'Croydon'

In [38]:
borough_convictions.collect()[4][1]

1109.0

In [45]:
data.agg({"value":"sum"})

DataFrame[sum(value): double]