In [1]:
# The first part of the notebook creates tables.
# The second part used pysparkSQL to solve the crime.

# Just like the 'pyspark_solution' notebook it uses pyspark as the data retrieving framework 
# but uses complete Hive statements to deal with the data.

# The statements, taken separately, could be used to retrieve the data from permanent tables using Hive.

# To be able to read and write Hive statements which borrow significant similarity with SQL statements 
# is a useful skill in the area of data science.

In [2]:
# create SparkSession to use sparksql

import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# import data types to cast data and define schema
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, \
                                DecimalType, FloatType

spark = SparkSession \
    .builder \
    .master("local") \
    .appName("Python Spark SQL basic example") \
    .config("spark.sql.warehouse.dir", "hdfs://namenode/sql/metadata/hive")\
    .enableHiveSupport()\
    .getOrCreate()

hive_folder = "/Users/vk/Documents/Python/holmes_moriarty_sql/src/hive_data"

In [3]:
%rm -rf metastore_db/  # removes any tables that might be presetn from previous runs

In [4]:
# select database to use and create tables in
# 'default' is a pre-created database where we can create tables
spark.sql("use default").show(10, False)  
# (we could have skipped this statement but it makes it more explicit which database we use)

++
||
++
++



In [5]:
spark.sql("show databases").show(10, False)

+---------+
|namespace|
+---------+
|default  |
+---------+



In [6]:
# initially there are no tables 
spark.sql("show tables").show(10, False)

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [7]:
%ls data

crime_type_profit_France.txt          criminals_Germany.csv
crime_type_profit_Germany.txt         criminals_Netherlands.csv
crime_type_profit_Netherlands.txt     criminals_United Kingdom.csv
crime_type_profit_United Kingdom.txt  id_dates.csv
criminals_France.csv


In [8]:
# define a template that till be used to register the 'criminals' table for each country
criminals_country_template="""CREATE EXTERNAL TABLE IF NOT EXISTS criminals_{} (
 id int,
 name string,
 alias string,
 latitude float,
 longitude float,
 country string
    )
STORED AS ORC
LOCATION '{}'
"""

In [9]:
def apply_schema(df, schema):
    """Define data types for data frame columns"""
    
    df = spark.createDataFrame(df.rdd, schema=schema)
    
    return df

def rename_cols(df, new_col_names):
    """"""
    for col, new_col in zip(df.columns, new_col_names):
        df = df.withColumnRenamed(col, new_col)
        
    return df

In [10]:
# get the data

#explore the dataframes: column names, shapes and combine into a single dataframe
country_list = ["United Kingdom", "Germany", "Netherlands", "France"]


schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("alias", StringType(), True),
    StructField("latitude", FloatType(), True),
    StructField("longitude", FloatType(), True),
    StructField("country", StringType(), True)
])

for country_ in country_list:
    file_name = "./data/criminals_{}.csv".format(country_)
    df = spark.read.csv(file_name, header=True, inferSchema=True)
    print("Country: {}, rows: {}".format(country_, df.count()))
    new_col_names = ["id", "name", "alias", "latitude", "longitude"]
    df = rename_cols(df, new_col_names)
    df = df.withColumn('country', F.lit(country_))
    country_ = "_".join(country_.split()).lower()  # 'United Kingdom' as space in it and thus is an illigal table name
    df = apply_schema(df, schema)
    location = os.path.join(hive_folder, 'criminals_{}'.format(country_))
    df.persist()
    df.write.orc(location, mode="overwrite")
    df.unpersist()
    
    # register external table
    spark.sql(criminals_country_template.format(country_, location))
    
spark.sql("show tables").show(20, False) # isTem

Country: United Kingdom, rows: 306
Country: Germany, rows: 264
Country: Netherlands, rows: 250
Country: France, rows: 349
+--------+------------------------+-----------+
|database|tableName               |isTemporary|
+--------+------------------------+-----------+
|default |criminals_france        |false      |
|default |criminals_germany       |false      |
|default |criminals_netherlands   |false      |
|default |criminals_united_kingdom|false      |
+--------+------------------------+-----------+



In [11]:
# check the table
spark.sql("select * from criminals_united_kingdom").show(3, False)

+---+------------------------+-----+--------+---------+--------------+
|id |name                    |alias|latitude|longitude|country       |
+---+------------------------+-----+--------+---------+--------------+
|0  |Ms. Diane Barnett       |null |51.3327 |-0.0328  |United Kingdom|
|1  |Elizabeth McDonald      |null |51.3732 |-0.0396  |United Kingdom|
|2  |Jacqueline Martin-Winter|null |51.3536 |-0.223   |United Kingdom|
+---+------------------------+-----+--------+---------+--------------+
only showing top 3 rows



# Create crime and profit tables for each country

In [12]:
# just as 

create_table_template="""CREATE EXTERNAL TABLE IF NOT EXISTS crime_type_profit_{} (
 name string,
 crime_type string,
 profit string,
 country string
    )
STORED AS ORC
LOCATION '{}'
"""

# get the data

#explore the dataframes: column names, shapes and combine into a single dataframe
country_list = ["United Kingdom", "Germany", "Netherlands", "France"]


schema = StructType([
    StructField("name", StringType(), True),
    StructField("crime_type", StringType(), True),
    StructField("profit", StringType(), True),
    StructField("country", StringType(), True)
])

for country_ in country_list:
    file_name = "./data/crime_type_profit_{}.txt".format(country_)
    df = spark.read.csv(file_name, header=True, inferSchema=True, sep=" ")
    print("Country: {}, rows: {}".format(country_, df.count()))

    df = df.withColumn('country', F.lit(country_))
    country_ = "_".join(country_.split()).lower()  # 'United Kingdom' has space in it and thus is an illigal table name
    print("country_: ", country_)
    location = os.path.join(hive_folder, 'crime_type_profit_{}'.format(country_))
    print("location: ", location)
    df = apply_schema(df, schema)
    df.persist()
    df.write.orc(location, mode="overwrite")
    df.unpersist()
    
    # register table
    spark.sql(create_table_template.format(country_, location))
    
spark.sql("show tables").show(20, False)


Country: United Kingdom, rows: 306
country_:  united_kingdom
location:  /Users/vk/Documents/Python/holmes_moriarty_sql/src/hive_data/crime_type_profit_united_kingdom
Country: Germany, rows: 264
country_:  germany
location:  /Users/vk/Documents/Python/holmes_moriarty_sql/src/hive_data/crime_type_profit_germany
Country: Netherlands, rows: 250
country_:  netherlands
location:  /Users/vk/Documents/Python/holmes_moriarty_sql/src/hive_data/crime_type_profit_netherlands
Country: France, rows: 349
country_:  france
location:  /Users/vk/Documents/Python/holmes_moriarty_sql/src/hive_data/crime_type_profit_france
+--------+--------------------------------+-----------+
|database|tableName                       |isTemporary|
+--------+--------------------------------+-----------+
|default |crime_type_profit_france        |false      |
|default |crime_type_profit_germany       |false      |
|default |crime_type_profit_netherlands   |false      |
|default |crime_type_profit_united_kingdom|false      

# Create the table with crime dates

In [13]:

id_dates = spark.read.csv("./data/id_dates.csv", header=True, inferSchema=True)
print("id_dates shape: {}".format(id_dates.count()))
id_dates.registerTempTable("id_dates")
spark.sql("select count(distinct country) from id_dates").show(1, False)

id_dates shape: 1169
+-----------------------+
|count(DISTINCT country)|
+-----------------------+
|4                      |
+-----------------------+



In [14]:
# check available tables ones more
spark.sql("show tables").show(20, False)

+--------+--------------------------------+-----------+
|database|tableName                       |isTemporary|
+--------+--------------------------------+-----------+
|default |crime_type_profit_france        |false      |
|default |crime_type_profit_germany       |false      |
|default |crime_type_profit_netherlands   |false      |
|default |crime_type_profit_united_kingdom|false      |
|default |criminals_france                |false      |
|default |criminals_germany               |false      |
|default |criminals_netherlands           |false      |
|default |criminals_united_kingdom        |false      |
|        |id_dates                        |true       |
+--------+--------------------------------+-----------+



# THE GOAL

The goal is using the intel (data in the supplied files) from the police, Interpol, and undercover agents about Europe's criminals to identify the name behind which Moriarty is hiding. 

# SOLUTION. PART 1.

PART 1
-Watson, just like our grand-grand-fathers we are again after Moriarty.

We need to catch him... maybe it is her. All we know is that someone is masterminding unlawful activities and planning something bad. The Interpol agents, with the help of my boys, collected information that should provide us the clues to determine the name Moriarty is hiding behind, and arrest him.

-The data is in the csv and text files and contains info on the criminal activity in the last year as well as high-profile and suspicious sales. They were sent over by our collegues from the neighboring countries: France, Germany, Netherlands, and our own MI-6 in the United Kingdom.

-The first task would be to combine the data into one table. I requested the information on the name, alias, and the location of the last known whereabouts of the criminals, as latitude and longitude, but since the data comes from all around the Europe the columns names may differ between files.

-I am thinking that adding the country to the data might be helpful in our future analysis.

-Lastly, from my correspondence with our undercover agents, all the activity seems to be happening around major financial centers. If those are not in the data, I suppose you can extract the city names using the latitude and logitude. And a map of course, unless your knowledge of Europe's geography is excepitonal.

Data tasks outline:
1. Read the data from the files (named 'criminals_' plus country name) into separate dataframes and add the country name as 'country' column.
2. Identify the city around which the criminals operate and add it to the dataframe as 'city' column.
3. Concatenate the dataframes into a single dataframe with the four original columns renamed to: [name, alias, latitude, longitude]
4. Fill NAs in aliases with an empty string.

In [15]:
# union the data for all countries and register the result as separate temporary table
df_criminals_combined = spark.sql("""select * from criminals_france
          union
          select * from criminals_germany
          union
          select * from criminals_netherlands
          union
          select * from criminals_united_kingdom""")

print("criminals_unioned count: {}".format(df_criminals_combined.count()))

df_criminals_combined.registerTempTable("criminals")

df_criminals_combined.show(3)

criminals_unioned count: 1169
+---+--------------------+-----+--------+---------+-------+
| id|                name|alias|latitude|longitude|country|
+---+--------------------+-----+--------+---------+-------+
|  0|Henriette Thomas ...| null| 48.9072|   2.2521| France|
|190|Inès Martins de l...| null| 49.0029|    2.251| France|
|215|         Isaac Labbe| null| 48.8873|   2.3601| France|
+---+--------------------+-----+--------+---------+-------+
only showing top 3 rows



In [16]:
# calculate mean latitude and longitude to identify the major financial centers (cities)
# (copy and paste the lat, lon values into Google Maps)
# dataframe.filter(df['salary'] > 100000).agg({"age": "avg"})

spark.sql("""select country, 
                    AVG(latitude) as avg_lat, 
                    AVG(longitude) as avg_lon
                    from criminals
                    group by country
                    order by country""").show(10, False)


+--------------+------------------+--------------------+
|country       |avg_lat           |avg_lon             |
+--------------+------------------+--------------------+
|France        |48.86060943166301 |2.364566761989648   |
|Germany       |50.097135254831024|8.678964380061988   |
|Netherlands   |52.37530560302734 |4.900951610565185   |
|United Kingdom|51.50456336276983 |-0.12400588218790493|
+--------------+------------------+--------------------+



In [17]:
spark.sql("""select country,
                    ROUND(AVG(latitude), 4) as avg_lat,
                    ROUND(AVG(longitude), 4) as avg_lon
                from criminals
                 group by country
                 order by country""").show(10, False)

+--------------+-------+-------+
|country       |avg_lat|avg_lon|
+--------------+-------+-------+
|France        |48.8606|2.3646 |
|Germany       |50.0971|8.679  |
|Netherlands   |52.3753|4.901  |
|United Kingdom|51.5046|-0.124 |
+--------------+-------+-------+



In [18]:
# add the city name to the df
spark.sql("""select *, 
                case 
                    when country = 'United Kingdom' then 'London'
                    when country = 'France' then 'Paris'
                    when country = 'Germany' then 'Frankfurt'
                    when country = 'Netherlands' then 'Amsterdam'
                end as city
            from criminals""").show(10, False)

+---+-------------------------+-----+--------+---------+-----------+---------+
|id |name                     |alias|latitude|longitude|country    |city     |
+---+-------------------------+-----+--------+---------+-----------+---------+
|0  |Henriette Thomas du Peron|null |48.9072 |2.2521   |France     |Paris    |
|190|Inès Martins de la Morel |null |49.0029 |2.251    |France     |Paris    |
|215|Isaac Labbe              |null |48.8873 |2.3601   |France     |Paris    |
|244|Aimée-Margot Martins     |null |48.885  |2.5288   |France     |Paris    |
|297|Olivier Lopez            |Flako|48.616  |2.1133   |France     |Paris    |
|326|Philippine Traore        |null |49.0171 |2.4338   |France     |Paris    |
|23 |Dipl.-Ing. Dennis Hesse  |null |50.0266 |8.6771   |Germany    |Frankfurt|
|202|Ali Dowerg               |null |50.1855 |8.5548   |Germany    |Frankfurt|
|243|Tomas Mies               |null |50.0033 |8.9106   |Germany    |Frankfurt|
|232|Liam Janssen-Frankhuizen |null |52.5289 |4.8684

In [19]:
# fill nulls with an empty string 
# we'll also assign this new data to a variable name for saving and creating a new table to use later
# (note that 'show' method is moved to the spark dataframe)
criminals_with_city_df = spark.sql("""select id, name,
                case
                    when alias is null then ''
                    else alias
                end as alias,
                country,
                case 
                    when country = 'United Kingdom' then 'London'
                    when country = 'France' then 'Paris'
                    when country = 'Germany' then 'Frankfurt'
                    when country = 'Netherlands' then 'Amsterdam'
                end as city
            from criminals""")

criminals_with_city_df.cache()
criminals_with_city_df.registerTempTable("criminals_with_city")

criminals_with_city_df.show(10, False)

+---+-------------------------+-----+-----------+---------+
|id |name                     |alias|country    |city     |
+---+-------------------------+-----+-----------+---------+
|0  |Henriette Thomas du Peron|     |France     |Paris    |
|190|Inès Martins de la Morel |     |France     |Paris    |
|215|Isaac Labbe              |     |France     |Paris    |
|244|Aimée-Margot Martins     |     |France     |Paris    |
|297|Olivier Lopez            |Flako|France     |Paris    |
|326|Philippine Traore        |     |France     |Paris    |
|23 |Dipl.-Ing. Dennis Hesse  |     |Germany    |Frankfurt|
|202|Ali Dowerg               |     |Germany    |Frankfurt|
|243|Tomas Mies               |     |Germany    |Frankfurt|
|232|Liam Janssen-Frankhuizen |     |Netherlands|Amsterdam|
+---+-------------------------+-----+-----------+---------+
only showing top 10 rows



In [20]:
# check that the data is readable
spark.sql("select distinct country from criminals_with_city").collect()

[Row(country='Germany'),
 Row(country='France'),
 Row(country='United Kingdom'),
 Row(country='Netherlands')]

# Task 2
Add crime_type and profit info to criminals. 
#(merge/join) criminals table with the crime type and profit information.

- Great, Watson! 
- Now we need to know what everyone of those supspects did wrong, that is the crime type, and desirably, how much they profited from it: Moriarty is not a small fish. He is in the category with th largest total sales.

- You'll need to add the crime type and the profit from the files to the table you already put together. Be mindful of the file types. I also believe that the separator in these file maybe different from the files you used previously.
-Moriarty made one of the top 5 sales last year. He is not stupid for nicknames, I am pretty sure he doesn't have an alias.


# Solution (task 2)

In [21]:
# check the tables that currently available
spark.sql("show tables").show(20, False)

+--------+--------------------------------+-----------+
|database|tableName                       |isTemporary|
+--------+--------------------------------+-----------+
|default |crime_type_profit_france        |false      |
|default |crime_type_profit_germany       |false      |
|default |crime_type_profit_netherlands   |false      |
|default |crime_type_profit_united_kingdom|false      |
|default |criminals_france                |false      |
|default |criminals_germany               |false      |
|default |criminals_netherlands           |false      |
|default |criminals_united_kingdom        |false      |
|        |criminals                       |true       |
|        |criminals_with_city             |true       |
|        |id_dates                        |true       |
+--------+--------------------------------+-----------+



In [22]:
crime_profit = spark.sql("""select * from crime_type_profit_france
          union
          select * from crime_type_profit_germany
          union
          select * from crime_type_profit_netherlands
          union
          select * from crime_type_profit_united_kingdom""")

print("crime_profit count: {}".format(crime_profit.count()))

crime_profit.registerTempTable("crime_profit")

crime_profit.show(3)

crime_profit count: 1169
+---------------+-------------+------+-----------+
|           name|   crime_type|profit|    country|
+---------------+-------------+------+-----------+
|    Isaac David|        theft|   204|     France|
|Nikolaj Kallert|pickpocketing|    47|    Germany|
|   Leah Winters|    drug sale|  3510|Netherlands|
+---------------+-------------+------+-----------+
only showing top 3 rows



In [23]:
# table check
spark.sql("select count(distinct country) as country_count from crime_profit").show(1, False)

+-------------+
|country_count|
+-------------+
|4            |
+-------------+



In [24]:
# check columns
spark.sql("select * from crime_profit").columns

['name', 'crime_type', 'profit', 'country']

In [25]:
spark.sql("""select  a.id, a.name, a.alias, b.crime_type, b.profit, b.country, a.city
            from criminals_with_city a
            left join crime_profit b
                on a.name = b.name and a.country = b.country""").show(10, False)

+---+-------------------------+-----+-------------+------+-----------+---------+
|id |name                     |alias|crime_type   |profit|country    |city     |
+---+-------------------------+-----+-------------+------+-----------+---------+
|0  |Henriette Thomas du Peron|     |robbery      |558   |France     |Paris    |
|190|Inès Martins de la Morel |     |pickpocketing|44    |France     |Paris    |
|215|Isaac Labbe              |     |drug sale    |37260 |France     |Paris    |
|244|Aimée-Margot Martins     |     |pickpocketing|26    |France     |Paris    |
|297|Olivier Lopez            |Flako|theft        |318   |France     |Paris    |
|326|Philippine Traore        |     |theft        |320   |France     |Paris    |
|23 |Dipl.-Ing. Dennis Hesse  |     |theft        |44    |Germany    |Frankfurt|
|202|Ali Dowerg               |     |theft        |211   |Germany    |Frankfurt|
|243|Tomas Mies               |     |pickpocketing|27    |Germany    |Frankfurt|
|232|Liam Janssen-Frankhuize

In [26]:
spark.sql("""select count(*) as row_count from (
                select  a.id, a.name, a.alias, b.crime_type, b.profit, b.country
                from criminals a
                left join crime_profit b
                    on a.name = b.name and a.country = b.country)""").show(10, False)

+---------+
|row_count|
+---------+
|1169     |
+---------+



In [27]:
# order by profit (descending) and 
# cast profit as int (to keep the same with pyspark notebook; not necessary here)
spark.sql("""select  a.id, a.name, a.alias, b.crime_type, CAST(b.profit AS INT), b.country
            from criminals a
            left join crime_profit b
                on a.name = b.name and a.country = b.country
            order by profit DESC""").show(10, False)

+---+------------------------+----------+------------+------+--------------+
|id |name                    |alias     |crime_type  |profit|country       |
+---+------------------------+----------+------------+------+--------------+
|302|Odette Renard du Michaud|null      |weapons sale|498000|France        |
|58 |Anthony Mitchell        |null      |weapons sale|495000|United Kingdom|
|307|Gabriel Le Schneider    |null      |weapons sale|493000|France        |
|62 |Malcolm Cox-Mason       |Handlebars|weapons sale|491000|United Kingdom|
|25 |Lily Walter             |Montana   |weapons sale|484000|Netherlands   |
|174|Univ.Prof. Augusta Putz |Kiki      |weapons sale|474000|Germany       |
|203|Gordon Parker           |null      |weapons sale|467000|United Kingdom|
|245|Andrew Smith            |null      |weapons sale|466000|United Kingdom|
|171|Constance du Laurent    |null      |weapons sale|453000|France        |
|200|Valentine Meunier       |null      |weapons sale|435000|France        |

In [28]:
# find the most profitable crime type

spark.sql("""select crime_type, CAST(sum(profit) AS INT) as total_profit
            from criminals a
            left join crime_profit b
                on a.name = b.name and a.country = b.country
            group by crime_type
            order by total_profit DESC""").show(10, False)

+-------------+------------+
|crime_type   |total_profit|
+-------------+------------+
|weapons sale |14942000    |
|drug sale    |2214270     |
|robbery      |96582       |
|theft        |95702       |
|forgery      |37863       |
|pickpocketing|6359        |
+-------------+------------+



In [29]:
# remind ourselves the tables
spark.sql("show tables").show(20, False)

+--------+--------------------------------+-----------+
|database|tableName                       |isTemporary|
+--------+--------------------------------+-----------+
|default |crime_type_profit_france        |false      |
|default |crime_type_profit_germany       |false      |
|default |crime_type_profit_netherlands   |false      |
|default |crime_type_profit_united_kingdom|false      |
|default |criminals_france                |false      |
|default |criminals_germany               |false      |
|default |criminals_netherlands           |false      |
|default |criminals_united_kingdom        |false      |
|        |crime_profit                    |true       |
|        |criminals                       |true       |
|        |criminals_with_city             |true       |
|        |id_dates                        |true       |
+--------+--------------------------------+-----------+



In [30]:
# find the country where with the highest weapons sales

spark.sql("""select a.country, CAST(sum(profit) AS INT) as total_profit
            from criminals a
            left join crime_profit b
                on a.name = b.name and a.country = b.country
            where crime_type = 'weapons sale'
            group by a.country
            order by total_profit DESC
            """).show(10, False)

+--------------+------------+
|country       |total_profit|
+--------------+------------+
|France        |6312000     |
|United Kingdom|3914000     |
|Germany       |2365000     |
|Netherlands   |2351000     |
+--------------+------------+



In [31]:
# add crime and profit and register new table

criminals_with_profit = spark.sql("""select  a.id, a.name, a.alias, b.crime_type, b.profit, b.country
            from criminals_with_city a
            left join crime_profit b
                on a.name = b.name and a.country = b.country""")

criminals_with_profit.cache()
criminals_with_profit.registerTempTable("criminals_with_profit")

In [32]:
spark.sql("show tables").show(20, False)

+--------+--------------------------------+-----------+
|database|tableName                       |isTemporary|
+--------+--------------------------------+-----------+
|default |crime_type_profit_france        |false      |
|default |crime_type_profit_germany       |false      |
|default |crime_type_profit_netherlands   |false      |
|default |crime_type_profit_united_kingdom|false      |
|default |criminals_france                |false      |
|default |criminals_germany               |false      |
|default |criminals_netherlands           |false      |
|default |criminals_united_kingdom        |false      |
|        |crime_profit                    |true       |
|        |criminals                       |true       |
|        |criminals_with_city             |true       |
|        |criminals_with_profit           |true       |
|        |id_dates                        |true       |
+--------+--------------------------------+-----------+



In [33]:
spark.sql("select * from criminals_with_profit").show(5, False)

+---+-------------------------+-----+-------------+------+-------+
|id |name                     |alias|crime_type   |profit|country|
+---+-------------------------+-----+-------------+------+-------+
|0  |Henriette Thomas du Peron|     |robbery      |558   |France |
|190|Inès Martins de la Morel |     |pickpocketing|44    |France |
|215|Isaac Labbe              |     |drug sale    |37260 |France |
|244|Aimée-Margot Martins     |     |pickpocketing|26    |France |
|297|Olivier Lopez            |Flako|theft        |318   |France |
+---+-------------------------+-----+-------------+------+-------+
only showing top 5 rows



In [34]:
# select top 5 to view top criminals in the selected country with selected sales

spark.sql("""select id, name, CAST(profit as INT) 
            from criminals_with_profit 
            where crime_type LIKE 'weapon%' 
                and country = 'France'
            order by profit DESC
            """).show(5, False)

+---+------------------------+------+
|id |name                    |profit|
+---+------------------------+------+
|302|Odette Renard du Michaud|498000|
|307|Gabriel Le Schneider    |493000|
|171|Constance du Laurent    |453000|
|200|Valentine Meunier       |435000|
|19 |René Tessier du Lagarde |423000|
+---+------------------------+------+
only showing top 5 rows



# PART 3

-Watson, I think we got the last piece of the puzzle!

I learned that Moriarty doesn't do his dealings on Sunday.

That means that the top seller (in the country with the top sale in the last year) who didn't sell on a Sunday and who doesn't have an aliase will be him.

All we have to do now is add the date information I just got and determine the weekday for that date. We already know the rest.

And we'll send Lestrade right after him!


In [35]:
spark.sql("select *, weekday(date) as day_num from id_dates").show(5, False)

+---+----------+-------+-------+
|id |date      |country|day_num|
+---+----------+-------+-------+
|0  |2020-06-15|France |0      |
|1  |2020-01-06|France |0      |
|2  |2020-08-03|France |0      |
|3  |2020-06-19|France |4      |
|4  |2020-11-20|France |4      |
+---+----------+-------+-------+
only showing top 5 rows



In [36]:
spark.sql("select *, weekday(date) as day_num from id_dates").registerTempTable("dates_with_day_number")
spark.sql("select * from dates_with_day_number").show(5, False)

+---+----------+-------+-------+
|id |date      |country|day_num|
+---+----------+-------+-------+
|0  |2020-06-15|France |0      |
|1  |2020-01-06|France |0      |
|2  |2020-08-03|France |0      |
|3  |2020-06-19|France |4      |
|4  |2020-11-20|France |4      |
+---+----------+-------+-------+
only showing top 5 rows



In [37]:
# use a known date to determine how day numbering is setup by default
# 2020-12-14 is Monday
spark.sql("select weekday('2020-12-14') as num").show(1, False)

+---+
|num|
+---+
|0  |
+---+



In [38]:
spark.sql("""select *, 
            case day_num
             when 0  then 'Monday'
             when 1 then'Tuesday'
             when 2 then 'Wednesday'
             when 3 then 'Thursday'
             when 4 then'Friday'
             when 5 then 'Saturday'
             when 6 then 'Sunday'
        end as weekday
        from dates_with_day_number""").show(10, False)


+---+----------+-------+-------+--------+
|id |date      |country|day_num|weekday |
+---+----------+-------+-------+--------+
|0  |2020-06-15|France |0      |Monday  |
|1  |2020-01-06|France |0      |Monday  |
|2  |2020-08-03|France |0      |Monday  |
|3  |2020-06-19|France |4      |Friday  |
|4  |2020-11-20|France |4      |Friday  |
|5  |2020-01-11|France |5      |Saturday|
|6  |2020-04-23|France |3      |Thursday|
|7  |2020-09-27|France |6      |Sunday  |
|8  |2020-05-08|France |4      |Friday  |
|9  |2020-04-09|France |3      |Thursday|
+---+----------+-------+-------+--------+
only showing top 10 rows



In [39]:
# select only needed columns and register a table
df = spark.sql("""select id, date,
            case day_num
             when 0  then 'Monday'
             when 1 then'Tuesday'
             when 2 then 'Wednesday'
             when 3 then 'Thursday'
             when 4 then'Friday'
             when 5 then 'Saturday'
             when 6 then 'Sunday'
        end as weekday,
        country
        from dates_with_day_number""")

df.registerTempTable("id_dates_with_weekday")

df.show(10, False)

+---+----------+--------+-------+
|id |date      |weekday |country|
+---+----------+--------+-------+
|0  |2020-06-15|Monday  |France |
|1  |2020-01-06|Monday  |France |
|2  |2020-08-03|Monday  |France |
|3  |2020-06-19|Friday  |France |
|4  |2020-11-20|Friday  |France |
|5  |2020-01-11|Saturday|France |
|6  |2020-04-23|Thursday|France |
|7  |2020-09-27|Sunday  |France |
|8  |2020-05-08|Friday  |France |
|9  |2020-04-09|Thursday|France |
+---+----------+--------+-------+
only showing top 10 rows



In [40]:
df = spark.sql("""select a.*, b.weekday from 
                criminals_with_profit a
                inner join id_dates_with_weekday b 
                on a.id = b.id 
                and a.country = b.country""")

df.show(5, False)

+---+-------------------------+-----+-------------+------+-------+--------+
|id |name                     |alias|crime_type   |profit|country|weekday |
+---+-------------------------+-----+-------------+------+-------+--------+
|0  |Henriette Thomas du Peron|     |robbery      |558   |France |Monday  |
|190|Inès Martins de la Morel |     |pickpocketing|44    |France |Friday  |
|215|Isaac Labbe              |     |drug sale    |37260 |France |Thursday|
|244|Aimée-Margot Martins     |     |pickpocketing|26    |France |Friday  |
|297|Olivier Lopez            |Flako|theft        |318   |France |Sunday  |
+---+-------------------------+-----+-------------+------+-------+--------+
only showing top 5 rows



In [41]:
df.registerTempTable("criminals_with_weekday")

In [42]:
# Show top 5 salesmen in the selected country
df_final = spark.sql("""select name, CAST(profit AS INT)
                            from criminals_with_weekday
                            where
                                country = 'France' 
                                and alias = '' 
                                and crime_type = 'weapons sale'
                                and weekday != 'Sunday'
                                order by profit DESC""")

df_final.show(10, False)

+------------------------+------+
|name                    |profit|
+------------------------+------+
|Odette Renard du Michaud|498000|
|Constance du Laurent    |453000|
|René Tessier du Lagarde |423000|
|Zoé Guibert de la Levy  |364000|
|Denis Lesage            |328000|
|Henriette Charpentier   |282000|
|Thomas Couturier        |267000|
|daisy Toussaint         |266000|
|Suzanne Levy            |216000|
|Zacharie Samson         |208000|
+------------------------+------+
only showing top 10 rows



In [43]:

moriarty_name =  df_final.select("name").collect()[0][0]
print("The name Moriarty is hiding behind: {}".format(moriarty_name))

The name Moriarty is hiding behind: Odette Renard du Michaud


In [44]:
# moriarty_name = Odette Renard du Michaud