In [None]:
# The first part of the notebook creates the tables with the data.
# The second part uses pysparkSQL to solve the puzzle.

# Just like the 'pyspark_solution' notebook, the notebook uses pyspark as the data retrieving framework 
# but uses complete Hive statements to select the data.

# The statements, taken separately, could be used to retrieve the data from permanent tables using pure Hive.

# Being able to read and write Hive statements, which closely resemble SQL statements,
# is a useful skill in the area of data science.

In [None]:
# create SparkSession to use pysparkSQL

import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# import data types to cast data and define schema
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, \
                                DecimalType, FloatType

# Build (or reuse) a local SparkSession with Hive support so that complete
# Hive/SQL statements can be executed through spark.sql(...).
spark = (
    SparkSession
    .builder
    .master("local")
    .appName("Python Spark SQL basic example")
    .config("spark.sql.warehouse.dir", "hdfs://namenode/sql/metadata/hive")
    .enableHiveSupport()
    .getOrCreate()
)

# Folder in which the ORC files backing the external Hive tables are written.
# The original author's absolute path stays the default; set the HIVE_DATA_DIR
# environment variable to point the notebook at a folder on another machine.
hive_folder = os.environ.get(
    "HIVE_DATA_DIR",
    "/Users/vk/Documents/Python/holmes_moriarty_sql/src/hive_data",
)

In [None]:
%rm -rf metastore_db/  # removes any tables that might be present from previous runs

In [None]:
# Select the database in which the tables will be created.
# 'default' is a pre-created database, so this statement could be skipped;
# it is kept to make explicit which database is used.
spark.sql("use default").show(n=10, truncate=False)

In [None]:
# list the available databases
spark.sql("show databases").show(n=10, truncate=False)

In [None]:
# no tables have been registered at this point
spark.sql("show tables").show(n=10, truncate=False)

In [None]:
# list the contents of the 'data' directory
%ls data

In [None]:
# Define a template that will be used to register the 'criminals' table for each country.
# The first '{}' placeholder receives the table-name suffix (the country) and the
# second one the location of the ORC files backing the external table.
criminals_country_template="""CREATE EXTERNAL TABLE IF NOT EXISTS criminals_{} (
 id int,
 name string,
 alias string,
 latitude float,
 longitude float,
 country string
    )
STORED AS ORC
LOCATION '{}'
"""

In [None]:
def apply_schema(df, schema):
    """Return a new data frame holding the rows of ``df`` typed by ``schema``.

    The data frame is rebuilt from the RDD of ``df`` through the notebook-level
    ``spark`` session, with ``schema`` supplying the column names and data types.
    """
    return spark.createDataFrame(df.rdd, schema=schema)

def rename_cols(df, new_col_names):
    """Rename the columns of ``df`` by position.

    The i-th column of ``df`` receives the i-th entry of ``new_col_names``.
    When fewer names than columns are supplied the trailing columns keep
    their current names; surplus names are ignored.

    All columns are renamed in a single ``toDF`` call. Renaming one column at
    a time with ``withColumnRenamed`` matches columns by name, so a new name
    that equals a not-yet-renamed column (e.g. swapping two names) would
    rename both columns on a later step.

    Returns the data frame with the renamed columns.
    """
    current_names = list(df.columns)
    # only as many new names as there are columns can be applied
    replacements = list(new_col_names)[:len(current_names)]
    # columns beyond the supplied names keep their existing names
    final_names = replacements + current_names[len(replacements):]
    return df.toDF(*final_names)

In [None]:
# Read the per-country 'criminals' CSV files, store each one as ORC under
# hive_folder and register it as an external Hive table.

country_list = ["United Kingdom", "Germany", "Netherlands", "France"]

# column names and types shared by every 'criminals' table
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("alias", StringType(), True)
    .add("latitude", FloatType(), True)
    .add("longitude", FloatType(), True)
    .add("country", StringType(), True)
)

# positional names given to the columns read from each CSV file
new_col_names = ["id", "name", "alias", "latitude", "longitude"]

for country_ in country_list:
    csv_path = "./data/criminals_{}.csv".format(country_)
    criminals_raw = spark.read.csv(csv_path, header=True, inferSchema=True)
    print("Country: {}, rows: {}".format(country_, criminals_raw.count()))

    # unify the column names and remember the country of origin
    criminals_named = rename_cols(criminals_raw, new_col_names) \
        .withColumn('country', F.lit(country_))
    criminals_typed = apply_schema(criminals_named, schema)

    # 'United Kingdom' has a space in it, which is illegal in a table name
    table_suffix = "_".join(country_.split()).lower()
    orc_location = os.path.join(hive_folder, 'criminals_{}'.format(table_suffix))

    criminals_typed.persist()
    criminals_typed.write.mode("overwrite").orc(orc_location)
    criminals_typed.unpersist()

    # register the ORC files as an external table
    spark.sql(criminals_country_template.format(table_suffix, orc_location))

spark.sql("show tables").show(n=20, truncate=False)

In [None]:
# peek at the first rows of one of the registered tables
spark.sql("select * from criminals_united_kingdom").show(n=3, truncate=False)

# Create crime and profit tables for each country

In [None]:
# Template used to register the 'crime_type_profit' table for each country.
# The first '{}' placeholder receives the table-name suffix (the country) and
# the second one the location of the ORC files backing the external table.
create_table_template="""CREATE EXTERNAL TABLE IF NOT EXISTS crime_type_profit_{} (
 name string,
 crime_type string,
 profit string,
 country string
    )
STORED AS ORC
LOCATION '{}'
"""

# Read the per-country 'crime_type_profit' text files, store each one as ORC
# under hive_folder and register it as an external Hive table.

country_list = ["United Kingdom", "Germany", "Netherlands", "France"]

# column names and types shared by every 'crime_type_profit' table
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("crime_type", StringType(), True)
    .add("profit", StringType(), True)
    .add("country", StringType(), True)
)

for country_ in country_list:
    txt_path = "./data/crime_type_profit_{}.txt".format(country_)
    # these files are space-separated
    profit_raw = spark.read.csv(txt_path, header=True, inferSchema=True, sep=" ")
    print("Country: {}, rows: {}".format(country_, profit_raw.count()))

    # remember the country of origin
    profit_with_country = profit_raw.withColumn('country', F.lit(country_))

    # 'United Kingdom' has a space in it, which is illegal in a table name
    table_suffix = "_".join(country_.split()).lower()
    print("country_: ", table_suffix)
    orc_location = os.path.join(hive_folder, 'crime_type_profit_{}'.format(table_suffix))
    print("location: ", orc_location)

    profit_typed = apply_schema(profit_with_country, schema)
    profit_typed.persist()
    profit_typed.write.mode("overwrite").orc(orc_location)
    profit_typed.unpersist()

    # register the ORC files as an external table
    spark.sql(create_table_template.format(table_suffix, orc_location))

spark.sql("show tables").show(n=20, truncate=False)


# Create the table with crime dates

In [None]:

# Load the crime dates and expose them to Spark SQL under the name 'id_dates'.
id_dates = spark.read.csv("./data/id_dates.csv", header=True, inferSchema=True)
print("id_dates shape: {}".format(id_dates.count()))
# registerTempTable is deprecated since Spark 2.0; createOrReplaceTempView is its replacement
id_dates.createOrReplaceTempView("id_dates")
spark.sql("select count(distinct country) from id_dates").show(1, False)

In [None]:
# check the available tables once more
spark.sql("show tables").show(n=20, truncate=False)

# THE GOAL

The goal is to use the intel (data in the supplied files) about Europe's criminals from the police, Interpol, and undercover agents  to identify the name behind which Moriarty is hiding. 

# SOLUTION

# PART 1 

-Watson, just like our great-grandfathers we are after Moriarty again.

We need to catch him... or maybe it is a her. All we know is that someone is masterminding unlawful activities and planning something terrible. The Interpol agents, with the help of my informants, collected information that should provide us the clues to determine the name Moriarty is hiding behind, and arrest him.

-The data is in the csv and text files and contains info on the criminal activity in the last year as well as high-profile and suspicious sales. They were sent over by our colleagues from the neighboring countries: France, Germany, the Netherlands, and our own MI-6 in the United Kingdom.

-The first task would be to combine the data into one table. I requested the information on the name, alias, and the location of the last known whereabouts of the criminals, as latitude and longitude, but since the data comes from all around Europe, the column names may differ between files.

-I am thinking that adding the country of origin to the data might be helpful in our future analysis.

-Lastly, from my correspondence with our undercover agents, all the activity seems to be happening around major financial centers. If those centers are not in the data, I suppose you can extract the city names using the latitude and longitude. And a map of course, unless your knowledge of Europe's geography is exceptional.

Data tasks outline:
1. Read the data from the files (named 'criminals_' plus country name) into separate dataframes and add the country name as 'country' column.
2. Identify the city around which the criminals operate and add it to the dataframe as 'city' column.
3. Concatenate the dataframes into a single dataframe with the four original columns renamed to: [name, alias, latitude, longitude]
4. Fill NAs in aliases with an empty string.

In [None]:
# Complete 'select' SQL statements can be used as part of pysparkSQL. A general example:
temp_df = spark.sql("""select * from criminals_france 
                            where name like '%a'
                            order by name""")
# Make the result queryable by name in later SQL statements.
# registerTempTable is deprecated since Spark 2.0; createOrReplaceTempView is its replacement
temp_df.createOrReplaceTempView("temp_df")
temp_df.show(2, False)

In [None]:
# your code below

# Task 2 
Add crime_type and profit info to criminals. 
#(merge/join) criminals table with the crime type and profit information.


-Great, Watson! 

-Now we need to know what each one did wrong, that is the crime type, and desirably, how much they profited from it: Moriarty is no small fish. He is in the category with the largest total sales.

-You'll need to add the crime type and the profit from the files to the table you already put together. Be mindful of the file types. I also believe that the separator in these files may be different from the one in the files you used previously. 

-Moriarty made one of the top 5 sales last year. He is not stupid for nicknames, I am pretty sure he doesn't have an alias.



# Solution (task 2)

In [None]:
# your code below

# PART 3

-Watson, I think we got the last piece of the puzzle!

-I just learnt that Moriarty doesn't do his dealings on Sunday.

-That means that the top seller (in the country with the top sale in the last year) who didn't sell on a Sunday and who doesn't have an alias will be him.

-All we have to do now is add the date information I just got and determine the weekday for that date. We already know the rest.

-And we'll send Lestrade right after him!


In [None]:
# your code below

In [None]:
# moriarty_name = ?