# Glide Technical Exercise  

## 1) Import CSV files into a PySpark DataFrame

In [23]:
# required classes
import os

import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

In [2]:
# set up configuration
# Location of the GCS connector jar that lets Spark read gs:// paths.
GCS_CONNECTOR_JAR = "/home/canovasjm/spark/spark-3.0.3-bin-hadoop3.2/jars/gcs-connector-hadoop3-latest.jar"

# Resolve the service-account keyfile ONCE so that the Spark-level setting and
# the Hadoop-level setting below always agree (previously one of them held a
# placeholder path). GOOGLE_APPLICATION_CREDENTIALS overrides the default so
# the notebook is not tied to a single machine's home directory.
GCP_KEYFILE = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/canovasjm/.google/credentials/google_credentials.json",
)

conf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('glide')
    .set("spark.jars", GCS_CONNECTOR_JAR)
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", GCP_KEYFILE)
)

# getOrCreate() keeps this cell re-runnable: constructing a second SparkContext
# in the same kernel raises, whereas getOrCreate() returns the running one.
# NOTE: if a context already exists, `conf` is NOT re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

# Register the GCS filesystem implementations and credentials on the Hadoop
# configuration used by the running context.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", GCP_KEYFILE)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

22/03/03 00:44:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# build Spark session
# Reuse the configuration of the SparkContext created above so the session
# sees the same GCS connector settings.
session_builder = SparkSession.builder.config(conf=sc.getConf())
spark = session_builder.getOrCreate()

In [12]:
# read CSV files located in GCS bucket
# First row of each file is treated as the header; no schema is supplied here.
csv_reader = spark.read.option("header", "true")
df = csv_reader.csv("gs://dtc_data_lake_deng-338919/glide/*")

                                                                                

In [28]:
# quick visual sanity check: display at most the first 10 rows of df
df.show(n=10)

[Stage 13:>                                                         (0 + 1) / 1]

+-------------+---------------+------+----------+---------+------+--------------------+------------+------+----------------+
|snapshot_date|employee_number|status|first_name|last_name|gender|               email|phone_number|salary|termination_date|
+-------------+---------------+------+----------+---------+------+--------------------+------------+------+----------------+
|   2020-01-05|              1|Active| Frederick|   Barnes|  Male|f.barnes@newmail.com| 094-8926-78| 38582|            null|
|   2020-01-05|              2|Active|    Alford|    Grant|  Male| a.grant@newmail.com| 389-8947-85| 53126|            null|
|   2020-01-05|              3|Active|    Sydney|  Farrell|Female|s.farrell@newmail...| 187-8343-84|151217|            null|
|   2020-01-05|              3|Active|    Sydney|  Farrell|Female|s.farrell@newmail...| 187-8343-84|151217|            null|
|   2020-01-05|              4|Active|     Rosie| Richards|Female|r.richards@newmai...| 357-9337-53|162461|            null|


                                                                                

In [15]:
# print the shape of df as (number of rows, number of columns)
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))



(495, 10)


                                                                                

In [21]:
# Check the df schema. The CSV read above supplied no schema and did not
# enable schema inference, so every column comes back as a nullable string.
df.schema

StructType(List(StructField(snapshot_date,StringType,true),StructField(employee_number,StringType,true),StructField(status,StringType,true),StructField(first_name,StringType,true),StructField(last_name,StringType,true),StructField(gender,StringType,true),StructField(email,StringType,true),StructField(phone_number,StringType,true),StructField(salary,StringType,true),StructField(termination_date,StringType,true)))

So far, all the columns in our data frame are of type string. This is not optimal and will not allow us to do further manipulation of this data frame, so we need to fix it before moving on.

In [24]:
from pyspark.sql import types

In [25]:
# define a proper schema
# (column name, Spark type) pairs, listed in the same order as the CSV columns.
_COLUMN_TYPES = [
    ('snapshot_date', types.DateType()),
    ('employee_number', types.IntegerType()),
    ('status', types.StringType()),
    ('first_name', types.StringType()),
    ('last_name', types.StringType()),
    ('gender', types.StringType()),
    ('email', types.StringType()),
    ('phone_number', types.StringType()),
    ('salary', types.IntegerType()),
    ('termination_date', types.DateType()),
]

# Every field is declared nullable (third argument True).
schema = types.StructType([
    types.StructField(column_name, column_type, True)
    for column_name, column_type in _COLUMN_TYPES
])

In [26]:
# read CSV files located in GCS bucket again, this time specifying the schema we defined above
# so that dates and integers are parsed into their proper types.
df = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("gs://dtc_data_lake_deng-338919/glide/*")
)

In [27]:
# Check the schema again: the date and integer columns should now carry
# DateType / IntegerType instead of StringType.
df.schema

StructType(List(StructField(snapshot_date,DateType,true),StructField(employee_number,IntegerType,true),StructField(status,StringType,true),StructField(first_name,StringType,true),StructField(last_name,StringType,true),StructField(gender,StringType,true),StructField(email,StringType,true),StructField(phone_number,StringType,true),StructField(salary,IntegerType,true),StructField(termination_date,DateType,true)))

## 2)  Some data exploration

In [29]:
# show the rows that have a termination date set
terminated_rows = df.where(df["termination_date"].isNotNull())
terminated_rows.show()



+-------------+---------------+--------+----------+---------+------+------------------+------------+------+----------------+
|snapshot_date|employee_number|  status|first_name|last_name|gender|             email|phone_number|salary|termination_date|
+-------------+---------------+--------+----------+---------+------+------------------+------------+------+----------------+
|   2020-01-04|             25|Inactive|    Stella|     Hunt|Female|s.hunt@newmail.com| 194-7397-62|122746|      2020-01-04|
+-------------+---------------+--------+----------+---------+------+------------------+------------+------+----------------+



                                                                                

In [31]:
# count the rows whose employee number is missing
df.where(df["employee_number"].isNull()).count()

                                                                                

0