In [0]:
import urllib # URL processing
import urllib.parse # load the submodule explicitly; `import urllib` alone does not guarantee urllib.parse exists

from pyspark.sql.types import *
from pyspark.sql.functions import * # pyspark functions

# Load the AWS credentials that are stored in a Delta table.
# NOTE(review): a secrets manager would avoid keeping keys in a readable table — consider migrating.
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"
keys_df = spark.read.format("delta").load(delta_table_path)

# Fetch both values from the SAME row in a single Spark job. Two independent
# collect()[0] calls are not guaranteed to return matching rows and cost two jobs.
credentials_row = keys_df.select('Access key ID', 'Secret access key').first()
if credentials_row is None:
    # first() returns None on an empty frame; fail with a message that says why
    raise ValueError(f"No credentials found in Delta table at {delta_table_path}")

ACCESS_KEY = credentials_row['Access key ID']
SECRET_KEY = credentials_row['Secret access key']
# URL-encode the secret key (safe="" also escapes '/')
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [0]:
%sql
-- Disable format checks during the reading of Delta tables
-- (sets the Spark conf spark.databricks.delta.formatCheck.enabled to false).
-- NOTE(review): this cell sits after the cell that already loads the credentials
-- Delta table; confirm whether the setting is still required or should run first.
SET spark.databricks.delta.formatCheck.enabled=false

key,value
spark.databricks.delta.formatCheck.enabled,False


In [0]:
def read_kinesis_stream(stream_name, region='us-east-1', initial_position='earliest'):
    """Open a Kinesis stream as a streaming DataFrame of text payloads.

    Args:
        stream_name: Name of the Kinesis stream to read.
        region: AWS region of the stream.
        initial_position: Where to start reading when no checkpoint exists.

    Returns:
        A streaming DataFrame with a single string column, ``data``, holding
        the record payload cast to text (JSON text in the sample output).
    """
    raw_stream = (
        spark.readStream
        .format('kinesis')
        .option('streamName', stream_name)
        .option('initialPosition', initial_position)
        .option('region', region)
        .option('awsAccessKey', ACCESS_KEY)
        .option('awsSecretKey', SECRET_KEY)
        .load()
    )
    # keep only the payload, cast to STRING so it is readable text
    return raw_stream.selectExpr("CAST(data as STRING)")


# One streaming frame per topic; all three share the same connection settings.
df_pin = read_kinesis_stream('streaming-126ca3664fbb-pin')
df_geo = read_kinesis_stream('streaming-126ca3664fbb-geo')
df_user = read_kinesis_stream('streaming-126ca3664fbb-user')

# display(df_pin) # check this one because of the duff item of data (['Maya',...], etc)


data
"{""index"":1,""name"":""Maya"",""age"":25,""role"":""engineer""}"
"{""index"":7528,""unique_id"":""fbe53c66-3442-4773-b19e-d3ec6f54dddf"",""title"":""No Title Data Available"",""description"":""No description available Story format"",""poster_name"":""User Info Error"",""follower_count"":""User Info Error"",""tag_list"":""N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e"",""is_image_or_video"":""multi-video(story page format)"",""image_src"":""Image src error."",""downloaded"":0,""save_location"":""Local save in /data/mens-fashion"",""category"":""mens-fashion""}"
"{""index"":{""index"":7528,""unique_id"":""fbe53c66-3442-4773-b19e-d3ec6f54dddf"",""title"":""No Title Data Available"",""description"":""No description available Story format"",""poster_name"":""User Info Error"",""follower_count"":""User Info Error"",""tag_list"":""N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e"",""is_image_or_video"":""multi-video(story page format)"",""image_src"":""Image src error."",""downloaded"":0,""save_location"":""Local save in /data/mens-fashion"",""category"":""mens-fashion""}}"
"{""index"":{""index"":2863,""unique_id"":""9bf39437-42a6-4f02-99a0-9a0383d8cd70"",""title"":""25 Super Fun Summer Crafts for Kids - Of Life and Lisa"",""description"":""Keep the kids busy this summer with these easy diy crafts and projects. Creative and‚Ä¶"",""poster_name"":""Of Life & Lisa | Lifestyle Blog"",""follower_count"":""124k"",""tag_list"":""Summer Crafts For Kids,Fun Crafts For Kids,Summer Kids,Toddler Crafts,Crafts To Do,Diy For Kids,Summer Snow,Diys For Summer,Craft Ideas For Girls"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/b3/bc/e2/b3bce2964e8c8975387b39660eed5f16.jpg"",""downloaded"":1,""save_location"":""Local save in /data/diy-and-crafts"",""category"":""diy-and-crafts""}}"
"{""index"":{""index"":5730,""unique_id"":""1e1f0c8b-9fcf-460b-9154-c775827206eb"",""title"":""Island Oasis Coupon Organizer"",""description"":""Description Coupon Organizer in a fun colorful fabric -island oasis, Great Size for the \""basic\"" couponer - holds up to 500 coupons with ease, and is made long enough so that you‚Ä¶¬†"",""poster_name"":""Consuelo Aguirre"",""follower_count"":""0"",""tag_list"":""Grocery Items,Grocery Coupons,Care Organization,Coupon Organization,Extreme Couponing,Couponing 101,Life Binder,Save My Money,Love Coupons"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/65/bb/ea/65bbeaf458907bb079317d8303c4fa0e.jpg"",""downloaded"":1,""save_location"":""Local save in /data/finance"",""category"":""finance""}}"
"{""index"":{""index"":8304,""unique_id"":""5b6d0913-25e4-43ab-839d-85d5516f78a4"",""title"":""The #1 Reason You‚Äôre Not His Priority Anymore - Matthew Coast"",""description"":""#lovequotes #matchmaker #matchmadeinheaven #loveyourself #respectyourself"",""poster_name"":""Commitment Connection"",""follower_count"":""51k"",""tag_list"":""Wise Quotes,Quotable Quotes,Words Quotes,Wise Words,Quotes To Live By,Great Quotes,Motivational Quotes,Inspirational Quotes,Funny Quotes"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/c6/64/ee/c664ee71524fb5a6e7b7b49233f93b43.png"",""downloaded"":1,""save_location"":""Local save in /data/quotes"",""category"":""quotes""}}"
"{""index"":{""index"":8731,""unique_id"":""ea760f71-febf-4023-b592-d17396659039"",""title"":""20 Koi Fish Tattoos For Lucky Men"",""description"":""Koi fish tattoos are a popular choice for men who want to make a statement, thanks to their rich symbolism and bold design."",""poster_name"":""TheTrendSpotter"",""follower_count"":""211k"",""tag_list"":""Dr Tattoo,W√∂rter Tattoos,Pisces Tattoos,Tatoo Art,Dream Tattoos,Dope Tattoos,Mini Tattoos,Finger Tattoos,Body Art Tattoos"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/8a/0c/0a/8a0c0a7b6236565c519acd41ad1a52c0.jpg"",""downloaded"":1,""save_location"":""Local save in /data/tattoos"",""category"":""tattoos""}}"
"{""index"":{""index"":1313,""unique_id"":""44662045-e891-4821-8a19-ebe7eedd371a"",""title"":""Liquid Lash Extensions Mascara"",""description"":""Instantly create the look of lash extensions with this award-winning, best-selling mascara that won't clump, flake or smudge. Available in 3 shades!"",""poster_name"":""Thrive Causemetics"",""follower_count"":""43k"",""tag_list"":""N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e"",""is_image_or_video"":""video"",""image_src"":""https://i.pinimg.com/videos/thumbnails/originals/69/84/e2/6984e20f3e262098fa9c0614c3453254.0000001.jpg"",""downloaded"":1,""save_location"":""Local save in /data/beauty"",""category"":""beauty""}}"
"{""index"":{""index"":4315,""unique_id"":""21b59ba9-829d-4c33-8c27-4cd4c56d26b8"",""title"":""Podcasts for Teachers or Parents of Teenagers"",""description"":""Podcasts for Teachers or Parents of Teenagers: Teaching teens middle school and high school can feel joyful and rewarding most days, but can also frustrate you with one challeng‚Ä¶¬†"",""poster_name"":""Math Giraffe"",""follower_count"":""25k"",""tag_list"":""Middle School Classroom,High School Students,High School Teachers,Middle School Tips,High School Counseling,Ela Classroom,High School Science,Future Classroom,Google Classroom"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/50/19/31/501931a27ee4d076658980851b995b2c.jpg"",""downloaded"":1,""save_location"":""Local save in /data/education"",""category"":""education""}}"
"{""index"":{""index"":10794,""unique_id"":""c4bd2577-a7bb-4409-bb7a-17d5ed7e1cf1"",""title"":""TireBuyer"",""description"":""Nissan GT-R. Sick."",""poster_name"":""Ray Uyemura"",""follower_count"":""437"",""tag_list"":""Lowrider,Old Vintage Cars,Antique Cars,Austin Martin,Nissan Gtr Black,Jaguar,1959 Cadillac,Cadillac Ct6,Old School Cars"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/0d/29/9f/0d299f3df020395aa7ce8387f40fbeed.jpg"",""downloaded"":1,""save_location"":""Local save in /data/vehicles"",""category"":""vehicles""}}"


In [0]:
# function to convert string numbers to ints, including those with k/M units
def convert_to_int(value):
    """Convert a count such as ``437``, ``"124k"`` or ``"1.5M"`` to an int.

    Args:
        value: An int, or a string holding an integer optionally written with
            a ``k`` (thousand) or ``M`` (million) suffix. Surrounding
            whitespace is ignored.

    Returns:
        The integer value, or ``None`` when the input is missing or cannot be
        interpreted as a number (e.g. ``"User Info Error"``).

    Notes:
        ``None`` and other non-string inputs return ``None`` instead of raising,
        because Spark may call a Python UDF on null rows even when the call is
        wrapped in a null check. Suffixed values are rounded to the nearest
        integer so float artefacts (``1.005 * 1000 == 1004.9999999999999``)
        do not drop a unit.
    """
    # bool is a subclass of int but is not a meaningful count
    if isinstance(value, int) and not isinstance(value, bool):
        return value
    if not isinstance(value, str):
        return None

    multipliers = {'k': 1_000, 'M': 1_000_000}
    text = value.strip()
    try:
        if text and text[-1] in multipliers:
            return round(float(text[:-1]) * multipliers[text[-1]])
        return int(text)
    except (ValueError, OverflowError):
        # ValueError: not numeric / NaN; OverflowError: infinity
        return None

# Spark UDF wrapper around convert_to_int; yields an integer column
convert_to_int_udf = udf(convert_to_int, returnType=IntegerType())


######## CLEAN PIN ########

# The Kinesis read leaves df_pin with a single JSON-text column (`data`), but
# the steps below address individual fields, so expand the JSON first.
# Guarded so nothing changes if the frame has already been parsed.
if df_pin.columns == ["data"]:
    # field names as they appear in the sample pin records
    pin_fields = (
        "index", "unique_id", "title", "description", "poster_name",
        "follower_count", "tag_list", "is_image_or_video", "image_src",
        "downloaded", "save_location", "category",
    )
    pin_schema = StructType([StructField(name, StringType(), True) for name in pin_fields])

    # Some records in the sample output are wrapped as {"index": {<record>}};
    # unwrap those so both shapes parse with the same schema.
    raw_json = col("data")
    pin_json = when(
        get_json_object(raw_json, "$.index.unique_id").isNotNull(),
        get_json_object(raw_json, "$.index"),
    ).otherwise(raw_json)

    df_pin = (
        df_pin
        .select(from_json(pin_json, pin_schema).alias("record"))
        .select("record.*")
        # every field is parsed as a string; restore the numeric ones
        .withColumn("index", col("index").cast("int"))
        .withColumn("downloaded", col("downloaded").cast("int"))
    )

# first cleaning empty strings to None
df_pin = df_pin.select(
    [
        when(trim(col(c)) == "", lit(None)).otherwise(col(c)).alias(c) for c in df_pin.columns
    ]
)

# applying the UDF to the one (string) numerical column;
# `when` without `otherwise` already yields null for unmatched rows
df_pin = df_pin.withColumn(
    "follower_count",
    when(col("follower_count").isNotNull(), convert_to_int_udf(col("follower_count")))
)

# rename the index column
df_pin = df_pin.withColumnRenamed("index", "ind")

# reordering columns
df_pin = df_pin.select("ind", "unique_id", "title",
               "description", "follower_count",
               "poster_name", "tag_list",
               "is_image_or_video", "image_src",
               "save_location", "category",
               "downloaded") # this last one is actually missing from the instructions

######## CLEAN GEO ########

# The Kinesis read leaves df_geo with a single JSON-text column (`data`), but
# the steps below address individual fields, so expand the JSON first.
# Guarded so nothing changes if the frame has already been parsed.
if df_geo.columns == ["data"]:
    # field names are the ones this cell references
    geo_schema = StructType(
        [StructField(name, StringType(), True)
         for name in ("ind", "country", "latitude", "longitude", "timestamp")]
    )
    df_geo = (
        df_geo
        .select(from_json(col("data"), geo_schema).alias("record"))
        .select("record.*")
        # NOTE(review): target types are assumed (int index, double coordinates) — confirm against the producer
        .withColumn("ind", col("ind").cast("int"))
        .withColumn("latitude", col("latitude").cast("double"))
        .withColumn("longitude", col("longitude").cast("double"))
    )

# producing geospatial array and dropping original 2 columns
df_geo = df_geo.withColumn("coordinates",
                           array(col("latitude"), col("longitude"))
                           ).drop("latitude", "longitude")

# convert the datetime string to proper datetime format
df_geo = df_geo.withColumn("timestamp", to_timestamp("timestamp", "yyyy-MM-dd'T'HH:mm:ss"))

# reordering
df_geo = df_geo.select("ind", "country", "coordinates", "timestamp")


######## CLEAN USER ########

# The Kinesis read leaves df_user with a single JSON-text column (`data`), but
# the steps below address individual fields, so expand the JSON first.
# Guarded so nothing changes if the frame has already been parsed.
if df_user.columns == ["data"]:
    # field names are the ones this cell references
    user_schema = StructType(
        [StructField(name, StringType(), True)
         for name in ("ind", "first_name", "last_name", "age", "date_joined")]
    )
    df_user = (
        df_user
        .select(from_json(col("data"), user_schema).alias("record"))
        .select("record.*")
        # NOTE(review): integer types for ind/age are assumed — confirm against the producer
        .withColumn("ind", col("ind").cast("int"))
        .withColumn("age", col("age").cast("int"))
    )

# combine first and last names then drop original columns
df_user = df_user.withColumn('user_name',
                             concat_ws(' ',
                                       col('first_name'),
                                       col('last_name'))).drop('first_name', 'last_name')

# convert date_joined to timestamp format
df_user = df_user.withColumn("date_joined", to_timestamp("date_joined", "yyyy-MM-dd'T'HH:mm:ss"))

# reordering
df_user = df_user.select("ind", "user_name", "age", "date_joined")

######## CALL DFS ########
# Only the last bare expression of a notebook cell is echoed, so of these three
# lines just `df_user` produces output; the first two have no visible effect.
df_pin
df_geo
df_user

In [0]:

# Append each cleaned stream to its own Delta table.
# Every streaming query must own its checkpoint directory: sharing one location
# between queries mixes their offset/commit logs. Each sink therefore gets a
# sub-directory named after its table. State written earlier to the shared
# root directory is not reused by these queries.
CHECKPOINT_ROOT = "/tmp/kinesis/_checkpoints"

for stream_df, table_name in (
    (df_pin, "126ca3664fbb_pin_table"),
    (df_geo, "126ca3664fbb_geo_table"),
    (df_user, "126ca3664fbb_user_table"),
):
    stream_df.writeStream \
      .format("delta") \
      .outputMode("append") \
      .option("checkpointLocation", f"{CHECKPOINT_ROOT}/{table_name}/") \
      .table(table_name)