In [0]:
#Mount the S3 bucket containing the Pinterest data to Databricks using an authentication key
# pyspark functions
import pyspark.sql.functions as F

# URL processing
import urllib

# Define the path to the Delta table
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Read the Delta table to a Spark DataFrame
aws_keys_df = spark.read.format("delta").load(delta_table_path)

# Get the AWS access key and secret key from the spark dataframe
ACCESS_KEY = aws_keys_df.select('Access key ID').collect()[0]['Access key ID']
SECRET_KEY = aws_keys_df.select('Secret access key').collect()[0]['Secret access key']
# Encode the secrete key
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")


#Mount the S3 Bucket
# AWS S3 bucket name
AWS_S3_BUCKET = "user-12d6e5017cf5-bucket"
# Mount name for the bucket
MOUNT_NAME = "/mnt/pinterest-bucket"
# Check if the directory is already mounted
if not any(mount.mountPoint == MOUNT_NAME for mount in dbutils.fs.mounts()):
    # Source url
    SOURCE_URL = f"s3n://{ACCESS_KEY}:{ENCODED_SECRET_KEY}@{AWS_S3_BUCKET}"
    # Mount the drive
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)
else:
    print(f"The directory {MOUNT_NAME} is already mounted.")

In [0]:
# Read in data from each of the tables in the S3 bucket

# Disable format checks during the reading of Delta tables
spark.conf.set("spark.databricks.delta.formatCheck.enabled", "false")

# Read in pin table
# Asterisk(*) indicates reading all the content of the specified file that have .json extension
file_location = "/mnt/pinterest-bucket/topics/12d6e5017cf5.pin/partition=0/*.json" 
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"
# Read in JSONs from mounted S3 bucket
df_pin = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.load(file_location)
# Display Spark dataframe to check its content
display(df_pin)

# Read in geo table
# Asterisk(*) indicates reading all the content of the specified file that have .json extension
file_location = "/mnt/pinterest-bucket/topics/12d6e5017cf5.geo/partition=0/*.json" 
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"
# Read in JSONs from mounted S3 bucket
df_geo = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.load(file_location)
# Display Spark dataframe to check its content
display(df_geo)

# Read in user table
# Asterisk(*) indicates reading all the content of the specified file that have .json extension
file_location = "/mnt/pinterest-bucket/topics/12d6e5017cf5.user/partition=0/*.json" 
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"
# Read in JSONs from mounted S3 bucket
df_user = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.load(file_location)
# Display Spark dataframe to check its content
display(df_user)

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
travel,"This Costa Rica itinerary is the ultimate guide to spending two weeks in Costa Rica. Find out about visiting La Fortuna, Arenal, Monteverde, Naranjo, Corcovado National Park, Or…",1,10k,https://i.pinimg.com/originals/30/93/cb/3093cb01d9de2d125fda8ba5e3e41946.jpg,10138,image,"Wanderlust Chloe ✈️ Travel guides, inspo and adventure travel ✈️",Local save in /data/travel,"Costa Rica Travel,Rio Celeste Costa Rica,Dream Vacations,Vacation Spots,Vacation Travel,Travel Pictures,Travel Photos,Fortuna Costa Rica,Costa Rica Pictures","14 Amazing Things To Do In Costa Rica | Volcanoes, Waterfalls, Wildlife And More",927c4658-cc3f-4b92-9b5c-70743d0c238d
diy-and-crafts,"This post may contain affiliate links, read our Disclosure Policy for more information. As an Amazon Associate I earn from qualifying purchases, thank you! Make some cute handpr…",1,892k,https://i.pinimg.com/originals/ff/fe/38/fffe384f3ec18a0d87cb2d80cc8c1499.jpg,3156,image,Michelle {CraftyMorning.com},Local save in /data/diy-and-crafts,"Christmas Gifts For Parents,Christmas Decorations For Kids,Christmas Crafts For Toddlers,Preschool Christmas,Christmas Crafts For Gifts,Christmas Activities,Toddler Crafts,Kids Christmas,Christmas Feeling",Handprint Reindeer Ornaments - Crafty Morning,fa6e31a4-18c2-4eca-a6d8-e903eee2c2a4
finance,"If you love budgeting, make sure to give Dave Ramsey's 7 Baby Steps a try. Follow these steps to begin your debt snowball, build an emergency fund, invest and reach riches. I ca…",1,26k,https://i.pinimg.com/originals/1e/9d/90/1e9d906e4e150e3b95187f3b76ea7c71.png,5494,image,"Living Low Key | Save Money, Make Money, & Frugal Living",Local save in /data/finance,"Financial Peace,Financial Tips,Saving Money Quotes,Total Money Makeover,Budgeting Finances,Money Management,Wealth Management,Personal Finance,Making Ideas",Dave Ramsey's 7 Baby Steps: What Are They And Will They Work For You,8fb2af68-543b-4639-8119-de33d28706ed
finance,"If you love budgeting, make sure to give Dave Ramsey's 7 Baby Steps a try. Follow these steps to begin your debt snowball, build an emergency fund, invest and reach riches. I ca…",1,26k,https://i.pinimg.com/originals/1e/9d/90/1e9d906e4e150e3b95187f3b76ea7c71.png,5494,image,"Living Low Key | Save Money, Make Money, & Frugal Living",Local save in /data/finance,"Financial Peace,Financial Tips,Saving Money Quotes,Total Money Makeover,Budgeting Finances,Money Management,Wealth Management,Personal Finance,Making Ideas",Dave Ramsey's 7 Baby Steps: What Are They And Will They Work For You,8fb2af68-543b-4639-8119-de33d28706ed
christmas,Here are the best DIY Christmas Centerpieces ideas perfect for your Christmas & holiday season home decor. From Christmas Vignettes to Table Centerpieces.,1,500k,https://i.pinimg.com/originals/aa/6d/0f/aa6d0f44d7c1c96b998cb9aa6c4446b8.png,2418,image,HikenDip,Local save in /data/christmas,"Farmhouse Christmas Decor,Rustic Christmas,Christmas Time,Vintage Christmas,Xmas,Primitive Christmas Crafts,Christmas Vignette,Indoor Christmas Decorations,Diy Christmas Ornaments",100 DIY Christmas Centerpieces You'll Love To Decorate Your Home With For The Christmas Season - Hike n Dip,da8745a6-5160-46c4-877d-181d50a729fd
quotes,summcoco gives you inspiration for the women fashion trends you want. Thinking about a new look or lifestyle? This is your ultimate resource to get the hottest trends. 45 Top Li…,1,306k,https://i.pinimg.com/originals/bb/c0/e6/bbc0e6a797079505f11ac12bcb0b8c66.jpg,7922,image,"Sumcoco | Decor Ideas, Hairstyles, Nails Fashion Advice",Local save in /data/quotes,"Life Quotes Love,Inspirational Quotes About Love,Mood Quotes,Motivational Quotes,Tears Quotes,Quotes About Sadness,Deep Quotes About Life,Quotes Quotes,Quote Life",45 Top Life Quotes School Did Not Teach You,a584581c-1b38-4731-a1cc-f36115ecf229
travel,"Are you traveling to Paris during the summer? Find out what to do in Paris, France during the summer. Fun summertime activities in Paris. Enjoy the incredible outdoors when trav…",1,3k,https://i.pinimg.com/originals/6c/4c/90/6c4c90bba27ebf8c8bfe4c1acfb9f07a.jpg,9979,image,Petite in Paris,Local save in /data/travel,"Torre Eiffel Paris,Tour Eiffel,Picnic In Paris,Hello France,Voyage Europe,Destination Voyage,Beautiful Places To Travel,Travel Aesthetic,Paris Travel",Paris in the Summer. 10 fun things to do in Paris in the Summertime • Petite in Paris,2b2abc85-fc51-481f-8ae6-17681993da28
home-decor,"Holiday mantle decor, Christmas decor, metallic mercury glass style Christmas trees, eucalyptus vine, evergreen pine branches, white neutral holiday decor, cozy mantle for the h…",1,83k,https://i.pinimg.com/originals/9d/82/1a/9d821a80acd8f90c16454e978bd9b115.jpg,6145,image,Stylin by Aylin,Local save in /data/home-decor,"Winter Home Decor,Christmas Living Room Decor,Living Room Decor Cozy,Christmas Decor,Cozy Fireplace,Rustic Fireplace Decor,Fireplace Decorations,Rustic Room,House Decorations",HOLIDAY MANTLE DECOR - @AMAZON & @TARGET FINDS,82e13a07-db99-43a3-b1c0-89a4b75821da
education,"Podcasts for Teachers or Parents of Teenagers: Teaching teens middle school and high school can feel joyful and rewarding most days, but can also frustrate you with one challeng…",1,25k,https://i.pinimg.com/originals/50/19/31/501931a27ee4d076658980851b995b2c.jpg,4315,image,Math Giraffe,Local save in /data/education,"Middle School Classroom,High School Students,High School Teachers,Middle School Tips,High School Counseling,Ela Classroom,High School Science,Future Classroom,Google Classroom",Podcasts for Teachers or Parents of Teenagers,21b59ba9-829d-4c33-8c27-4cd4c56d26b8
education,"Podcasts for Teachers or Parents of Teenagers: Teaching teens middle school and high school can feel joyful and rewarding most days, but can also frustrate you with one challeng…",1,25k,https://i.pinimg.com/originals/50/19/31/501931a27ee4d076658980851b995b2c.jpg,4315,image,Math Giraffe,Local save in /data/education,"Middle School Classroom,High School Students,High School Teachers,Middle School Tips,High School Counseling,Ela Classroom,High School Science,Future Classroom,Google Classroom",Podcasts for Teachers or Parents of Teenagers,21b59ba9-829d-4c33-8c27-4cd4c56d26b8


country,ind,latitude,longitude,timestamp
Antarctica (the territory South of 60 deg S),2418,-88.4642,-171.061,2022-05-27T11:30:59
Cocos (Keeling) Islands,10794,-89.5236,-154.567,2022-01-01T02:26:50
Cocos (Keeling) Islands,10794,-89.5236,-154.567,2022-01-01T02:26:50
Cocos (Keeling) Islands,10794,-89.5236,-154.567,2022-01-01T02:26:50
Central African Republic,2074,-52.3213,-50.11,2019-11-03T05:41:59
Antigua and Barbuda,7922,-88.0974,-172.052,2021-01-27T09:14:19
Dominican Republic,9979,14.9967,-120.682,2018-07-18T19:01:46
Cote d'Ivoire,2923,-84.6302,-164.507,2019-09-08T22:53:09
French Guiana,8304,-28.8852,-164.87,2019-09-13 04:50:29
French Guiana,8304,-28.8852,-164.87,2019-09-13T04:50:29


age,date_joined,first_name,ind,last_name
21,2015-11-10T09:27:42,Andrea,8731,Alexander
21,2015-11-10T09:27:42,Andrea,8731,Alexander
21,2015-11-10T09:27:42,Andrea,8731,Alexander
24,2016-03-31T20:56:39,Austin,8887,Rodriguez
36,2015-12-20T16:38:13,Michelle,4315,Prince
36,2015-12-20T16:38:13,Michelle,4315,Prince
36,2015-12-20T16:38:13,Michelle,4315,Prince
32,2017-10-10T20:09:33,Christian,10625,Lang
22,2016-02-11T20:46:04,Jennifer,9672,Hudson
32,2016-04-02T03:51:23,Brittany,1313,Jones


In [0]:
#View the starting columns and data types in the table
df_pin.printSchema()

In [0]:
#Clean df_pin dataframe

#Replace entries with no relevant data in each column with 'None'
df_pin = df_pin.replace({
    'No description available Story format':None,
    'User Info Error':None, 
    'Image src error.':None,
    'N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e':None, 
    'No Title Data Available':None,
    '':None,
    'null':None
    })
#Replace all empty entries with 'None'
df_pin = df_pin.fillna(value='None')

#Transform follower_count to ensure every entry is a number, and cast to integer data type
df_pin = df_pin.withColumn('follower_count', F.regexp_replace('follower_count', 'k', '000').cast('int'))
display(df_pin)

#Cast downloaded and index to integer data type 
df_pin = df_pin.withColumn('downloaded', df_pin['downloaded'].cast('int'))
df_pin = df_pin.withColumn('index', df_pin['index'].cast('int'))

#Clean the data in the save_location column to include only the save location path
df_pin = df_pin.withColumn('save_location', F.regexp_replace('save_location', 'Local save in ', ''))

#Rename the index column to ind.
df_pin = df_pin.withColumnRenamed('index','ind')

#Reorder the DataFrame columns to have the following column order:
df_pin = df_pin[['ind', 'unique_id', 'title', 'description', 'follower_count', 'poster_name', 'tag_list', 'is_image_or_video', 'image_src', 'save_location', 'category']]

#Print the column names and their data types
df_pin.printSchema()
display(df_pin)

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
education,"Podcasts for Teachers or Parents of Teenagers: Teaching teens middle school and high school can feel joyful and rewarding most days, but can also frustrate you with one challeng…",1,25000.0,https://i.pinimg.com/originals/50/19/31/501931a27ee4d076658980851b995b2c.jpg,4315,image,Math Giraffe,Local save in /data/education,"Middle School Classroom,High School Students,High School Teachers,Middle School Tips,High School Counseling,Ela Classroom,High School Science,Future Classroom,Google Classroom",Podcasts for Teachers or Parents of Teenagers,21b59ba9-829d-4c33-8c27-4cd4c56d26b8
finance,"Description Coupon Organizer in a fun colorful fabric -island oasis, Great Size for the ""basic"" couponer - holds up to 500 coupons with ease, and is made long enough so that you…",1,0.0,https://i.pinimg.com/originals/65/bb/ea/65bbeaf458907bb079317d8303c4fa0e.jpg,5730,image,Consuelo Aguirre,Local save in /data/finance,"Grocery Items,Grocery Coupons,Care Organization,Coupon Organization,Extreme Couponing,Couponing 101,Life Binder,Save My Money,Love Coupons",Island Oasis Coupon Organizer,1e1f0c8b-9fcf-460b-9154-c775827206eb
finance,"Description Coupon Organizer in a fun colorful fabric -island oasis, Great Size for the ""basic"" couponer - holds up to 500 coupons with ease, and is made long enough so that you…",1,0.0,https://i.pinimg.com/originals/65/bb/ea/65bbeaf458907bb079317d8303c4fa0e.jpg,5730,image,Consuelo Aguirre,Local save in /data/finance,"Grocery Items,Grocery Coupons,Care Organization,Coupon Organization,Extreme Couponing,Couponing 101,Life Binder,Save My Money,Love Coupons",Island Oasis Coupon Organizer,1e1f0c8b-9fcf-460b-9154-c775827206eb
diy-and-crafts,Keep the kids busy this summer with these easy diy crafts and projects. Creative and…,1,124000.0,https://i.pinimg.com/originals/b3/bc/e2/b3bce2964e8c8975387b39660eed5f16.jpg,2863,image,Of Life & Lisa | Lifestyle Blog,Local save in /data/diy-and-crafts,"Summer Crafts For Kids,Fun Crafts For Kids,Summer Kids,Toddler Crafts,Crafts To Do,Diy For Kids,Summer Snow,Diys For Summer,Craft Ideas For Girls",25 Super Fun Summer Crafts for Kids - Of Life and Lisa,9bf39437-42a6-4f02-99a0-9a0383d8cd70
diy-and-crafts,Keep the kids busy this summer with these easy diy crafts and projects. Creative and…,1,124000.0,https://i.pinimg.com/originals/b3/bc/e2/b3bce2964e8c8975387b39660eed5f16.jpg,2863,image,Of Life & Lisa | Lifestyle Blog,Local save in /data/diy-and-crafts,"Summer Crafts For Kids,Fun Crafts For Kids,Summer Kids,Toddler Crafts,Crafts To Do,Diy For Kids,Summer Snow,Diys For Summer,Craft Ideas For Girls",25 Super Fun Summer Crafts for Kids - Of Life and Lisa,9bf39437-42a6-4f02-99a0-9a0383d8cd70
tattoos,"Koi fish tattoos are a popular choice for men who want to make a statement, thanks to their rich symbolism and bold design.",1,211000.0,https://i.pinimg.com/originals/8a/0c/0a/8a0c0a7b6236565c519acd41ad1a52c0.jpg,8731,image,TheTrendSpotter,Local save in /data/tattoos,"Dr Tattoo,Wörter Tattoos,Pisces Tattoos,Tatoo Art,Dream Tattoos,Dope Tattoos,Mini Tattoos,Finger Tattoos,Body Art Tattoos",20 Koi Fish Tattoos For Lucky Men,ea760f71-febf-4023-b592-d17396659039
quotes,#lovequotes #matchmaker #matchmadeinheaven #loveyourself #respectyourself,1,51000.0,https://i.pinimg.com/originals/c6/64/ee/c664ee71524fb5a6e7b7b49233f93b43.png,8304,image,Commitment Connection,Local save in /data/quotes,"Wise Quotes,Quotable Quotes,Words Quotes,Wise Words,Quotes To Live By,Great Quotes,Motivational Quotes,Inspirational Quotes,Funny Quotes",The #1 Reason You’re Not His Priority Anymore - Matthew Coast,5b6d0913-25e4-43ab-839d-85d5516f78a4
beauty,"Instantly create the look of lash extensions with this award-winning, best-selling mascara that won't clump, flake or smudge. Available in 3 shades!",1,43000.0,https://i.pinimg.com/videos/thumbnails/originals/69/84/e2/6984e20f3e262098fa9c0614c3453254.0000001.jpg,1313,video,Thrive Causemetics,Local save in /data/beauty,,Liquid Lash Extensions Mascara,44662045-e891-4821-8a19-ebe7eedd371a
vehicles,Nissan GT-R. Sick.,1,437.0,https://i.pinimg.com/originals/0d/29/9f/0d299f3df020395aa7ce8387f40fbeed.jpg,10794,image,Ray Uyemura,Local save in /data/vehicles,"Lowrider,Old Vintage Cars,Antique Cars,Austin Martin,Nissan Gtr Black,Jaguar,1959 Cadillac,Cadillac Ct6,Old School Cars",TireBuyer,c4bd2577-a7bb-4409-bb7a-17d5ed7e1cf1
mens-fashion,,0,,,7528,multi-video(story page format),,Local save in /data/mens-fashion,,,fbe53c66-3442-4773-b19e-d3ec6f54dddf


ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
4315,21b59ba9-829d-4c33-8c27-4cd4c56d26b8,Podcasts for Teachers or Parents of Teenagers,"Podcasts for Teachers or Parents of Teenagers: Teaching teens middle school and high school can feel joyful and rewarding most days, but can also frustrate you with one challeng…",25000.0,Math Giraffe,"Middle School Classroom,High School Students,High School Teachers,Middle School Tips,High School Counseling,Ela Classroom,High School Science,Future Classroom,Google Classroom",image,https://i.pinimg.com/originals/50/19/31/501931a27ee4d076658980851b995b2c.jpg,/data/education,education
5730,1e1f0c8b-9fcf-460b-9154-c775827206eb,Island Oasis Coupon Organizer,"Description Coupon Organizer in a fun colorful fabric -island oasis, Great Size for the ""basic"" couponer - holds up to 500 coupons with ease, and is made long enough so that you…",0.0,Consuelo Aguirre,"Grocery Items,Grocery Coupons,Care Organization,Coupon Organization,Extreme Couponing,Couponing 101,Life Binder,Save My Money,Love Coupons",image,https://i.pinimg.com/originals/65/bb/ea/65bbeaf458907bb079317d8303c4fa0e.jpg,/data/finance,finance
5730,1e1f0c8b-9fcf-460b-9154-c775827206eb,Island Oasis Coupon Organizer,"Description Coupon Organizer in a fun colorful fabric -island oasis, Great Size for the ""basic"" couponer - holds up to 500 coupons with ease, and is made long enough so that you…",0.0,Consuelo Aguirre,"Grocery Items,Grocery Coupons,Care Organization,Coupon Organization,Extreme Couponing,Couponing 101,Life Binder,Save My Money,Love Coupons",image,https://i.pinimg.com/originals/65/bb/ea/65bbeaf458907bb079317d8303c4fa0e.jpg,/data/finance,finance
2863,9bf39437-42a6-4f02-99a0-9a0383d8cd70,25 Super Fun Summer Crafts for Kids - Of Life and Lisa,Keep the kids busy this summer with these easy diy crafts and projects. Creative and…,124000.0,Of Life & Lisa | Lifestyle Blog,"Summer Crafts For Kids,Fun Crafts For Kids,Summer Kids,Toddler Crafts,Crafts To Do,Diy For Kids,Summer Snow,Diys For Summer,Craft Ideas For Girls",image,https://i.pinimg.com/originals/b3/bc/e2/b3bce2964e8c8975387b39660eed5f16.jpg,/data/diy-and-crafts,diy-and-crafts
2863,9bf39437-42a6-4f02-99a0-9a0383d8cd70,25 Super Fun Summer Crafts for Kids - Of Life and Lisa,Keep the kids busy this summer with these easy diy crafts and projects. Creative and…,124000.0,Of Life & Lisa | Lifestyle Blog,"Summer Crafts For Kids,Fun Crafts For Kids,Summer Kids,Toddler Crafts,Crafts To Do,Diy For Kids,Summer Snow,Diys For Summer,Craft Ideas For Girls",image,https://i.pinimg.com/originals/b3/bc/e2/b3bce2964e8c8975387b39660eed5f16.jpg,/data/diy-and-crafts,diy-and-crafts
8731,ea760f71-febf-4023-b592-d17396659039,20 Koi Fish Tattoos For Lucky Men,"Koi fish tattoos are a popular choice for men who want to make a statement, thanks to their rich symbolism and bold design.",211000.0,TheTrendSpotter,"Dr Tattoo,Wörter Tattoos,Pisces Tattoos,Tatoo Art,Dream Tattoos,Dope Tattoos,Mini Tattoos,Finger Tattoos,Body Art Tattoos",image,https://i.pinimg.com/originals/8a/0c/0a/8a0c0a7b6236565c519acd41ad1a52c0.jpg,/data/tattoos,tattoos
8304,5b6d0913-25e4-43ab-839d-85d5516f78a4,The #1 Reason You’re Not His Priority Anymore - Matthew Coast,#lovequotes #matchmaker #matchmadeinheaven #loveyourself #respectyourself,51000.0,Commitment Connection,"Wise Quotes,Quotable Quotes,Words Quotes,Wise Words,Quotes To Live By,Great Quotes,Motivational Quotes,Inspirational Quotes,Funny Quotes",image,https://i.pinimg.com/originals/c6/64/ee/c664ee71524fb5a6e7b7b49233f93b43.png,/data/quotes,quotes
1313,44662045-e891-4821-8a19-ebe7eedd371a,Liquid Lash Extensions Mascara,"Instantly create the look of lash extensions with this award-winning, best-selling mascara that won't clump, flake or smudge. Available in 3 shades!",43000.0,Thrive Causemetics,,video,https://i.pinimg.com/videos/thumbnails/originals/69/84/e2/6984e20f3e262098fa9c0614c3453254.0000001.jpg,/data/beauty,beauty
10794,c4bd2577-a7bb-4409-bb7a-17d5ed7e1cf1,TireBuyer,Nissan GT-R. Sick.,437.0,Ray Uyemura,"Lowrider,Old Vintage Cars,Antique Cars,Austin Martin,Nissan Gtr Black,Jaguar,1959 Cadillac,Cadillac Ct6,Old School Cars",image,https://i.pinimg.com/originals/0d/29/9f/0d299f3df020395aa7ce8387f40fbeed.jpg,/data/vehicles,vehicles
7528,fbe53c66-3442-4773-b19e-d3ec6f54dddf,,,,,,multi-video(story page format),,/data/mens-fashion,mens-fashion


In [0]:
#Clean the df_geo dataframe

#Create a new column coordinates that contains an array based on the latitude and longitude columns
#Drop the latitude and longitude columns from the DataFrame
#Reorder the DataFrame columns:
df_geo = df_geo.select('ind', 'country', F.array('latitude', 'longitude').alias('coordinates'), 'timestamp')

#Convert the timestamp column from a string to a timestamp data type
df_geo = df_geo.withColumn('timestamp', F.regexp_replace('timestamp', 'T',' '))
df_geo = df_geo.withColumn('timestamp', F.to_timestamp('timestamp'))

#View the final table and data types
display(df_geo)
df_geo.printSchema()


ind,country,coordinates,timestamp
10794,Cocos (Keeling) Islands,"List(-89.5236, -154.567)",2022-01-01T02:26:50.000+0000
8304,French Guiana,"List(-28.8852, -164.87)",2019-09-13T04:50:29.000+0000
8304,French Guiana,"List(-28.8852, -164.87)",2019-09-13T04:50:29.000+0000
4315,Cote d'Ivoire,"List(-45.8508, 66.1003)",2019-12-15T03:51:28.000+0000
7528,Albania,"List(-89.9787, -173.293)",2020-08-28T03:52:47.000+0000
2863,Armenia,"List(-5.34445, -177.924)",2020-04-27T13:34:16.000+0000
5730,Colombia,"List(-77.015, -101.437)",2021-04-19T17:37:03.000+0000
7528,Albania,"List(-89.9787, -173.293)",2020-08-28T03:52:47.000+0000
2863,Armenia,"List(-5.34445, -177.924)",2020-04-27T13:34:16.000+0000
5730,Colombia,"List(-77.015, -101.437)",2021-04-19T17:37:03.000+0000


In [0]:
# Read in user table
# Asterisk(*) indicates reading all the content of the specified file that have .json extension
file_location = "/mnt/pinterest-bucket/topics/12d6e5017cf5.user/partition=0/*.json" 
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"
# Read in JSONs from mounted S3 bucket
df_user = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.load(file_location)
# Display Spark dataframe to check its content
display(df_user)

age,date_joined,first_name,ind,last_name
21,2015-11-10T09:27:42,Andrea,8731,Alexander
36,2015-12-20T16:38:13,Michelle,4315,Prince
32,2016-04-02T03:51:23,Brittany,1313,Jones
25,2015-12-28T04:21:39,Charles,8304,Berry
32,2016-10-23T14:06:51,Dylan,2863,Holmes
36,2015-12-08T20:02:43,Rachel,5730,Davis
20,2015-10-24T11:23:51,Abigail,7528,Ali


In [0]:
#Clean the df_user dataframe

#Create a new column user_name that concatenates the information found in the first_name and last_name columns. And drop the first_name and last_name columns.
df_user = df_user.select('ind', F.concat('first_name', 'last_name').alias('user_name'), 'age', 'date_joined')

#Convert the date_joined column from a string to a timestamp data type
df_user = df_user.withColumn('date_joined', F.to_timestamp('date_joined'))

df_user.show()

In [0]:
#Data queries
from pyspark.sql.window import Window

#1 Find the most popular Pinterest category people post to based on their country.
#Create a combined df containing country and category information
combined_df = df_geo.join(df_pin, df_pin["ind"]==df_geo["ind"], how="inner").sort("country")

#create a window 
window_spec = Window.partitionBy("country").orderBy("category")

#Count the occurrance of each category within each country
query_df = combined_df.withColumn("category_count", F.count("category").over(window_spec))

#Select the columns we want and by category_count so that the first row for each country shows the highest category count
query_df = query_df.select("country", "category", "category_count").orderBy("category_count", ascending=False).distinct()

#Select only the highest category_count for each country (i.e. the first row in each case)
most_popular_pin_per_country_df = query_df.withColumn("row_number", F.row_number().over(window_spec)).where(F.col("row_number") == 1).select("country", "category", "category_count")
most_popular_pin_per_country_df.show(truncate=False)





In [0]:
#2. What is the most popular post each year
#Find how many posts each category had between 2018 and 2022.

#Combine the dataframes containing information about the post, and about the date of posting
combined_df = df_geo.join(df_pin, df_pin["ind"]==df_geo["ind"], how="inner").sort("timestamp")

#Extract only the year from the timestamp information
combined_df = combined_df.select("category", F.regexp_extract("timestamp", '(\\d+)', 1).cast("int").alias("post_year"))

#Count the number of posts within each category, aggregated by year. 
most_popular_pin_per_year_df = combined_df.groupBy("post_year", "category").agg(F.count("category").alias("category_count")).orderBy("post_year", F.desc("category_count"))

most_popular_pin_per_year_df.display()

post_year,category,category_count
2018,beauty,1
2019,quotes,2
2019,education,1
2020,mens-fashion,4
2020,diy-and-crafts,4
2020,tattoos,1
2021,finance,4
2022,vehicles,1


In [0]:
#3.1 Find the user with the most followers in each country

#Combine the relevant dataframes
combined_df = df_geo.join(df_pin, df_pin["ind"]==df_geo["ind"])

#Create a window to partition the df by
window_spec = Window.partitionBy("country").orderBy("poster_name")
query_df = combined_df.withColumn("follower_count", F.sum("follower_count").over(window_spec))

#Select only the highest category_count for each country (i.e. the first row in each case)
result_df = query_df.withColumn("row_number", F.row_number().over(window_spec)).where(F.col("row_number") == 1).select("country", "poster_name", "follower_count")
result_df.show(truncate=False)


#3.2: Based on the above query, find the country with the user with most followers.
max_follower_df = result_df.orderBy("follower_count", ascending=False).select("country", "follower_count")
max_follower_df.show(1)



In [0]:
#4. What is the most popular category by age? 

#Combine dataframes
combined_df = df_user.join(df_pin, df_pin["ind"]==df_user["ind"], how="inner")

#Create an age_group column containing the age categories
query_df = combined_df.withColumn('age_group', F.when(F.col('age').between(18,24), '18-24')\
.when(F.col('age').between(25,35), '25-35')\
.when(F.col('age').between(36,50), '36-50')\
.when(F.col('age')>50, '+50'))\
.select('age_group', 'age', 'category')
query_df.display(truncate=False)

#Count the instances of each category within each age group
window_spec = Window.partitionBy("age_group", "category")
most_popular_by_age_df = query_df.withColumn("category_count", F.count("category").over(window_spec))
most_popular_by_age_df.dropDuplicates(["age_group", "category"]).select("age_group", "category", "category_count").orderBy("age_group").display(truncate=False)


age_group,age,category
36-50,36,education
36-50,36,finance
36-50,36,finance
25-35,32,diy-and-crafts
25-35,32,diy-and-crafts
18-24,21,tattoos
25-35,25,quotes
25-35,32,beauty
18-24,20,mens-fashion
18-24,20,mens-fashion


age_group,category,category_count
18-24,tattoos,1
18-24,mens-fashion,2
25-35,diy-and-crafts,2
25-35,quotes,1
25-35,beauty,1
36-50,education,1
36-50,finance,2


In [0]:
#5. What is the median follower count for users in the following age groups:

#Combine dataframes
combined_df = df_user.join(df_pin, df_pin["ind"]==df_user["ind"], how="inner")

#Create an age_group column containing the age categories
query_df = combined_df.withColumn('age_group', F.when(F.col('age').between(18,24), '18-24')\
.when(F.col('age').between(25,35), '25-35')\
.when(F.col('age').between(36,50), '36-50')\
.when(F.col('age')>50, '+50'))\
.select('age_group', 'follower_count')
query_df.display(truncate=False)

#Create a window to estimate follower_count by age_group
window_spec = Window.partitionBy("age_group")
#function to estimate the median follower count
magic_percentile = F.expr('percentile_approx(follower_count, 0.5)')
#estimate the median follower count per age group
median_followers_by_age_df = query_df.withColumn('median_follower_count', magic_percentile.over(window_spec)).dropDuplicates(["age_group", "median_follower_count"]).select("age_group", "median_follower_count")
median_followers_by_age_df.display()

age_group,follower_count
36-50,25000.0
36-50,0.0
36-50,0.0
25-35,124000.0
25-35,124000.0
18-24,211000.0
25-35,51000.0
25-35,43000.0
18-24,
18-24,


age_group,median_follower_count
18-24,211000
25-35,51000
36-50,0


In [0]:
#6. How many users joined every year? 
#Find how many users have joined between 2015 and 2020.

#All the information is in the df_user table
query_df = df_user.withColumn("year_joined", F.regexp_extract('date_joined', '(\d+)', 1)).select("year_joined")
window_spec = Window.partitionBy("year_joined")
users_joined_per_year_df = query_df.withColumn("number_users_joined", F.count("year_joined").over(window_spec)).dropDuplicates()
users_joined_per_year_df.display()

year_joined,number_users_joined
2015,5
2016,2


In [0]:
#7. Find the median follower count of users have joined between 2015 and 2020.

#Your query should return a DataFrame that contains the following columns:
#post_year, a new column that contains only the year from the timestamp column
#median_follower_count, a new column containing the desired query output
combined_df = df_user.join(df_pin, ["ind"]).select("date_joined", "follower_count", "user_name")

#Extract the year the user joined
user_joined_df = combined_df.withColumn("year_joined", F.regexp_extract('date_joined', '(\d+)', 1))

#Calculate the median followers for each user
window_spec = Window.partitionBy("user_name")
magic_percentile = F.expr('percentile_approx(follower_count, 0.5)')
median_followers_df = user_joined_df.withColumn("median_followers_per_user", magic_percentile.over(window_spec)).dropDuplicates()

#Calculate the median followers for users that joined in each year
window_spec = Window.partitionBy("year_joined")
result_df = median_followers_df.withColumn("median_follower_count", magic_percentile.over(window_spec)).select("year_joined", "median_follower_count").dropDuplicates()
result_df.display()

year_joined,median_follower_count
2015,25000
2016,43000


In [0]:
#8. Find the median follower count of users that have joined between 2015 and 2020, based on which age group they are part of.

#age_group, a new column based on the original age column
#join_year, a new column that contains only the year from the timestamp column
#median_follower_count, a new column containing the desired query output

#Combine dataframes
combined_df = df_user.join(df_pin, df_pin["ind"]==df_user["ind"], how="inner")

#Create an age_group column containing the age categories
query_df = combined_df.withColumn('age_group', F.when(F.col('age').between(18,24), '18-24')\
.when(F.col('age').between(25,35), '25-35')\
.when(F.col('age').between(36,50), '36-50')\
.when(F.col('age')>50, '+50'))

#Extract the year the user joined to a new column
user_joined_df = query_df.withColumn("year_joined", F.regexp_extract('date_joined', '(\d+)', 1)).select("age_group", "year_joined", "follower_count", "user_name")
user_joined_df.display()

#Create a window to partition the data by age and year_joined
window_spec = Window.partitionBy("year_joined", "age_group")
magic_percentile = F.expr('percentile_approx(follower_count, 0.5)')

#Calculate the median followers per age group within each year
median_followers_df = user_joined_df.withColumn("median_follower_count", magic_percentile.over(window_spec)).select("year_joined", "age_group", "median_follower_count").dropDuplicates()
#.dropDuplicates()
#result_df = user_joined_df.withColumn("median_follower_count", magic_percentile.over(window_spec))
median_followers_df.display()

age_group,year_joined,follower_count,user_name
36-50,2015,25000.0,MichellePrince
36-50,2015,0.0,RachelDavis
36-50,2015,0.0,RachelDavis
25-35,2016,124000.0,DylanHolmes
25-35,2016,124000.0,DylanHolmes
18-24,2015,211000.0,AndreaAlexander
25-35,2015,51000.0,CharlesBerry
25-35,2016,43000.0,BrittanyJones
18-24,2015,,AbigailAli
18-24,2015,,AbigailAli


year_joined,age_group,median_follower_count
2015,18-24,211000
2015,25-35,51000
2015,36-50,0
2016,25-35,124000


In [0]:
#Unmount the S3 bucket
dbutils.fs.unmount("/mnt/pinterest-bucket")