# Reading and cleaning Pinterest Data from S3 bucket using Sparks

In [None]:
%run "/Users/amysw13@gmail.com/Mount S3 bucket to Databricks"

In [None]:
# pyspark functions
from pyspark.sql.functions import *
from pyspark.sql.types import *

## Check mounted s3 bucket

In [None]:
display(dbutils.fs.ls("/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.geo/partition=0/"))

path,name,size,modificationTime
dbfs:/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.geo/partition=0/124714cdee67.geo+0+0000000000.json,124714cdee67.geo+0+0000000000.json,108,1704214484000
dbfs:/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.geo/partition=0/124714cdee67.geo+0+0000000001.json,124714cdee67.geo+0+0000000001.json,113,1704214499000
dbfs:/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.geo/partition=0/124714cdee67.geo+0+0000000002.json,124714cdee67.geo+0+0000000002.json,107,1704214513000
dbfs:/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.geo/partition=0/124714cdee67.geo+0+0000000003.json,124714cdee67.geo+0+0000000003.json,113,1704214515000
dbfs:/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.geo/partition=0/124714cdee67.geo+0+0000000004.json,124714cdee67.geo+0+0000000004.json,125,1704214516000
dbfs:/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.geo/partition=0/124714cdee67.geo+0+0000000005.json,124714cdee67.geo+0+0000000005.json,109,1704214517000
dbfs:/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.geo/partition=0/124714cdee67.geo+0+0000000006.json,124714cdee67.geo+0+0000000006.json,111,1704214519000
dbfs:/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.geo/partition=0/124714cdee67.geo+0+0000000007.json,124714cdee67.geo+0+0000000007.json,114,1704214520000
dbfs:/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.geo/partition=0/124714cdee67.geo+0+0000000008.json,124714cdee67.geo+0+0000000008.json,108,1704214521000
dbfs:/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.geo/partition=0/124714cdee67.geo+0+0000000009.json,124714cdee67.geo+0+0000000009.json,109,1704214523000


### Set spark databricks to not check for delta formats

In [None]:
%sql
SET spark.databricks.delta.formatCheck.enabled=false

key,value
spark.databricks.delta.formatCheck.enabled,False


### Reading in mounted s3 bucket data

Each table read into three seperate dataframes (pin, geo and user). 

In [None]:
# File location and type
# Asterisk(*) indicates reading all the content of the specified file that have .json extension
pin_file_location = "/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.pin/partition=0/124714cdee67.pin+0+*.json" 
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"
# Read in JSONs from mounted S3 bucket
pin_df = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.load(pin_file_location)
# Display Spark dataframe to check its content
display(pin_df)

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
christmas,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,…",1,46k,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,2482,video,"Life on Summerhill | Home, Holiday Decor & DIY Website",Local save in /data/christmas,"Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",FORNT PORCH CHRISTMAS DECORATING IDEAS,08604f20-fa17-4b9a-9949-781717eca6cd
christmas,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,…",1,46k,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,2482,video,"Life on Summerhill | Home, Holiday Decor & DIY Website",Local save in /data/christmas,"Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",FORNT PORCH CHRISTMAS DECORATING IDEAS,08604f20-fa17-4b9a-9949-781717eca6cd
travel,"This Costa Rica itinerary is the ultimate guide to spending two weeks in Costa Rica. Find out about visiting La Fortuna, Arenal, Monteverde, Naranjo, Corcovado National Park, Or…",1,10k,https://i.pinimg.com/originals/30/93/cb/3093cb01d9de2d125fda8ba5e3e41946.jpg,10138,image,"Wanderlust Chloe ✈️ Travel guides, inspo and adventure travel ✈️",Local save in /data/travel,"Costa Rica Travel,Rio Celeste Costa Rica,Dream Vacations,Vacation Spots,Vacation Travel,Travel Pictures,Travel Photos,Fortuna Costa Rica,Costa Rica Pictures","14 Amazing Things To Do In Costa Rica | Volcanoes, Waterfalls, Wildlife And More",927c4658-cc3f-4b92-9b5c-70743d0c238d
travel,"This Costa Rica itinerary is the ultimate guide to spending two weeks in Costa Rica. Find out about visiting La Fortuna, Arenal, Monteverde, Naranjo, Corcovado National Park, Or…",1,10k,https://i.pinimg.com/originals/30/93/cb/3093cb01d9de2d125fda8ba5e3e41946.jpg,10138,image,"Wanderlust Chloe ✈️ Travel guides, inspo and adventure travel ✈️",Local save in /data/travel,"Costa Rica Travel,Rio Celeste Costa Rica,Dream Vacations,Vacation Spots,Vacation Travel,Travel Pictures,Travel Photos,Fortuna Costa Rica,Costa Rica Pictures","14 Amazing Things To Do In Costa Rica | Volcanoes, Waterfalls, Wildlife And More",927c4658-cc3f-4b92-9b5c-70743d0c238d
travel,"This Costa Rica itinerary is the ultimate guide to spending two weeks in Costa Rica. Find out about visiting La Fortuna, Arenal, Monteverde, Naranjo, Corcovado National Park, Or…",1,10k,https://i.pinimg.com/originals/30/93/cb/3093cb01d9de2d125fda8ba5e3e41946.jpg,10138,image,"Wanderlust Chloe ✈️ Travel guides, inspo and adventure travel ✈️",Local save in /data/travel,"Costa Rica Travel,Rio Celeste Costa Rica,Dream Vacations,Vacation Spots,Vacation Travel,Travel Pictures,Travel Photos,Fortuna Costa Rica,Costa Rica Pictures","14 Amazing Things To Do In Costa Rica | Volcanoes, Waterfalls, Wildlife And More",927c4658-cc3f-4b92-9b5c-70743d0c238d
christmas,"My favorite 75+ Neutral Christmas Home Decor for decorating your house during the Holiday Season in earth tones and a farmhouse, rustic style all winter. I love this modern, sim…",1,31k,https://i.pinimg.com/originals/86/84/39/868439dd894969e3abd6a2a8a9fe1e9c.jpg,2604,image,Everyday Wholesome,Local save in /data/christmas,"Colorful Christmas Decorations,Colorful Christmas Tree,Christmas Centerpieces,Christmas Colors,Xmas Colors,Winter Decorations,Christmas Trends,Christmas Inspiration,Christmas Home",75+ Neutral Christmas Home Decor for the Holiday Season in Farmhouse Style using Earth Tones Modern,087b0fa9-f901-4262-aa0a-6caf234d1b35
christmas,"My favorite 75+ Neutral Christmas Home Decor for decorating your house during the Holiday Season in earth tones and a farmhouse, rustic style all winter. I love this modern, sim…",1,31k,https://i.pinimg.com/originals/86/84/39/868439dd894969e3abd6a2a8a9fe1e9c.jpg,2604,image,Everyday Wholesome,Local save in /data/christmas,"Colorful Christmas Decorations,Colorful Christmas Tree,Christmas Centerpieces,Christmas Colors,Xmas Colors,Winter Decorations,Christmas Trends,Christmas Inspiration,Christmas Home",75+ Neutral Christmas Home Decor for the Holiday Season in Farmhouse Style using Earth Tones Modern,087b0fa9-f901-4262-aa0a-6caf234d1b35
diy-and-crafts,"This post may contain affiliate links, read our Disclosure Policy for more information. As an Amazon Associate I earn from qualifying purchases, thank you! Make some cute handpr…",1,892k,https://i.pinimg.com/originals/ff/fe/38/fffe384f3ec18a0d87cb2d80cc8c1499.jpg,3156,image,Michelle {CraftyMorning.com},Local save in /data/diy-and-crafts,"Christmas Gifts For Parents,Christmas Decorations For Kids,Christmas Crafts For Toddlers,Preschool Christmas,Christmas Crafts For Gifts,Christmas Activities,Toddler Crafts,Kids Christmas,Christmas Feeling",Handprint Reindeer Ornaments - Crafty Morning,fa6e31a4-18c2-4eca-a6d8-e903eee2c2a4
diy-and-crafts,"This post may contain affiliate links, read our Disclosure Policy for more information. As an Amazon Associate I earn from qualifying purchases, thank you! Make some cute handpr…",1,892k,https://i.pinimg.com/originals/ff/fe/38/fffe384f3ec18a0d87cb2d80cc8c1499.jpg,3156,image,Michelle {CraftyMorning.com},Local save in /data/diy-and-crafts,"Christmas Gifts For Parents,Christmas Decorations For Kids,Christmas Crafts For Toddlers,Preschool Christmas,Christmas Crafts For Gifts,Christmas Activities,Toddler Crafts,Kids Christmas,Christmas Feeling",Handprint Reindeer Ornaments - Crafty Morning,fa6e31a4-18c2-4eca-a6d8-e903eee2c2a4
diy-and-crafts,"This post may contain affiliate links, read our Disclosure Policy for more information. As an Amazon Associate I earn from qualifying purchases, thank you! Make some cute handpr…",1,892k,https://i.pinimg.com/originals/ff/fe/38/fffe384f3ec18a0d87cb2d80cc8c1499.jpg,3156,image,Michelle {CraftyMorning.com},Local save in /data/diy-and-crafts,"Christmas Gifts For Parents,Christmas Decorations For Kids,Christmas Crafts For Toddlers,Preschool Christmas,Christmas Crafts For Gifts,Christmas Activities,Toddler Crafts,Kids Christmas,Christmas Feeling",Handprint Reindeer Ornaments - Crafty Morning,fa6e31a4-18c2-4eca-a6d8-e903eee2c2a4


In [None]:
# File location and type
# Asterisk(*) indicates reading all the content of the specified file that have .json extension
geo_file_location = "/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.geo/partition=0/124714cdee67.geo+0+*.json" 
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"
# Read in JSONs from mounted S3 bucket
geo_df = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.load(geo_file_location)
# Display Spark dataframe to check its content
display(geo_df)

country,ind,latitude,longitude,timestamp
Antarctica (the territory South of 60 deg S),2418,-88.4642,-171.061,2022-05-27 11:30:59
Antarctica (the territory South of 60 deg S),2418,-88.4642,-171.061,2022-05-27 11:30:59
Antarctica (the territory South of 60 deg S),2418,-88.4642,-171.061,2022-05-27 11:30:59
Cocos (Keeling) Islands,10794,-89.5236,-154.567,2022-01-01 02:26:50
Cocos (Keeling) Islands,10794,-89.5236,-154.567,2022-01-01 02:26:50
Cocos (Keeling) Islands,10794,-89.5236,-154.567,2022-01-01 02:26:50
Central African Republic,2074,-52.3213,-50.11,2019-11-03 05:41:59
British Virgin Islands,2293,-87.7946,-159.647,2022-03-21 10:46:53
Central African Republic,2074,-52.3213,-50.11,2019-11-03 05:41:59
Central African Republic,2074,-52.3213,-50.11,2019-11-03 05:41:59


In [None]:
# File location and type
# Asterisk(*) indicates reading all the content of the specified file that have .json extension
user_file_location = "/mnt/s3_pin_bucket_124714cdee67/topics/124714cdee67.user/partition=0/124714cdee67.user+0+*.json" 
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"
# Read in JSONs from mounted S3 bucket
user_df = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.load(user_file_location)
# Display Spark dataframe to check its content
display(user_df)

age,date_joined,first_name,ind,last_name
27,2016-03-08 13:38:37,Christopher,2015,Bradshaw
27,2016-03-08 13:38:37,Christopher,2015,Bradshaw
39,2016-06-29 20:43:59,Christina,6398,Davenport
20,2015-10-23 04:13:23,Alexandria,3599,Alvarado
39,2016-06-29 20:43:59,Christina,6398,Davenport
20,2015-10-23 04:13:23,Alexandria,3599,Alvarado
20,2015-12-01 15:08:31,Christopher,5076,Butler
39,2017-07-19 07:12:04,Michelle,7790,Gutierrez
49,2016-04-22 20:36:02,Brittany,10509,Thompson
20,2015-12-01 15:08:31,Christopher,5076,Butler


### Unmount S3 bucket from Databricks

In [None]:
# Mount name for the bucket
#MOUNT_NAME = "/mnt/s3_pin_bucket_124714cdee67"
# to unmount
#dbutils.fs.unmount(MOUNT_NAME)

## Cleaning Pinterest Post Data
To clean the df_pin DataFrame you should perform the following transformations:

1. Replace empty entries and entries with no relevant data in each column with Nones
2. Perform the necessary transformations on the follower_count to ensure every entry is a number. Make sure the data type of this column is an int.
3. Ensure that each column containing numeric data has a numeric data type
4. Clean the data in the save_location column to include only the save location path
5. Rename the index column to ind.
6. Reorder the DataFrame columns to have the following column order:
- ind
- unique_id
- title
- description
- follower_count
- poster_name
- tag_list
- is_image_or_video
- image_src
- save_location
- category

In [None]:
# drop dulpicate rows
pin_df = pin_df.dropDuplicates()

In [None]:
#check number of rows after dropping duplicates
row = pin_df.count()
print(f'Number of Rows are: {row}')
# 204 to 88 rows after dropping duplicate rows produced from streaming data multiple times during testing

In [None]:
display(pin_df)

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
quotes,summcoco gives you inspiration for the women fashion trends you want. Thinking about a new look or lifestyle? This is your ultimate resource to get the hottest trends. 45 Top Li…,1,306k,https://i.pinimg.com/originals/bb/c0/e6/bbc0e6a797079505f11ac12bcb0b8c66.jpg,7922,image,"Sumcoco | Decor Ideas, Hairstyles, Nails Fashion Advice",Local save in /data/quotes,"Life Quotes Love,Inspirational Quotes About Love,Mood Quotes,Motivational Quotes,Tears Quotes,Quotes About Sadness,Deep Quotes About Life,Quotes Quotes,Quote Life",45 Top Life Quotes School Did Not Teach You,a584581c-1b38-4731-a1cc-f36115ecf229
event-planning,"Personalize your event or shop with a customized neon sign. Make a statement with your own custom vibes! This light is 32 -40 inches (80cm-100cm) if you need something bigger, p…",1,111,https://i.pinimg.com/originals/e9/c0/7c/e9c07cf0cf16cab23764a36718ab76c1.jpg,4508,image,Life of Neon | Custom Neon Light Signs | Home Decor Wall Art,Local save in /data/event-planning,"Our Wedding,Wedding Venues,Dream Wedding,Wedding Cakes,Church Wedding,Wedding Flowers,Lace Wedding,Wedding Rings,Wedding Dresses",Custom Event and Shop Neon Sign Lights - Event & Shop,9064f4a2-2753-476c-815e-db360f45a93e
christmas,Here are the best DIY Christmas Centerpieces ideas perfect for your Christmas & holiday season home decor. From Christmas Vignettes to Table Centerpieces.,1,500k,https://i.pinimg.com/originals/aa/6d/0f/aa6d0f44d7c1c96b998cb9aa6c4446b8.png,2418,image,HikenDip,Local save in /data/christmas,"Farmhouse Christmas Decor,Rustic Christmas,Christmas Time,Vintage Christmas,Xmas,Primitive Christmas Crafts,Christmas Vignette,Indoor Christmas Decorations,Diy Christmas Ornaments",100 DIY Christmas Centerpieces You'll Love To Decorate Your Home With For The Christmas Season - Hike n Dip,da8745a6-5160-46c4-877d-181d50a729fd
travel,"This Costa Rica itinerary is the ultimate guide to spending two weeks in Costa Rica. Find out about visiting La Fortuna, Arenal, Monteverde, Naranjo, Corcovado National Park, Or…",1,10k,https://i.pinimg.com/originals/30/93/cb/3093cb01d9de2d125fda8ba5e3e41946.jpg,10138,image,"Wanderlust Chloe ✈️ Travel guides, inspo and adventure travel ✈️",Local save in /data/travel,"Costa Rica Travel,Rio Celeste Costa Rica,Dream Vacations,Vacation Spots,Vacation Travel,Travel Pictures,Travel Photos,Fortuna Costa Rica,Costa Rica Pictures","14 Amazing Things To Do In Costa Rica | Volcanoes, Waterfalls, Wildlife And More",927c4658-cc3f-4b92-9b5c-70743d0c238d
travel,"See families traveling all the time and wonder, ""how the heck do they afford this?"" Read 10 mistakes you might be making, and what you should do instead.",1,9k,https://i.pinimg.com/originals/0a/49/fb/0a49fbcec746c4219d3a6f30834f378e.jpg,10119,image,OUR NEXT ADVENTURE | family travel blog,Local save in /data/travel,"Family Vacation Destinations,Vacation Trips,Travel Destinations,Vacation Ideas,Cheap Family Vacations,Vacation Travel,Best Family Vacation Spots,Vacation Quotes,Vacation Memories",How to Afford Family Travel: 10 Mistakes You're Making (and what to do instead) | Our Next Adventure,40eab9ba-7812-4f26-baca-35a6bed95a9f
christmas,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,…",1,46k,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,2482,video,"Life on Summerhill | Home, Holiday Decor & DIY Website",Local save in /data/christmas,"Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",FORNT PORCH CHRISTMAS DECORATING IDEAS,08604f20-fa17-4b9a-9949-781717eca6cd
art,"Use your mini world figures to create this beautiful African sunset. Your kids will love learning about shadows, angles and distortion in this fun art and STEM activity for kids.",1,4k,https://i.pinimg.com/originals/e3/aa/35/e3aa350f8f104d0e59f26d7f17ea7461.png,771,image,Taming Little Monsters - Fun Activities for Kids,Local save in /data/art,"African Art Projects,Cool Art Projects,Projects For Kids,African Art For Kids,African Crafts Kids,Art Club Projects,Art Education Projects,Tracing Art,African Sunset",African Sunset Shadow Tracing Art - Taming Little Monsters,a5021766-a8aa-4dc7-9857-4da6b8e3dc1a
quotes,Trying to create your dream life but don't know where to start?! These vision board ideas are a great way to manifest a new you this year,1,42k,https://i.pinimg.com/originals/36/d1/be/36d1be632cbf9b6c8e377a052b31d064.jpg,8312,image,TheFab20s | Travel+Food+DIY+Listicles,Local save in /data/quotes,"Positive Self Affirmations,Positive Affirmations Quotes,Affirmation Quotes,Quotes Positive,Motivational Quotes For Success Positivity,Business Success Quotes,Positive Vibes,Motivational Quotes For Women,Affirmations For Love",8 Vision Board Ideas To Manifest Your Dreams - TheFab20s,ca3c9bb0-7281-4b9b-8abf-201da0b68d62
education,"Hi everyone! As a teacher using the Orton-Gillingham approach, I am constantly looking for phonics activities that my students will find fun and engaging. Using Orton-Gillingham…",1,22k,https://i.pinimg.com/originals/58/8e/38/588e380b19942a71a86a69d9c9973d25.png,4076,image,The Literacy Nest,Local save in /data/education,"Literacy Games,Kindergarten Activities,Literacy Centers,Fun Phonics Activities,Listening Activities,Vocabulary Games,Literacy Stations,Letter Activities,Montessori Activities",Phonics Activities Your Kids Will Love - The Literacy Nest,3a52d364-7c04-47cb-a3e5-56d9e2b77528
christmas,"My favorite 75+ Neutral Christmas Home Decor for decorating your house during the Holiday Season in earth tones and a farmhouse, rustic style all winter. I love this modern, sim…",1,31k,https://i.pinimg.com/originals/86/84/39/868439dd894969e3abd6a2a8a9fe1e9c.jpg,2604,image,Everyday Wholesome,Local save in /data/christmas,"Colorful Christmas Decorations,Colorful Christmas Tree,Christmas Centerpieces,Christmas Colors,Xmas Colors,Winter Decorations,Christmas Trends,Christmas Inspiration,Christmas Home",75+ Neutral Christmas Home Decor for the Holiday Season in Farmhouse Style using Earth Tones Modern,087b0fa9-f901-4262-aa0a-6caf234d1b35


In [None]:
# Replacing missing entries and irrelevant data with None
clean_pin_df = pin_df.replace({'User Info Error': None}, subset=['follower_count'])
clean_pin_df = clean_pin_df.replace({'No description available Story format': None}, subset=['description'])
clean_pin_df = clean_pin_df.replace({'Image src error.': None}, subset=['image_src'])
clean_pin_df = clean_pin_df.replace({'N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e': None}, subset=['tag_list'])
clean_pin_df = clean_pin_df.replace({'No Title Data Available': None}, subset=['title'])

In [None]:
# Cast 'follower_count' to integer data type, but first convert any "k" and "M" to number
clean_pin_df = clean_pin_df.withColumn(
    "follower_count",
    when(
        col("follower_count").contains("k"),
        regexp_extract(col("follower_count"), "(\d+(.\d+)?)", 1).cast(DoubleType())
        * 1000
    )
    .when(
        col("follower_count").contains("M"),
        regexp_extract(col("follower_count"), "(\d+(.\d+)?)", 1).cast(DoubleType())
        * 1000000
    )
    .otherwise(regexp_extract(col("follower_count"), "(\d+(.\d+)?)", 1).cast("integer"))
    .cast("integer")
)

In [None]:
display(clean_pin_df)

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
event-planning,"Personalize your event or shop with a customized neon sign. Make a statement with your own custom vibes! This light is 32 -40 inches (80cm-100cm) if you need something bigger, p…",1,111.0,https://i.pinimg.com/originals/e9/c0/7c/e9c07cf0cf16cab23764a36718ab76c1.jpg,4508,image,Life of Neon | Custom Neon Light Signs | Home Decor Wall Art,/data/event-planning,"Our Wedding,Wedding Venues,Dream Wedding,Wedding Cakes,Church Wedding,Wedding Flowers,Lace Wedding,Wedding Rings,Wedding Dresses",Custom Event and Shop Neon Sign Lights - Event & Shop,9064f4a2-2753-476c-815e-db360f45a93e
travel,"Are you traveling to Paris during the summer? Find out what to do in Paris, France during the summer. Fun summertime activities in Paris. Enjoy the incredible outdoors when trav…",1,3000.0,https://i.pinimg.com/originals/6c/4c/90/6c4c90bba27ebf8c8bfe4c1acfb9f07a.jpg,9979,image,Petite in Paris,/data/travel,"Torre Eiffel Paris,Tour Eiffel,Picnic In Paris,Hello France,Voyage Europe,Destination Voyage,Beautiful Places To Travel,Travel Aesthetic,Paris Travel",Paris in the Summer. 10 fun things to do in Paris in the Summertime • Petite in Paris,2b2abc85-fc51-481f-8ae6-17681993da28
art,This bee directed drawing and associated pages will help you create a fun and creative Valentine's Day Directed Drawing Art Project activity for your class.Choose to do a painti…,1,1000000.0,https://i.pinimg.com/originals/49/ff/2e/49ff2e83c0cefdd37213f6084c6f0566.jpg,159,image,Teachers Pay Teachers,/data/art,"Classroom Art Projects,School Art Projects,Art Classroom,Art Projects For Kindergarteners,Spring Art Projects,Classroom Posters,Valentines Art Lessons,Valentines Day Activities,Grade 1 Art",Valentine's Day Bee Directed Drawing {Art Project},841a161a-47b8-4161-884d-adeb67a28b1e
education,"Hi everyone! As a teacher using the Orton-Gillingham approach, I am constantly looking for phonics activities that my students will find fun and engaging. Using Orton-Gillingham…",1,22000.0,https://i.pinimg.com/originals/58/8e/38/588e380b19942a71a86a69d9c9973d25.png,4076,image,The Literacy Nest,/data/education,"Literacy Games,Kindergarten Activities,Literacy Centers,Fun Phonics Activities,Listening Activities,Vocabulary Games,Literacy Stations,Letter Activities,Montessori Activities",Phonics Activities Your Kids Will Love - The Literacy Nest,3a52d364-7c04-47cb-a3e5-56d9e2b77528
travel,"Although you'd think Greek islands are fairly similar, you'd be completely wrong! Each island has it's own personality and appeal. Here are the 8 Best Greek Islands to visit, es…",1,42000.0,https://i.pinimg.com/originals/06/1d/ce/061dce38929dec8e74844442116bea4a.jpg,9759,image,TheFab20s | Travel+Food+DIY+Listicles,/data/travel,"Greek Islands To Visit,Best Greek Islands,Greece Islands,Cool Places To Visit,Places To Go,Best Places In Portugal,Copenhagen Travel,Paros Island,Santorini Island",8 Best Greek Islands You Have To Visit - TheFab20s,d105eb6e-0f9f-46e7-8d02-d24b62f6ae90
education,"Podcasts for Teachers or Parents of Teenagers: Teaching teens middle school and high school can feel joyful and rewarding most days, but can also frustrate you with one challeng…",1,25000.0,https://i.pinimg.com/originals/50/19/31/501931a27ee4d076658980851b995b2c.jpg,4315,image,Math Giraffe,/data/education,"Middle School Classroom,High School Students,High School Teachers,Middle School Tips,High School Counseling,Ela Classroom,High School Science,Future Classroom,Google Classroom",Podcasts for Teachers or Parents of Teenagers,21b59ba9-829d-4c33-8c27-4cd4c56d26b8
education,"This book presents 10 inspirational case studies of how centre leaders, principals and leadership teams in high-needs New Zealand educational settings have enacted leadership to…",1,2000000.0,https://i.pinimg.com/originals/0e/ea/c4/0eeac457780bbe43fcc5e9eaabd80f62.jpg,3599,image,Walmart,/data/education,"Research Studies,Educational Leadership,Education System,Secondary School,Try It Free,Book Format,Social Justice,Young People,Paperback Books",Educational Leadership in Aotearoa New Zealand : Issues of Context and Social Justice,ff0dd945-dafa-411c-8cef-eb43e374e815
art,"Easy to follow steps for this easy DIY wall art. Acrylic Paint Pouring with a blow dryer, Dutch paint pour. Written steps and full step by step video to help you do it too! 👍",1,52000.0,https://i.pinimg.com/videos/thumbnails/originals/d5/5e/fa/d55efa6ef50d35dac425cea935f39c89.0000001.jpg,427,video,Abbotts At Home,/data/art,"Acrylic Pouring Art,Acrylic Wall Art,Acrylic Pouring Techniques,Acrylic Paintings,Art Paintings,Marble Art,Marble Painting,Pour Painting,Large Canvas Art",DIY Acrylic Paint Pouring Wall Art - Abbotts At Home,4a455340-09a2-4370-ad86-73d7964603db
diy-and-crafts,"DIY Dollar Store Valentine’s Day Heart Wreath Decoration. There are so many great Valentine crafts at the Dollar Tree right now! During our last trip, we picked up a few supplie…",1,9000.0,https://i.pinimg.com/originals/dc/f6/8a/dcf68adcc63c339c24fa5664f1115994.png,2698,image,South Lumina Style,/data/diy-and-crafts,"Valentine Day Wreaths,Valentines Day Hearts,Valentines Day Decorations,Valentine Day Crafts,Holiday Crafts,Diy Christmas,Christmas Wreaths,Valentine Tree,Printable Valentine",DIY Dollar Tree Valentine's Day Wreath,73f16302-4871-486a-8836-947530526337
christmas,Christmas Trees From Pallet Wood | Holiday DIY: Deck the yard with some fun outdoor Christmas Trees! We made these merry and bright decorations from two old pallets we had lying…,1,3000000.0,https://i.pinimg.com/originals/64/7b/ca/647bca35169b7c144604116c64bcba8a.png,1704,image,Instructables,/data/christmas,"Pallet Wood Christmas Tree,Wooden Christmas Crafts,Diy Christmas Tree,Christmas Projects,Holiday Crafts,Wooden Xmas Trees,Different Christmas Trees,Pallet Tree,Christmas Kitchen",Christmas Trees From Pallet Wood | Holiday DIY,5fbf9863-fb79-477c-a5b6-540c3020a55f


In [None]:
# cast any numeric columns to a numeric data type 
clean_pin_df = clean_pin_df.withColumn("downloaded", clean_pin_df["downloaded"].cast("integer")) \
      .withColumn("index", clean_pin_df["index"].cast("integer"))

In [None]:
# clean save location column to contain only relative path
clean_pin_df = clean_pin_df.withColumn("save_location", regexp_replace("save_location", "Local save in ", ""))

In [None]:
clean_pin_df = clean_pin_df.withColumnRenamed("index", "ind")

In [None]:
# reorder columns of cleaned pinterest data dataframe
clean_pin_df = clean_pin_df.select("ind", "unique_id", "title", "description", "follower_count", "poster_name", "tag_list", "is_image_or_video", "image_src", "save_location", "category")

clean_pin_df.printSchema()

## Cleaning geolocation data

1. Create a new column coordinates that contains an array based on the latitude and longitude columns
2. Drop the latitude and longitude columns from the DataFrame
3. Convert the timestamp column from a string to a timestamp data type
4. Reorder the DataFrame columns to have the following column order:
 - ind
 - country
 - coordinates
 - timestamp

In [None]:
# drop dulpicate rows
geo_df = geo_df.dropDuplicates()

In [None]:
#check number of rows after dropping duplicates
row = geo_df.count()
print(f'Number of Rows are: {row}')
# 202 to 88 rows after dropping duplicate rows produced from streaming data multiple times during testing

In [None]:
display(geo_df)

country,ind,latitude,longitude,timestamp
Antigua and Barbuda,7922,-88.0974,-172.052,2021-01-27 09:14:19
British Virgin Islands,2293,-87.7946,-159.647,2022-03-21 10:46:53
Antarctica (the territory South of 60 deg S),2418,-88.4642,-171.061,2022-05-27 11:30:59
Cocos (Keeling) Islands,10794,-89.5236,-154.567,2022-01-01 02:26:50
Antigua and Barbuda,2604,-80.8933,-104.972,2018-12-01 09:23:35
Central African Republic,2074,-52.3213,-50.11,2019-11-03 05:41:59
Netherlands Antilles,603,14.0083,-141.603,2019-06-25 05:13:01
Antigua and Barbuda,8606,-88.0974,-172.052,2021-03-28 14:54:07
Saint Kitts and Nevis,10663,-27.3474,-162.83,2019-07-25 18:53:51
Gibraltar,10509,-67.187,-24.2977,2019-03-06 12:21:56


In [None]:
clean_geo_df = geo_df \
    .withColumn("coordinates", array("latitude", "longitude")) \
    .drop("latitude", "longitude") \
    .withColumn("timestamp", col("timestamp").cast(TimestampType())) \
    .select("ind", "country", "coordinates", "timestamp")        

In [None]:
display(clean_geo_df)

ind,country,coordinates,timestamp
6145,Mozambique,"List(-65.9079, -143.845)",2019-12-05T02:09:44.000+0000
4508,Philippines,"List(69.1858, -76.0761)",2019-01-04T11:15:27.000+0000
9759,American Samoa,"List(-77.9744, -106.258)",2017-12-30T13:05:49.000+0000
7790,Papua New Guinea,"List(-43.692, 64.9839)",2018-07-31T08:19:15.000+0000
2923,Cote d'Ivoire,"List(-84.6302, -164.507)",2019-09-08T22:53:09.000+0000
427,Isle of Man,"List(-66.9418, -30.0087)",2020-04-22T03:08:50.000+0000
4315,Cote d'Ivoire,"List(-45.8508, 66.1003)",2019-12-15T03:51:28.000+0000
8312,American Samoa,"List(-77.9744, -106.258)",2021-04-25T15:56:29.000+0000
6844,New Caledonia,"List(-22.6915, 5.69245)",2021-06-06T21:53:11.000+0000
3599,Afghanistan,"List(-88.5478, -174.971)",2019-03-03T06:13:41.000+0000


## Cleaning user data

1. Create a new column user_name that concatenates the information found in the first_name and last_name columns
2. Drop the first_name and last_name columns from the DataFrame
3. Convert the date_joined column from a string to a timestamp data type
4. Reorder the DataFrame columns to have the following column order:
 - ind
 - user_name
 - age
 - date_joined

In [None]:
# drop dulpicate rows
user_df = user_df.dropDuplicates()

In [None]:
#check number of rows after dropping duplicates
row = user_df.count()
print(f'Number of Rows are: {row}')
# 205 to 88 rows after dropping duplicate rows produced from streaming data multiple times during testing

In [None]:
display(user_df)

age,date_joined,first_name,ind,last_name
59,2017-06-29 22:35:17,Michael,4137,Decker
23,2015-11-25 13:36:22,Corey,6063,Andrews
32,2016-06-08 22:10:13,Donna,1268,Campbell
20,2015-12-17 08:43:40,Adam,3800,Armstrong
32,2016-03-10 04:11:31,Brittany,771,Butler
21,2016-01-03 15:42:12,Annette,2074,Forbes
26,2015-12-20 10:28:00,Brendan,9875,Joseph
58,2016-06-03 23:35:30,Michael,4508,Carter
34,2016-12-22 00:02:02,Thomas,10794,Turner
54,2016-05-15 04:22:01,Alexis,1555,Bennett


In [None]:
clean_user_df = user_df \
    .withColumn("user_name", concat("first_name",  lit(" "), "last_name")) \
    .drop("first_name", "last_name") \
    .withColumn("date_joined", col("date_joined").cast(TimestampType())) \
    .select("ind", "user_name", "age", "date_joined")

In [None]:
display(clean_user_df)

ind,user_name,age,date_joined
8930,Andrew Anderson,23,2015-11-28T11:52:37.000+0000
4315,Michelle Prince,36,2015-12-20T16:38:13.000+0000
6566,Alexander Perez,31,2017-08-04T14:30:22.000+0000
9672,Jennifer Hudson,22,2016-02-11T20:46:04.000+0000
10794,Thomas Turner,34,2016-12-22T00:02:02.000+0000
10625,Christian Lang,32,2017-10-10T20:09:33.000+0000
2959,David Griffith,20,2016-01-07T19:49:22.000+0000
9074,Aaron Alexander,21,2015-10-25T07:36:08.000+0000
3729,Richard Edwards,52,2016-02-07T20:00:25.000+0000
10552,Michael Hunter,40,2017-05-16T07:09:21.000+0000


## Querying Pinterest Data


Find the most popular Pinterest category people post to based on their country.


Your query should return a DataFrame that contains the following columns:

 - country
 - category
 - category_count, (a new column containing the desired query output)

In [None]:
# Join all cleaned dataframes together by ind
joined_df = clean_pin_df.join(clean_geo_df, ["ind"]) \
            .join(clean_user_df, ["ind"])

In [None]:
display(joined_df)

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category,country,coordinates,timestamp,user_name,age,date_joined
7922,a584581c-1b38-4731-a1cc-f36115ecf229,45 Top Life Quotes School Did Not Teach You,summcoco gives you inspiration for the women fashion trends you want. Thinking about a new look or lifestyle? This is your ultimate resource to get the hottest trends. 45 Top Li…,306000.0,"Sumcoco | Decor Ideas, Hairstyles, Nails Fashion Advice","Life Quotes Love,Inspirational Quotes About Love,Mood Quotes,Motivational Quotes,Tears Quotes,Quotes About Sadness,Deep Quotes About Life,Quotes Quotes,Quote Life",image,https://i.pinimg.com/originals/bb/c0/e6/bbc0e6a797079505f11ac12bcb0b8c66.jpg,/data/quotes,quotes,Antigua and Barbuda,"List(-88.0974, -172.052)",2021-01-27T09:14:19.000+0000,Denise Adams,21,2015-11-12T06:21:36.000+0000
4508,9064f4a2-2753-476c-815e-db360f45a93e,Custom Event and Shop Neon Sign Lights - Event & Shop,"Personalize your event or shop with a customized neon sign. Make a statement with your own custom vibes! This light is 32 -40 inches (80cm-100cm) if you need something bigger, p…",111.0,Life of Neon | Custom Neon Light Signs | Home Decor Wall Art,"Our Wedding,Wedding Venues,Dream Wedding,Wedding Cakes,Church Wedding,Wedding Flowers,Lace Wedding,Wedding Rings,Wedding Dresses",image,https://i.pinimg.com/originals/e9/c0/7c/e9c07cf0cf16cab23764a36718ab76c1.jpg,/data/event-planning,event-planning,Philippines,"List(69.1858, -76.0761)",2019-01-04T11:15:27.000+0000,Michael Carter,58,2016-06-03T23:35:30.000+0000
2418,da8745a6-5160-46c4-877d-181d50a729fd,100 DIY Christmas Centerpieces You'll Love To Decorate Your Home With For The Christmas Season - Hike n Dip,Here are the best DIY Christmas Centerpieces ideas perfect for your Christmas & holiday season home decor. From Christmas Vignettes to Table Centerpieces.,500000.0,HikenDip,"Farmhouse Christmas Decor,Rustic Christmas,Christmas Time,Vintage Christmas,Xmas,Primitive Christmas Crafts,Christmas Vignette,Indoor Christmas Decorations,Diy Christmas Ornaments",image,https://i.pinimg.com/originals/aa/6d/0f/aa6d0f44d7c1c96b998cb9aa6c4446b8.png,/data/christmas,christmas,Antarctica (the territory South of 60 deg S),"List(-88.4642, -171.061)",2022-05-27T11:30:59.000+0000,Amanda Adams,20,2015-10-21T08:27:36.000+0000
10138,927c4658-cc3f-4b92-9b5c-70743d0c238d,"14 Amazing Things To Do In Costa Rica | Volcanoes, Waterfalls, Wildlife And More","This Costa Rica itinerary is the ultimate guide to spending two weeks in Costa Rica. Find out about visiting La Fortuna, Arenal, Monteverde, Naranjo, Corcovado National Park, Or…",10000.0,"Wanderlust Chloe ✈️ Travel guides, inspo and adventure travel ✈️","Costa Rica Travel,Rio Celeste Costa Rica,Dream Vacations,Vacation Spots,Vacation Travel,Travel Pictures,Travel Photos,Fortuna Costa Rica,Costa Rica Pictures",image,https://i.pinimg.com/originals/30/93/cb/3093cb01d9de2d125fda8ba5e3e41946.jpg,/data/travel,travel,Austria,"List(-72.142, -74.3545)",2019-08-03T00:59:29.000+0000,Carol Silva,22,2015-12-31T14:57:02.000+0000
10119,40eab9ba-7812-4f26-baca-35a6bed95a9f,How to Afford Family Travel: 10 Mistakes You're Making (and what to do instead) | Our Next Adventure,"See families traveling all the time and wonder, ""how the heck do they afford this?"" Read 10 mistakes you might be making, and what you should do instead.",9000.0,OUR NEXT ADVENTURE | family travel blog,"Family Vacation Destinations,Vacation Trips,Travel Destinations,Vacation Ideas,Cheap Family Vacations,Vacation Travel,Best Family Vacation Spots,Vacation Quotes,Vacation Memories",image,https://i.pinimg.com/originals/0a/49/fb/0a49fbcec746c4219d3a6f30834f378e.jpg,/data/travel,travel,Christmas Island,"List(-74.5431, -162.795)",2020-10-22T01:59:58.000+0000,Chelsea Gonzalez,43,2016-07-21T15:25:08.000+0000
2482,08604f20-fa17-4b9a-9949-781717eca6cd,FORNT PORCH CHRISTMAS DECORATING IDEAS,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,…",46000.0,"Life on Summerhill | Home, Holiday Decor & DIY Website","Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",video,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,/data/christmas,christmas,Bermuda,"List(63.4563, -164.709)",2019-09-13T08:20:13.000+0000,David Moss,22,2016-03-01T07:11:48.000+0000
771,a5021766-a8aa-4dc7-9857-4da6b8e3dc1a,African Sunset Shadow Tracing Art - Taming Little Monsters,"Use your mini world figures to create this beautiful African sunset. Your kids will love learning about shadows, angles and distortion in this fun art and STEM activity for kids.",4000.0,Taming Little Monsters - Fun Activities for Kids,"African Art Projects,Cool Art Projects,Projects For Kids,African Art For Kids,African Crafts Kids,Art Club Projects,Art Education Projects,Tracing Art,African Sunset",image,https://i.pinimg.com/originals/e3/aa/35/e3aa350f8f104d0e59f26d7f17ea7461.png,/data/art,art,Montserrat,"List(-29.1712, -107.111)",2018-06-21T08:42:57.000+0000,Brittany Butler,32,2016-03-10T04:11:31.000+0000
8312,ca3c9bb0-7281-4b9b-8abf-201da0b68d62,8 Vision Board Ideas To Manifest Your Dreams - TheFab20s,Trying to create your dream life but don't know where to start?! These vision board ideas are a great way to manifest a new you this year,42000.0,TheFab20s | Travel+Food+DIY+Listicles,"Positive Self Affirmations,Positive Affirmations Quotes,Affirmation Quotes,Quotes Positive,Motivational Quotes For Success Positivity,Business Success Quotes,Positive Vibes,Motivational Quotes For Women,Affirmations For Love",image,https://i.pinimg.com/originals/36/d1/be/36d1be632cbf9b6c8e377a052b31d064.jpg,/data/quotes,quotes,American Samoa,"List(-77.9744, -106.258)",2021-04-25T15:56:29.000+0000,Daniel Brooks,25,2015-11-19T21:24:33.000+0000
4076,3a52d364-7c04-47cb-a3e5-56d9e2b77528,Phonics Activities Your Kids Will Love - The Literacy Nest,"Hi everyone! As a teacher using the Orton-Gillingham approach, I am constantly looking for phonics activities that my students will find fun and engaging. Using Orton-Gillingham…",22000.0,The Literacy Nest,"Literacy Games,Kindergarten Activities,Literacy Centers,Fun Phonics Activities,Listening Activities,Vocabulary Games,Literacy Stations,Letter Activities,Montessori Activities",image,https://i.pinimg.com/originals/58/8e/38/588e380b19942a71a86a69d9c9973d25.png,/data/education,education,Mauritania,"List(-67.2157, 27.8139)",2019-06-07T20:13:50.000+0000,Larry Pineda,20,2015-10-23T22:47:39.000+0000
2604,087b0fa9-f901-4262-aa0a-6caf234d1b35,75+ Neutral Christmas Home Decor for the Holiday Season in Farmhouse Style using Earth Tones Modern,"My favorite 75+ Neutral Christmas Home Decor for decorating your house during the Holiday Season in earth tones and a farmhouse, rustic style all winter. I love this modern, sim…",31000.0,Everyday Wholesome,"Colorful Christmas Decorations,Colorful Christmas Tree,Christmas Centerpieces,Christmas Colors,Xmas Colors,Winter Decorations,Christmas Trends,Christmas Inspiration,Christmas Home",image,https://i.pinimg.com/originals/86/84/39/868439dd894969e3abd6a2a8a9fe1e9c.jpg,/data/christmas,christmas,Antigua and Barbuda,"List(-80.8933, -104.972)",2018-12-01T09:23:35.000+0000,Ashley Evans,30,2016-02-21T12:54:01.000+0000


Find the most popular Pinterest category people post to based on their country.


Your query should return a DataFrame that contains the following columns:

 - country
 - category
 - category_count, (a new column containing the desired query output)

In [None]:
popular_category = joined_df.groupBy("country", "category").agg(count("category").alias("category_count")).orderBy("category_count", ascending=False)
display(popular_category)

country,category,category_count
Isle of Man,art,2
Austria,travel,2
Australia,mens-fashion,2
Armenia,diy-and-crafts,2
India,travel,1
Central African Republic,christmas,1
French Guiana,quotes,1
Cambodia,diy-and-crafts,1
Madagascar,event-planning,1
Mozambique,home-decor,1


Find how many posts each category had between 2018 and 2022.

Your query should return a DataFrame that contains the following columns:

 - post_year, a new column that contains only the year from the timestamp column
 - category
 - category_count, a new column containing the desired query output

In [None]:
category_num_post = joined_df \
    .withColumn("post_year", year('timestamp')) \
    .groupBy("post_year", "category") \
    .agg(count("category").alias("category_count")) \
    .orderBy("category_count", ascending=False)

display(category_num_post)

post_year,category,category_count
2018,art,4
2019,education,4
2018,beauty,4
2021,tattoos,3
2019,travel,3
2018,travel,3
2021,diy-and-crafts,3
2019,event-planning,3
2018,christmas,3
2020,art,3


1. For each country find the user with the most followers.

Your query should return a DataFrame that contains the following columns:
 - country
 - poster_name
 - follower_count

2. Based on the above query, find the country with the user with most followers.

Your query should return a DataFrame that contains the following columns:

 - country
 - follower_count

This DataFrame should have only one entry.

In [None]:
from pyspark.sql.window import Window

windowSpec = Window.partitionBy("country").orderBy(col("follower_count").desc())

user_country_followers = joined_df \
    .withColumn("max_follower_count", max("follower_count").over(windowSpec)) \
    .where(col("follower_count") == col("max_follower_count")) \
    .select("country", "user_name", "follower_count") \
    .dropDuplicates() 

display(user_country_followers)

country,user_name,follower_count
Afghanistan,Alexandria Alvarado,2000000
Albania,Christina Davenport,117000
Algeria,Alexis Bennett,326000
American Samoa,Aaron Abbott,5000000
Andorra,Alison Bell,1000000
Angola,David Griffith,502000
Anguilla,Corey Andrews,92000
Antarctica (the territory South of 60 deg S),Amanda Adams,500000
Antigua and Barbuda,Denise Adams,306000
Argentina,Andrew Anderson,800000


In [None]:
highest_follower_country = user_country_followers \
    .groupBy("country") \
    .agg(max("follower_count").alias("follower_count")) \
    .orderBy(desc("follower_count")) \
    .limit(1)

display(highest_follower_country)

country,follower_count
Azerbaijan,6000000


What is the most popular category people post to based on the following age groups:

 - 18-24
 - 25-35
 - 36-50
 - +50

Your query should return a DataFrame that contains the following columns:

 - age_group, a new column based on the original age column
 - category
 - category_count, a new column containing the desired query output

In [None]:
# Define a custom sorting order for cities
age_order = ["18 - 24", "25 - 35", "36 - 50", "+50"]

# Create a custom sorting column for "age_group"
custom_sort_col = when(col("age_group") == age_order[0], 0) \
    .when(col("age_group") == age_order[1], 1) \
    .when(col("age_group") == age_order[2], 2) \
    .otherwise(3)
    
popular_category_age = joined_df \
    .withColumn("age_group", when((joined_df.age >= 18) & (joined_df.age <= 24), "18 - 24")
                .when((joined_df.age >= 25) & (joined_df.age <= 35), "25 - 35")
                .when((joined_df.age >= 36) & (joined_df.age <= 50), "36 - 50")
                .otherwise("+50")) \
    .groupBy("age_group", "category") \
    .agg(count("category").alias("category_count")) \
    .orderBy(custom_sort_col, "category_count", ascending=False)

display(popular_category_age)
    

age_group,category,category_count
+50,education,2
+50,beauty,2
+50,event-planning,1
+50,art,1
36 - 50,diy-and-crafts,3
36 - 50,vehicles,2
36 - 50,tattoos,1
36 - 50,finance,1
36 - 50,education,1
36 - 50,beauty,1


What is the median follower count for users in the following age groups:

 - 18-24
 - 25-35
 - 36-50
 - +50

Your query should return a DataFrame that contains the following columns:

 - age_group, a new column based on the original age column
 - median_follower_count, a new column containing the desired query output

In [None]:
median_followers_count = joined_df \
    .withColumn("age_group", when((joined_df.age >= 18) & (joined_df.age <= 24), '18 - 24')
                .when((joined_df.age >= 25) & (joined_df.age <= 35), '25 - 35')
                .when((joined_df.age >= 36) & (joined_df.age <= 50), '36 - 50')
                .otherwise('+50')) \
    .groupBy("age_group") \
    .agg(percentile_approx("follower_count", 0.5).alias("median_follower_count")) \
    .orderBy("median_follower_count", ascending=False)

display(median_followers_count )

age_group,median_follower_count
18 - 24,110000
25 - 35,42000
36 - 50,7000
+50,5000


Find how many users have joined between 2015 and 2020.


Your query should return a DataFrame that contains the following columns:

 - post_year, a new column that contains only the year from the timestamp column
 - number_users_joined, a new column containing the desired query output

In [None]:
users_year_joined = joined_df \
    .withColumn("join_year", year('date_joined')) \
    .groupBy("join_year") \
    .agg(count("user_name").alias("numbers_users_joined")) \
    .orderBy("numbers_users_joined", ascending=False)

display(users_year_joined)

join_year,numbers_users_joined
2016,40
2015,36
2017,12


Find the median follower count of users have joined between 2015 and 2020.


Your query should return a DataFrame that contains the following columns:

 - post_year, a new column that contains only the year from the timestamp column
 - median_follower_count, a new column containing the desired query output

In [None]:
med_follower_year = joined_df \
    .withColumn("join_year", year('date_joined')) \
    .groupBy("join_year") \
    .agg(percentile_approx("follower_count", 0.5).alias("median_follower_count")) \
    .orderBy("median_follower_count", ascending=False)

display(med_follower_year)

join_year,median_follower_count
2015,85000
2016,27000
2017,5000


Find the median follower count of users that have joined between 2015 and 2020, based on which age group they are part of.


Your query should return a DataFrame that contains the following columns:

 - age_group, a new column based on the original age column
 - post_year, a new column that contains only the year from the timestamp column
 - median_follower_count, a new column containing the desired query output

In [None]:
med_follower_year_agegroup = joined_df \
    .withColumn("age_group", when((joined_df.age >= 18) & (joined_df.age <= 24), '18 - 24')
            .when((joined_df.age >= 25) & (joined_df.age <= 35), '25 - 35')
            .when((joined_df.age >= 36) & (joined_df.age <= 50), '36 - 50')
            .otherwise('+50')) \
    .withColumn("join_year", year('date_joined')) \
    .groupBy("age_group","join_year") \
    .agg(percentile_approx("follower_count", 0.5).alias("median_follower_count")) \
    .orderBy(custom_sort_col, "median_follower_count")

display(med_follower_year_agegroup)

age_group,join_year,median_follower_count
18 - 24,2017,940
18 - 24,2016,46000
18 - 24,2015,211000
25 - 35,2016,27000
25 - 35,2015,42000
25 - 35,2017,112000
36 - 50,2017,314
36 - 50,2016,9000
36 - 50,2015,25000
+50,2017,5000
