In [0]:
# URL processing
import urllib
import urllib.parse
# pyspark functions
from pyspark.sql.functions import *

In [0]:
# Reader settings for the credentials file: CSV format, first row used as
# the header, comma as the field delimiter.
file_type = "csv"
first_row_is_header = "true"
delimiter = ","
# Load the credentials CSV into a Spark DataFrame
aws_keys_df = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load("/FileStore/tables/authentication_credentials.csv")
)

In [0]:
# Fetch the 'databricks-user' credentials row once, selecting both key
# columns together instead of running one Spark job per key.
credential_rows = (
    aws_keys_df
    .where(col('User name') == 'databricks-user')
    .select('Access key ID', 'Secret access key')
    .collect()
)
# Fail with an explicit message rather than a bare IndexError when the
# credentials file has no row for this user.
if not credential_rows:
    raise LookupError("No 'databricks-user' row found in the credentials file")
ACCESS_KEY = credential_rows[0]['Access key ID']
SECRET_KEY = credential_rows[0]['Secret access key']
# Percent-encode the secret key; safe="" means '/' is encoded as well,
# so the key can be embedded in a URL.
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [0]:
# AWS S3 bucket name
AWS_S3_BUCKET = "user-12d4ce482aeb-bucket"
# Mount name for the bucket
MOUNT_NAME = "/mnt/aws_data"
# Source URL with the access key and the URL-encoded secret key embedded.
# NOTE(review): this puts the credentials in a plain string; confirm that
# is acceptable here, or move to a secret scope / instance profile.
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
# Mount the bucket only when the mount point is not already in use, so the
# cell can be re-run without failing on an existing mount.
existing_mount_points = {mount.mountPoint for mount in dbutils.fs.mounts()}
if MOUNT_NAME not in existing_mount_points:
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

In [0]:
# List the contents of the topics folder under the /mnt/aws_data mount
display(dbutils.fs.ls("/mnt/aws_data/topics"))

path,name,size,modificationTime
dbfs:/mnt/aws_data/topics/12d4ce482aeb.geo/,12d4ce482aeb.geo/,0,1697553082529
dbfs:/mnt/aws_data/topics/12d4ce482aeb.pin/,12d4ce482aeb.pin/,0,1697553082529
dbfs:/mnt/aws_data/topics/12d4ce482aeb.user/,12d4ce482aeb.user/,0,1697553082529


In [0]:
# Read the JSON files under partition=0 of the geo, pin and user topic
# folders, producing one DataFrame per topic (in that order).
df_geo, df_pin, df_user = [
    spark.read.json(topic_path)
    for topic_path in (
        "/mnt/aws_data/topics/12d4ce482aeb.geo/partition=0/",
        "/mnt/aws_data/topics/12d4ce482aeb.pin/partition=0/",
        "/mnt/aws_data/topics/12d4ce482aeb.user/partition=0/",
    )
]

In [0]:
def get_missing_vals(df):
    """Print the number of null values in every column of ``df``.

    One line is printed per column, in schema order, formatted as
    ``<column name> <data type>: <null count>``. All counts are computed
    in a single aggregation instead of one filter-and-count job per column.

    Args:
        df: Spark DataFrame to inspect.
    """
    # Explicit alias so the function does not depend on star-imported names.
    from pyspark.sql import functions as F

    fields = df.schema.fields
    if not fields:
        # No columns, nothing to report.
        return

    # when() without otherwise() yields null for rows that do not match and
    # count() skips nulls, so each expression counts only the null rows.
    null_counts = df.select(
        [F.count(F.when(df[field.name].isNull(), 1)).alias(field.name) for field in fields]
    ).collect()[0]

    for field, null_count in zip(fields, null_counts):
        print(f'{field.name} {(field.dataType)}: {null_count}')


get_missing_vals(df_pin)

In [0]:
from pyspark.sql.functions import col,when

def replace_invalid_data(df,col_name,value_to_replace):
    """Null out the values of one column that match a LIKE pattern.

    Args:
        df: Spark DataFrame to clean.
        col_name: Name of the column to rewrite.
        value_to_replace: Pattern passed to ``Column.like``; values that
            match it are replaced with null.

    Returns:
        A DataFrame in which ``col_name`` is null wherever it matched
        ``value_to_replace`` and keeps its value everywhere else.
    """
    target = col(col_name)
    is_placeholder = target.like(value_to_replace)
    cleaned = when(is_placeholder, None).otherwise(target)
    return df.withColumn(col_name, cleaned)

# Placeholder text to null out, keyed by column name. Each value is used as
# a LIKE pattern by replace_invalid_data, so the trailing '%' on the
# description entry matches any suffix.
obj = dict(
    description="No description available%",
    follower_count="User Info Error",
    image_src="Image src error.",
    poster_name="User Info Error",
    tag_list="N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",
    title="No Title Data Available",
)

# Apply the replacement to df_pin one column at a time
for column_name, placeholder in obj.items():
    df_pin = replace_invalid_data(df_pin, column_name, placeholder)

In [0]:
from pyspark.sql.functions import regexp_replace, regexp_extract
# Final column order for the pin table
new_pin_column_order = [
    "ind",
    "unique_id",
    "title",
    "description",
    "follower_count",
    "poster_name",
    "tag_list",
    "is_image_or_video",
    "image_src",
    "save_location",
    "category",
]

df_pin = (
    df_pin
    # Expand the "k" / "M" abbreviations into zeros, then cast to int.
    # NOTE(review): every "k"/"M" in the value is substituted, so "1.5k"
    # becomes "1.5000" -- assumes the counts are whole numbers; confirm.
    .withColumn("follower_count", regexp_replace("follower_count", "k", "000"))
    .withColumn("follower_count", regexp_replace("follower_count", "M", "000000"))
    .withColumn("follower_count", col("follower_count").cast("int"))
    # Keep only the text that follows the word "in " in save_location
    .withColumn("save_location", regexp_extract("save_location", r'(?<=\bin\s)(.*)', 1))
    .withColumnRenamed("index", "ind")
    .select(*new_pin_column_order)
)

In [0]:
# Print the schema of df_pin to inspect its column names and types
df_pin.printSchema()