In [None]:
from pyspark.sql.functions import udf, col, when, lit, trim, array, concat_ws, to_timestamp
from pyspark.sql.types import IntegerType
from pyspark.sql import DataFrame

In [None]:
# Glob template for the JSON files of one table; the `{}` placeholder is
# filled with the table name by `read_from_s3` via `str.format`.
file_location = "/mnt/126ca3664fbb-mount/topics/126ca3664fbb.{}/partition=0/*.json"

def read_from_s3(
    table_name: str,
    infer_schema: str = "true",  # forwarded to the reader's "inferSchema" option
) -> DataFrame:
    '''
    Loads the JSON files for `table_name` from the S3 bucket.

    `table_name` is substituted into the module-level `file_location`
    template and the resulting path is read with the notebook's `spark`
    session. Returns the loaded spark DataFrame.
    '''
    # NOTE(review): "inferSchema" is documented as a CSV option; the JSON
    # reader appears to infer the schema regardless - confirm it has an effect.
    json_reader = spark.read.format("json").option("inferSchema", infer_schema)
    source_path = file_location.format(table_name)
    return json_reader.load(source_path)

In [None]:

def none_empty_str(df: DataFrame) -> DataFrame:
    '''
    Replaces blank string values with `None` in every column.

    A value is treated as blank when it equals '' after trimming;
    all other values are kept as they are.
    '''
    def blank_to_null(name):
        # Build the per-column expression, keeping the original column name.
        is_blank = trim(col(name)) == ""
        return when(is_blank, lit(None)).otherwise(col(name)).alias(name)

    cleaned_columns = [blank_to_null(name) for name in df.columns]
    return df.select(cleaned_columns)

def convert_followers_int(df: DataFrame) -> DataFrame:
    '''
    Converts the `follower_count` column to integers.

    String values may carry a 'k' (thousand) or 'M' (million) suffix,
    e.g. '12k' -> 12000. Values that cannot be parsed, and nulls,
    become `None`. Returns the DataFrame with the column replaced.
    '''

    def convert_str_int(value: str | int | None) -> int | None:
        '''
        Subfunction convert string numbers to ints,
         including those with 'k' or 'M' units.
        '''
        # Spark does not short-circuit Python UDFs inside `when(...)`, so
        # nulls can still reach this function; handle them here instead of
        # relying on an outer `isNotNull` guard.
        if value is None:
            return None
        if isinstance(value, int):
            return value
        try:
            text = value.strip()
            if text.endswith('k'):
                return int(float(text[:-1]) * 1000)
            if text.endswith('M'):
                return int(float(text[:-1]) * 1000000)
            return int(text)
        except (ValueError, AttributeError, OverflowError):
            # ValueError: not a number; AttributeError: value is not a
            # string; OverflowError: e.g. 'infk'. All map to null rather
            # than failing the whole Spark task.
            return None

    convert_str_int_udf = udf(convert_str_int, IntegerType())

    # Null handling lives inside the UDF, so nulls still map to null here.
    return df.withColumn(
        "follower_count",
        convert_str_int_udf(col("follower_count"))
    )

def rename_index(df: DataFrame) -> DataFrame:
    '''
    Returns the DataFrame with its `index` column renamed to `ind`.
    '''
    return df.withColumnRenamed("index", "ind")

def reorder_pin_cols(df: DataFrame) -> DataFrame:
    '''
    Selects the `pin` table columns in their final order.

    Only the columns listed below are kept.
    '''
    pin_column_order = [
        "ind",
        "unique_id",
        "title",
        "description",
        "follower_count",
        "poster_name",
        "tag_list",
        "is_image_or_video",
        "image_src",
        "save_location",
        "category",
        "downloaded",
    ]
    return df.select(*pin_column_order)

def clean_pin_table(df: DataFrame) -> DataFrame:
    '''
    Runs the full cleaning pipeline for the `pin` table.

    Applies, in order: blank-string cleanup, follower count conversion,
    index rename and column reordering.
    '''
    cleaning_steps = (
        none_empty_str,
        convert_followers_int,
        rename_index,
        reorder_pin_cols,
    )
    for step in cleaning_steps:
        df = step(df)
    return df

In [None]:

def make_lat_long(df: DataFrame) -> DataFrame:
    '''
    Adds a `coordinates` array column built from `latitude` and
    `longitude` (in that order), then drops the two source columns.
    '''
    coordinates = array(col("latitude"), col("longitude"))
    with_coordinates = df.withColumn("coordinates", coordinates)
    return with_coordinates.drop("latitude", "longitude")

def convert_str_datetime(df: DataFrame, col: str) -> DataFrame:
    '''
    Parses the string column named `col` into a timestamp, in place.

    Values are parsed with the pattern yyyy-MM-dd'T'HH:mm:ss.
    '''
    # Note: the `col` parameter is a column *name* and shadows the imported
    # `col` function inside this function.
    timestamp_pattern = "yyyy-MM-dd'T'HH:mm:ss"
    parsed = to_timestamp(col, timestamp_pattern)
    return df.withColumn(col, parsed)

def reorder_geo_cols(df: DataFrame) -> DataFrame:
    '''
    Selects the `geo` table columns in their final order.
    '''
    geo_column_order = ["ind", "country", "coordinates", "timestamp"]
    return df.select(*geo_column_order)

def clean_geo_table(df: DataFrame) -> DataFrame:
    '''
    Runs the full cleaning pipeline for the `geo` table.

    Applies, in order: coordinate merging, `timestamp` parsing and
    column reordering.
    '''
    with_coordinates = make_lat_long(df)
    with_timestamp = convert_str_datetime(with_coordinates, "timestamp")
    return reorder_geo_cols(with_timestamp)

In [None]:
def make_user_name(df: DataFrame) -> DataFrame:
    '''
    Adds a `user_name` column joining `first_name` and `last_name`
    with a single space, then drops the two source columns.
    '''
    full_name = concat_ws(' ', col('first_name'), col('last_name'))
    with_user_name = df.withColumn('user_name', full_name)
    return with_user_name.drop('first_name', 'last_name')

def reorder_user_cols(df: DataFrame) -> DataFrame:
    '''
    Selects the `user` table columns in their final order.
    '''
    user_column_order = ["ind", "user_name", "age", "date_joined"]
    return df.select(*user_column_order)

def clean_user_table(df: DataFrame) -> DataFrame:
    '''
    Runs the full cleaning pipeline for the `user` table.

    Applies, in order: user name creation, `date_joined` parsing and
    column reordering.
    '''
    with_user_name = make_user_name(df)
    with_join_date = convert_str_datetime(with_user_name, "date_joined")
    return reorder_user_cols(with_join_date)
