In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import *

In [0]:
# Clean df_pin
def clean_df_pin(df_pin: DataFrame) -> DataFrame:
    """
    Cleans the pin DataFrame: nulls out placeholder values, normalises
    follower counts, casts numeric columns, trims save locations, renames
    'index' to 'ind', de-duplicates and reorders the columns.

    Parameters:
    df_pin (DataFrame): Spark DataFrame to be cleaned.

    Returns:
    DataFrame: Cleaned Spark DataFrame containing only the columns listed in
    `column_order` below, with one row per ('ind', 'category') pair.
    """
    # Function-scope import: explicit namespace so that `round` below is
    # unambiguously Spark's column function and not Python's builtin.
    from pyspark.sql import functions as F

    # Replace empty or irrelevant entries with None.
    # A single select is used instead of one withColumn per column, because
    # every withColumn call adds another projection to the query plan.
    # NULL inputs need no explicit check: `isin` is not true for NULL, so the
    # row falls through to `otherwise`, which returns the (NULL) value as is.
    markers_to_replace = ["", "No description available", "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e"]
    df_pin = df_pin.select([
        F.when(F.col(column_name).isin(markers_to_replace), None)
        .otherwise(F.col(column_name))
        .alias(column_name)
        for column_name in df_pin.columns
    ])

    # Clean and convert follower_count to integer: a "k" suffix multiplies by
    # 1,000 and an "M" suffix by 1,000,000. The arithmetic is done in double
    # precision and rounded before the integer cast so that binary
    # floating-point error cannot truncate the result to one less than intended.
    raw_followers = F.col("follower_count")
    followers_as_double = (
        F.when(raw_followers.contains("k"),
               F.regexp_replace(raw_followers, "k", "").cast("double") * 1000)
        .when(raw_followers.contains("M"),
              F.regexp_replace(raw_followers, "M", "").cast("double") * 1000000)
        .otherwise(raw_followers.cast("double"))
    )
    df_pin = df_pin.withColumn("follower_count", F.round(followers_as_double).cast("int"))

    # Ensure 'downloaded' and 'index' columns are numeric
    df_pin = df_pin.withColumn("downloaded", F.col("downloaded").cast("int"))
    df_pin = df_pin.withColumn("index", F.col("index").cast("int"))

    # Clean 'save_location': remove any "http(s)://<host>/" substring
    df_pin = df_pin.withColumn(
        "save_location",
        F.regexp_replace(F.col("save_location"), r"https?://[^/]+/", ""),
    )

    # Rename 'index' column to 'ind'
    df_pin = df_pin.withColumnRenamed("index", "ind")

    # Remove duplicates, keeping one row per ('ind', 'category') pair
    df_pin = df_pin.dropDuplicates(["ind", "category"])

    # Reorder the DataFrame columns; columns not listed here are dropped
    column_order = ["ind", "unique_id", "title", "description", "follower_count",
                    "poster_name", "tag_list", "is_image_or_video", "image_src",
                    "save_location", "category"]
    df_pin = df_pin.select(column_order)

    return df_pin


In [0]:
# Clean df_geo
def clean_df_geo(df_geo: DataFrame) -> DataFrame:
    """
    Cleans the geo DataFrame.

    Combines 'latitude' and 'longitude' into a 'coordinates' array column,
    converts 'timestamp' with to_timestamp, casts 'ind' to int, removes
    duplicates on ('ind', 'country') and returns the columns in a fixed order.

    Parameters:
    df_geo (DataFrame): Spark DataFrame to be cleaned.

    Returns:
    DataFrame: Cleaned Spark DataFrame with the columns
    'ind', 'country', 'coordinates', 'timestamp'.
    """
    final_columns = ["ind", "country", "coordinates", "timestamp"]
    dedup_keys = ["ind", "country"]

    # [latitude, longitude] packed into a single array column
    lat_lon = array(col("latitude"), col("longitude"))

    deduplicated = (
        df_geo
        .withColumn("coordinates", lat_lon)
        # the separate source columns are no longer needed
        .drop("latitude", "longitude")
        .withColumn("timestamp", to_timestamp(col("timestamp")))
        .withColumn("ind", col("ind").cast("int"))
        .dropDuplicates(dedup_keys)
    )

    return deduplicated.select(final_columns)


In [0]:
def clean_df_geo(df_geo: DataFrame) -> DataFrame:
    """
    Cleans the geo DataFrame by creating a coordinates array, formatting the
    timestamp, casting 'ind' to int and removing duplicate rows.

    Parameters:
    df_geo (DataFrame): Spark DataFrame to be cleaned.

    Returns:
    DataFrame: Cleaned Spark DataFrame with the columns
    'ind', 'country', 'coordinates', 'timestamp' and one row per
    ('ind', 'country') pair.
    """
    # Create 'coordinates' array column
    df_geo = df_geo.withColumn("coordinates", array(col("latitude"), col("longitude")))

    # Drop latitude and longitude columns
    df_geo = df_geo.drop("latitude", "longitude")

    # Convert 'timestamp' to a proper timestamp data type
    df_geo = df_geo.withColumn("timestamp", to_timestamp(col("timestamp")))

    # Ensure 'ind' is numeric
    df_geo = df_geo.withColumn("ind", col("ind").cast("int"))

    # Remove duplicates. This definition replaces any earlier clean_df_geo
    # once its cell runs, so it must perform the de-duplication itself.
    df_geo = df_geo.dropDuplicates(["ind", "country"])

    # Reorder the DataFrame columns
    column_order = ["ind", "country", "coordinates", "timestamp"]
    df_geo = df_geo.select(column_order)

    return df_geo


In [0]:
def clean_df_user(df_user: DataFrame) -> DataFrame:
    """
    Cleans the user DataFrame.

    Joins 'first_name' and 'last_name' with a single space into 'user_name',
    converts 'date_joined' with to_timestamp, casts 'ind' and 'age' to int
    and returns the columns in a fixed order.

    Parameters:
    df_user (DataFrame): Spark DataFrame to be cleaned.

    Returns:
    DataFrame: Cleaned Spark DataFrame with the columns
    'ind', 'user_name', 'age', 'date_joined'.
    """
    final_columns = ["ind", "user_name", "age", "date_joined"]

    # "<first_name> <last_name>"
    full_name = concat_ws(" ", col("first_name"), col("last_name"))

    tidied = (
        df_user
        .withColumn("user_name", full_name)
        # the name parts are redundant once 'user_name' exists
        .drop("first_name", "last_name")
        .withColumn("date_joined", to_timestamp(col("date_joined")))
        .withColumn("ind", col("ind").cast("int"))
        .withColumn("age", col("age").cast("int"))
    )

    return tidied.select(final_columns)