In [0]:
def clean_pin_data(df):
    """
    Cleans the pin data table.

    Steps applied:
    - follower_count: a trailing "k" is expanded by substituting "000" and a
      trailing "M" by substituting "000000"; the "User Info Error"
      placeholder becomes null; the result is cast to integer
    - save_location: the "Local save in " text is stripped out
    - title, description, image_src, tag_list, poster_name: each column's
      "no data" placeholder text is replaced with null
    - the "index" column is renamed to "ind"
    - columns are selected in a fixed order and duplicate rows are dropped

    Returns: cleaned Spark dataframe
    """
    # Placeholder text that each column uses to signal missing information.
    missing_markers = {
        "title": "No Title Data Available",
        "description": "No description available Story format",
        "image_src": "Image src error.",
        "tag_list": "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",
        "poster_name": "User Info Error",
    }
    output_columns = [
        "ind",
        "unique_id",
        "title",
        "description",
        "follower_count",
        "poster_name",
        "tag_list",
        "is_image_or_video",
        "image_src",
        "save_location",
        "category",
    ]

    # Expand the letter abbreviations as text first, then cast the whole
    # expression to integer in the same column assignment.
    raw_count = df["follower_count"]
    expanded_count = (
        when(raw_count.endswith("k"), regexp_replace(raw_count, "k", "000"))
        .when(raw_count.endswith("M"), regexp_replace(raw_count, "M", "000000"))
        .when(raw_count == "User Info Error", None)
        .otherwise(raw_count)
    )
    cleaned = df.withColumn("follower_count", expanded_count.cast("integer"))

    cleaned = cleaned.withColumn(
        "save_location",
        regexp_replace("save_location", "Local save in ", ""),
    )

    # Null out every column whose value is exactly its placeholder text.
    for column_name, marker in missing_markers.items():
        current = cleaned[column_name]
        cleaned = cleaned.withColumn(
            column_name,
            when(current == marker, None).otherwise(current),
        )

    return (
        cleaned.withColumnRenamed("index", "ind")
        .select(*output_columns)
        .dropDuplicates()
    )

In [0]:
def clean_geo_data(df):
    """
    Cleans the geo data table.

    Steps applied:
    - builds a "coordinates" array column from "latitude" and "longitude",
      then drops the two source columns
    - casts the "timestamp" column to timestamp type
    - drops duplicate rows

    Returns: cleaned Spark dataframe
    """
    # Combine the two coordinate columns into one and discard the originals.
    with_coordinates = df.withColumn(
        "coordinates", array("latitude", "longitude")
    ).drop("latitude", "longitude")

    typed = with_coordinates.withColumn(
        "timestamp", with_coordinates["timestamp"].cast("timestamp")
    )

    return typed.dropDuplicates()

In [0]:
def clean_user_data(df):
    """
    Cleans the user data table.

    Steps applied:
    - joins "first_name" and "last_name" with a single space into a new
      "user_name" column, then drops the two source columns
    - casts "date_joined" to timestamp type
    - keeps only ind, user_name, age and date_joined, in that order
    - drops duplicate rows

    Returns: cleaned Spark dataframe
    """
    output_columns = ("ind", "user_name", "age", "date_joined")
    full_name = concat_ws(" ", "first_name", "last_name")

    with_name = df.withColumn("user_name", full_name).drop(
        "first_name", "last_name"
    )
    with_dates = with_name.withColumn(
        "date_joined", with_name["date_joined"].cast("timestamp")
    )

    return with_dates.select(*output_columns).dropDuplicates()