In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, when, lit, regexp_replace, trim, 
    regexp_extract, isnan, count, date_format,
    to_date, to_timestamp
)
from pyspark.sql.types import *

def create_spark_session(app_name="DataProcessing"):
    """
    Build a SparkSession for this application, reusing one if it already exists.

    Args:
        app_name (str): Name to register the Spark application under.

    Returns:
        SparkSession: The session handed back by ``getOrCreate()``.
    """
    builder = SparkSession.builder.appName(app_name)
    session = builder.getOrCreate()
    return session

def read_csv_with_schema(spark, file_path, schema):
    """
    Load a CSV file into a DataFrame using an explicit schema.

    The first line of the file is treated as a header row.

    Args:
        spark (SparkSession): Session used to create the reader.
        file_path (str): Location of the CSV file.
        schema (StructType): Schema applied to the file's columns.

    Returns:
        DataFrame: The rows of the CSV file typed according to ``schema``.
    """
    reader = spark.read.format("csv")
    reader = reader.option("header", True)
    reader = reader.schema(schema)
    return reader.load(file_path)

def check_missing_values(df):
    """
    Count the null entries in every column.

    Args:
        df (DataFrame): The DataFrame to inspect.

    Returns:
        DataFrame: A single-row DataFrame whose columns carry the same names
        as ``df`` and hold the number of null values found in each.
    """
    null_counters = []
    for name in df.columns:
        # when() without otherwise() yields null for non-matching rows, and
        # count() skips nulls, so only the null cells are tallied.
        is_missing = col(name).isNull()
        null_counters.append(count(when(is_missing, name)).alias(name))
    return df.select(null_counters)

def handle_missing_values(df, strategy="drop"):
    """
    Handle missing values in a DataFrame.

    Args:
        df (DataFrame): The input DataFrame.
        strategy (str): Strategy to handle missing values:
            'drop' removes every row containing at least one null,
            'retain' returns the DataFrame unchanged.

    Returns:
        DataFrame: The DataFrame after handling missing values.

    Raises:
        ValueError: If ``strategy`` is neither 'drop' nor 'retain'.
    """
    if strategy == "drop":
        return df.dropna()
    if strategy == "retain":
        return df
    # A misspelled strategy used to fall through and silently keep the nulls;
    # fail loudly so the caller notices the mistake.
    raise ValueError(
        f"Unknown strategy {strategy!r}; expected 'drop' or 'retain'."
    )

def check_duplicates(df, id_column):
    """
    Count how many rows repeat a value already seen in ``id_column``.

    Args:
        df (DataFrame): The DataFrame to inspect.
        id_column (str): Column whose values identify a row.

    Returns:
        int: Total row count minus the row count after de-duplicating on
        ``id_column``.
    """
    total_rows = df.count()
    unique_rows = df.dropDuplicates([id_column]).count()
    return total_rows - unique_rows

def transform_datetime_columns(df, timestamp_columns):
    """
    Split timestamp columns into date and time components.

    For every name ``c`` in ``timestamp_columns`` two columns are added:
    ``date_c`` (the calendar date) and ``time_c`` (the time of day formatted
    as ``HH:mm:ss``). The original column is left in place.

    Args:
        df (DataFrame): The input DataFrame.
        timestamp_columns (list): List of column names containing timestamps.

    Returns:
        DataFrame: The DataFrame with split date and time components.
    """
    for col_name in timestamp_columns:
        source = col(col_name)
        # No parse pattern is passed to to_date(): for a TimestampType column
        # a pattern is not needed, and for a string column holding a full
        # timestamp a date-only pattern such as "yyyy-MM-dd" does not match
        # the value. Without a pattern Spark applies its standard cast-to-date
        # rules, which cover both cases.
        df = df.withColumn(f"date_{col_name}", to_date(source))
        df = df.withColumn(f"time_{col_name}", date_format(source, "HH:mm:ss"))
    return df

def clean_price_column(df):
    """
    Clean and validate the price column.

    The textual ``price`` is trimmed, a leading ``$`` is removed, thousands
    separators are stripped (``"$1,250.00"`` -> ``"1250.00"``), and the
    leading numeric part is converted to a double. Rows whose price cannot be
    parsed, or is outside the range (0, 10000], are dropped.

    Args:
        df (DataFrame): The input DataFrame; must contain a ``price`` column.

    Returns:
        DataFrame: The DataFrame with a cleaned and validated price column.
    """
    trimmed = trim(col("price"))
    without_symbol = regexp_replace(trimmed, r"^\$", "")
    # Drop commas only where they act as digit-group separators (a digit
    # before, exactly three digits after). Without this step "1,250.00" was
    # cut at the comma and silently stored as a price of 1.0.
    ungrouped = regexp_replace(without_symbol, r"(?<=\d),(?=\d{3}(?!\d))", "")
    # Keep the leading number; regexp_extract yields "" when nothing matches.
    numeric_text = regexp_extract(ungrouped, r"^\d+(\.\d+)?", 0)
    # when() without otherwise() leaves non-numeric text as null.
    parsed_price = when(
        numeric_text.rlike(r"^\d+(\.\d+)?$"),
        numeric_text.cast("double"),
    )
    return (
        df.withColumn("price", parsed_price)
        .filter(col("price").isNotNull())
        .filter((col("price") > 0) & (col("price") <= 10000))
    )

def validate_listings(df):
    """
    Keep only listings whose minimum stay is plausible.

    Args:
        df (DataFrame): Listings with a ``minimum_nights`` column.

    Returns:
        DataFrame: Rows where ``minimum_nights`` is greater than 0 and at
        most 365.
    """
    nights = col("minimum_nights")
    within_range = (nights > 0) & (nights <= 365)
    return df.filter(within_range)

def convert_superhost_to_boolean(df):
    """
    Replace the textual 'is_superhost' flag with a boolean.

    Args:
        df (DataFrame): Hosts with an ``is_superhost`` column.

    Returns:
        DataFrame: The same frame where ``is_superhost`` is True for "t",
        False for "f" and null for anything else.
    """
    flag = col("is_superhost")
    as_boolean = when(flag == "t", True).when(flag == "f", False).otherwise(None)
    return df.withColumn("is_superhost", as_boolean)

def impute_text_columns(df, columns, default_value="Unknown"):
    """
    Fill null or empty-string cells of the given text columns.

    Args:
        df (DataFrame): The DataFrame to fill.
        columns (list): Names of the columns to process.
        default_value (str): Replacement written into null / "" cells.

    Returns:
        DataFrame: The frame with every listed column imputed.
    """
    result = df
    for name in columns:
        current = col(name)
        is_blank = current.isNull() | (current == "")
        filled = when(is_blank, lit(default_value)).otherwise(current)
        result = result.withColumn(name, filled)
    return result

def process_hosts_data(spark, file_path):
    """
    Load the hosts CSV and run it through the host cleaning steps.

    Steps, in order: read with a fixed schema, handle missing values with the
    default strategy, convert ``is_superhost`` to boolean, and split
    ``created_at`` / ``updated_at`` into date and time columns.

    Args:
        spark (SparkSession): The Spark session.
        file_path (str): Path to the hosts CSV file.

    Returns:
        DataFrame: The processed host data.
    """
    field_types = [
        ("id", StringType()),
        ("name", StringType()),
        ("is_superhost", StringType()),
        ("created_at", TimestampType()),
        ("updated_at", TimestampType()),
    ]
    hosts_schema = StructType(
        [StructField(name, dtype, True) for name, dtype in field_types]
    )

    raw_hosts = read_csv_with_schema(spark, file_path, hosts_schema)
    complete_hosts = handle_missing_values(raw_hosts)
    typed_hosts = convert_superhost_to_boolean(complete_hosts)
    return transform_datetime_columns(typed_hosts, ["created_at", "updated_at"])

def process_listings_data(spark, file_path):
    """
    Load the listings CSV and run it through the listing cleaning steps.

    Steps, in order: read with a fixed schema, handle missing values with the
    default strategy, clean the price column, filter on minimum nights, and
    split ``created_at`` / ``updated_at`` into date and time columns.

    Args:
        spark (SparkSession): The Spark session.
        file_path (str): Path to the listings CSV file.

    Returns:
        DataFrame: The processed listing data.
    """
    field_types = [
        ("id", StringType()),
        ("listing_url", StringType()),
        ("name", StringType()),
        ("room_type", StringType()),
        ("minimum_nights", IntegerType()),
        ("host_id", StringType()),
        ("price", StringType()),
        ("created_at", TimestampType()),
        ("updated_at", TimestampType()),
    ]
    listings_schema = StructType(
        [StructField(name, dtype, True) for name, dtype in field_types]
    )

    raw_listings = read_csv_with_schema(spark, file_path, listings_schema)
    complete_listings = handle_missing_values(raw_listings)
    priced_listings = clean_price_column(complete_listings)
    valid_listings = validate_listings(priced_listings)
    return transform_datetime_columns(
        valid_listings, ["created_at", "updated_at"]
    )

def process_reviews_data(spark, file_path):
    """
    Load the reviews CSV and run it through the review cleaning steps.

    Steps, in order: read with a fixed schema, impute the ``comments`` and
    ``sentiment`` columns with the default placeholder, and split ``date``
    into date and time columns.

    Args:
        spark (SparkSession): The Spark session.
        file_path (str): Path to the reviews CSV file.

    Returns:
        DataFrame: The processed review data.
    """
    field_types = [
        ("listing_id", StringType()),
        ("date", TimestampType()),
        ("reviewer_name", StringType()),
        ("comments", StringType()),
        ("sentiment", StringType()),
    ]
    reviews_schema = StructType(
        [StructField(name, dtype, True) for name, dtype in field_types]
    )

    raw_reviews = read_csv_with_schema(spark, file_path, reviews_schema)
    imputed_reviews = impute_text_columns(raw_reviews, ["comments", "sentiment"])
    return transform_datetime_columns(imputed_reviews, ["date"])


In [2]:
# Driver cell: start (or reuse) the Spark session, then build the three
# cleaned DataFrames from the CSV files under ../data.
# NOTE(review): in the captured output below, session creation fails with a
# Py4JJavaError while constructing JavaSparkContext; the root cause shown is
# NoSuchMethodException for java.nio.DirectByteBuffer.<init>(long,int).
# Presumably the installed JDK is newer than this Spark build supports -
# confirm the Spark/JDK versions before re-running.
spark = create_spark_session()
# Each loader reads its file with a fixed schema and applies its cleaning steps.
hosts_df = process_hosts_data(spark, "../data/hosts.csv")
listings_df = process_listings_data(spark, "../data/listings.csv")
reviews_df = process_reviews_data(spark, "../data/reviews.csv")

25/01/14 11:15:38 WARN Utils: Your hostname, codespaces-240eb1 resolves to a loopback address: 127.0.0.1; using 10.0.1.51 instead (on interface eth0)
25/01/14 11:15:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/01/14 11:15:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.ExceptionInInitializerError
	at org.apache.spark.unsafe.array.ByteArrayMethods.<clinit>(ByteArrayMethods.java:56)
	at org.apache.spark.memory.MemoryManager.defaultPageSizeBytes$lzycompute(MemoryManager.scala:264)
	at org.apache.spark.memory.MemoryManager.defaultPageSizeBytes(MemoryManager.scala:254)
	at org.apache.spark.memory.MemoryManager.$anonfun$pageSizeBytes$1(MemoryManager.scala:273)
	at scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.memory.MemoryManager.<init>(MemoryManager.scala:273)
	at org.apache.spark.memory.UnifiedMemoryManager.<init>(UnifiedMemoryManager.scala:58)
	at org.apache.spark.memory.UnifiedMemoryManager$.apply(UnifiedMemoryManager.scala:207)
	at org.apache.spark.SparkEnv$.create(SparkEnv.scala:320)
	at org.apache.spark.SparkEnv$.createDriverEnv(SparkEnv.scala:194)
	at org.apache.spark.SparkContext.createSparkEnv(SparkContext.scala:279)
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:464)
	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
	at java.base/jdk.internal.reflect.DirectConstructorHandleAccessor.newInstance(DirectConstructorHandleAccessor.java:62)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:502)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:486)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.lang.IllegalStateException: java.lang.NoSuchMethodException: java.nio.DirectByteBuffer.<init>(long,int)
	at org.apache.spark.unsafe.Platform.<clinit>(Platform.java:113)
	... 25 more
Caused by: java.lang.NoSuchMethodException: java.nio.DirectByteBuffer.<init>(long,int)
	at java.base/java.lang.Class.getConstructor0(Class.java:3761)
	at java.base/java.lang.Class.getDeclaredConstructor(Class.java:2930)
	at org.apache.spark.unsafe.Platform.<clinit>(Platform.java:71)
	... 25 more
