In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit
from pyspark.sql.functions import col, regexp_replace, trim, when, regexp_extract
from pyspark.sql.types import *
from pyspark.sql.functions import col, isnan, when, count ,date_format,to_date,to_timestamp

In [2]:
# Build the Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("DataProcessing")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/01/14 12:11:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Display the active SparkSession object as this cell's output.
spark

In [5]:
# Explicit schema for customers.csv: three nullable string columns
customers_schema = (
    StructType()
    .add("customer_id", StringType(), True)
    .add("customer_name", StringType(), True)
    .add("city", StringType(), True)
)

In [6]:
# Load customers.csv with the explicit schema; the file's first line is a header row
customers_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(customers_schema)
    .load("../data/customers.csv")
)




In [7]:
# Confirm the explicit schema was applied: customers_schema declares all three
# columns as nullable strings.
print("Customers DataFrame Schema:")
customers_df.printSchema()

Customers DataFrame Schema:
root
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)



### Checking the number of rows in the data

In [8]:
# Total number of customer rows; count() is an action, so this reads the CSV.
num_rows = customers_df.count()
print(f"Number of rows: {num_rows}")


Number of rows: 35


### Check for missing values

In [10]:
# Null tally per column: when() is non-null only where the cell is null,
# and count() ignores nulls, so each aggregate equals that column's null count
missing_values = customers_df.select(*[
    count(when(col(name).isNull(), name)).alias(name)
    for name in customers_df.columns
])
missing_values.show()

+-----------+-------------+----+
|customer_id|customer_name|city|
+-----------+-------------+----+
|          0|            0|   0|
+-----------+-------------+----+



### Dropping  null values

In [11]:
# Drop rows that contain a null in any column and keep the result under the
# customers name: the null re-check in the next cell inspects customers_df, so
# binding the cleaned frame to any other name would leave that check (and every
# later customers cell) looking at the uncleaned data.
customers_df = customers_df.dropna()

In [12]:
# Re-run the per-column null tally on customers_df
missing_values = customers_df.select(*[
    count(when(col(name).isNull(), name)).alias(name)
    for name in customers_df.columns
])
missing_values.show()

+-----------+-------------+----+
|customer_id|customer_name|city|
+-----------+-------------+----+
|          0|            0|   0|
+-----------+-------------+----+



In [13]:
# Preview the first five customer rows.
customers_df.show(5)

+-----------+-----------------+---------+
|customer_id|    customer_name|     city|
+-----------+-----------------+---------+
|     789201|        Rel Fresh|    Surat|
|     789202|        Rel Fresh|Ahmedabad|
|     789203|        Rel Fresh| Vadodara|
|     789301|Expression Stores|    Surat|
|     789303|Expression Stores| Vadodara|
+-----------+-----------------+---------+
only showing top 5 rows



### Check for Duplicate values

In [15]:
# Every row beyond the first occurrence of a customer_id counts as a duplicate
duplicate_count = (
    customers_df.count()
    - customers_df.dropDuplicates(["customer_id"]).count()
)

# Report how many duplicate rows were found
print(f"Number of duplicate rows: {duplicate_count}")


Number of duplicate rows: 0


-  There are no duplicate values

### Data Type Validation

In [16]:
# Check each column's data type (customers_schema declares all three as strings).
customers_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)



### Separating  date and time with formatted date

In [14]:
# NOTE(review): this cell needs hosts_df to hold the hosts data with created_at /
# updated_at timestamp columns; no cell shown in this notebook reads a hosts CSV.
# TODO confirm where hosts_df is loaded before relying on Restart & Run All.
hosts_df = (
    hosts_df
    # created_at -> calendar date plus an HH:mm:ss time-of-day string
    .withColumn("date_created", to_date("created_at", "yyyy-MM-dd"))
    .withColumn("time_created", date_format("created_at", "HH:mm:ss"))
    # updated_at -> same split
    .withColumn("date_updated", to_date("updated_at", "yyyy-MM-dd"))
    .withColumn("time_updated", date_format("updated_at", "HH:mm:ss"))
)

In [15]:
# Check the transformed data: source timestamps next to the derived date/time columns.
selected_columns = ['created_at', 'updated_at','date_created','time_created', 'date_updated', 'time_updated']
hosts_df.select(selected_columns).show(5)

+-------------------+-------------------+------------+------------+------------+------------+
|         created_at|         updated_at|date_created|time_created|date_updated|time_updated|
+-------------------+-------------------+------------+------------+------------+------------+
|2014-01-05 16:12:45|2014-01-05 16:12:45|  2014-01-05|    16:12:45|  2014-01-05|    16:12:45|
|2013-07-31 23:29:31|2013-07-31 23:29:31|  2013-07-31|    23:29:31|  2013-07-31|    23:29:31|
|2017-10-17 05:20:28|2017-10-17 05:20:28|  2017-10-17|    05:20:28|  2017-10-17|    05:20:28|
|2009-06-05 21:34:42|2009-06-05 21:34:42|  2009-06-05|    21:34:42|  2009-06-05|    21:34:42|
|2021-10-24 02:42:09|2021-10-24 02:42:09|  2021-10-24|    02:42:09|  2021-10-24|    02:42:09|
+-------------------+-------------------+------------+------------+------------+------------+
only showing top 5 rows



### Converting is_superhost   from String to Boolean 

In [16]:
# Map the "t" / "f" flags to real booleans; any other value becomes null
hosts_df = hosts_df.withColumn(
    "is_superhost",
    when(col("is_superhost") == "t", True)
    .when(col("is_superhost") == "f", False)
    .otherwise(None),
)

In [17]:
# Show five rows without truncating cell contents.
hosts_df.show(5,truncate=False)

+-----+-------+------------+-------------------+-------------------+------------+------------+------------+------------+
|id   |name   |is_superhost|created_at         |updated_at         |date_created|time_created|date_updated|time_updated|
+-----+-------+------------+-------------------+-------------------+------------+------------+------------+------------+
|1581 |Annette|false       |2014-01-05 16:12:45|2014-01-05 16:12:45|2014-01-05  |16:12:45    |2014-01-05  |16:12:45    |
|2164 |Lulah  |true        |2013-07-31 23:29:31|2013-07-31 23:29:31|2013-07-31  |23:29:31    |2013-07-31  |23:29:31    |
|2217 |Ion    |true        |2017-10-17 05:20:28|2017-10-17 05:20:28|2017-10-17  |05:20:28    |2017-10-17  |05:20:28    |
|3718 |Britta |false       |2009-06-05 21:34:42|2009-06-05 21:34:42|2009-06-05  |21:34:42    |2009-06-05  |21:34:42    |
|11622|Maria  |false       |2021-10-24 02:42:09|2021-10-24 02:42:09|2021-10-24  |02:42:09    |2021-10-24  |02:42:09    |
+-----+-------+------------+----

In [18]:
# Print the column types of hosts_df after the transformations above.
hosts_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- is_superhost: boolean (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- date_created: date (nullable = true)
 |-- time_created: string (nullable = true)
 |-- date_updated: date (nullable = true)
 |-- time_updated: string (nullable = true)



###   2. Listings Data Transformations

In [19]:
# Explicit schema for listings.csv. price stays a string here because the raw
# values carry a "$" prefix; it is converted to a double in a later cell.
listings_schema = (
    StructType()
    .add("id", StringType(), True)
    .add("listing_url", StringType(), True)
    .add("name", StringType(), True)
    .add("room_type", StringType(), True)
    .add("minimum_nights", IntegerType(), True)
    .add("host_id", StringType(), True)
    .add("price", StringType(), True)
    .add("created_at", TimestampType(), True)
    .add("updated_at", TimestampType(), True)
)

In [20]:
# Load listings.csv with the explicit schema; the file's first line is a header row
listings_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(listings_schema)
    .load("../data/listings.csv")
)

In [21]:
# Confirm the column types declared in listings_schema were applied on read.
print("Listings DataFrame Schema:")
listings_df.printSchema()

Listings DataFrame Schema:
root
 |-- id: string (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- host_id: string (nullable = true)
 |-- price: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- updated_at: timestamp (nullable = true)



In [22]:
# Preview five listings without truncating cell contents.
listings_df.show(5,truncate=False)

+-----+----------------------------------+-----------------------------------+---------------+--------------+-------+-------+-------------------+-------------------+
|id   |listing_url                       |name                               |room_type      |minimum_nights|host_id|price  |created_at         |updated_at         |
+-----+----------------------------------+-----------------------------------+---------------+--------------+-------+-------+-------------------+-------------------+
|3176 |https://www.airbnb.com/rooms/3176 |Fabulous Flat in great Location    |Entire home/apt|62            |3718   |$90.00 |2009-06-05 21:34:42|2009-06-05 21:34:42|
|7071 |https://www.airbnb.com/rooms/7071 |BrightRoom with sunny greenview!   |Private room   |1             |17391  |$33.00 |2009-08-12 12:30:30|2009-08-12 12:30:30|
|9991 |https://www.airbnb.com/rooms/9991 |Geourgeous flat - outstanding views|Entire home/apt|0             |33852  |$180.00|2015-07-30 05:08:52|2015-07-30 05:08:52|
|143

###  Checking the number of rows in the data

In [23]:
# Total number of listing rows before any cleaning.
num_rows = listings_df.count()
print(f"Number of rows: {num_rows}")

Number of rows: 17499


### Check for missing values

In [24]:
# Null tally per column: when() is non-null only where the cell is null,
# and count() ignores nulls, so each aggregate equals that column's null count
missing_values = listings_df.select(*[
    count(when(col(name).isNull(), name)).alias(name)
    for name in listings_df.columns
])
missing_values.show()

+---+-----------+----+---------+--------------+-------+-----+----------+----------+
| id|listing_url|name|room_type|minimum_nights|host_id|price|created_at|updated_at|
+---+-----------+----+---------+--------------+-------+-----+----------+----------+
|  0|          0|   0|        0|            59|      0|    0|        59|         0|
+---+-----------+----+---------+--------------+-------+-----+----------+----------+



- There are missing values in minimum_nights and created_at
- Since the affected rows are few, we can simply drop them

### Dropping null values

In [25]:
# Drop every row that has a null in any column (dropna() defaults to how="any").
listings_df = listings_df.dropna()

In [26]:
# Re-run the per-column null tally to confirm the drop removed every null
missing_values = listings_df.select(*[
    count(when(col(name).isNull(), name)).alias(name)
    for name in listings_df.columns
])
missing_values.show()

+---+-----------+----+---------+--------------+-------+-----+----------+----------+
| id|listing_url|name|room_type|minimum_nights|host_id|price|created_at|updated_at|
+---+-----------+----+---------+--------------+-------+-----+----------+----------+
|  0|          0|   0|        0|             0|      0|    0|         0|         0|
+---+-----------+----+---------+--------------+-------+-----+----------+----------+



- All the missing values have been dropped

###  Checking for duplicates based on id

In [27]:
# Every row beyond the first occurrence of a listing id counts as a duplicate
duplicate_count = (
    listings_df.count()
    - listings_df.dropDuplicates(["id"]).count()
)

# Report how many duplicate rows were found
print(f"Number of duplicate rows: {duplicate_count}")


Number of duplicate rows: 0


- There are no duplicate values

### Filtering  out invalid data

- Removing the `$` sign from price and converting it to a double data type
- Excluding rows with invalid or nonsensical values, e.g. where price is zero or negative,
  or where minimum nights is zero or greater than a year (365 days)

In [28]:
# Clean and convert the price column, then filter out invalid rows.
# Raw prices are strings such as "$90.00"; values of one thousand or more may
# carry thousands separators (e.g. "$1,200.00"). Those commas must be removed
# before the numeric part is extracted, otherwise the extract pattern stops at
# the first comma and "$1,200.00" is silently turned into 1.0.
listings_df = (
    listings_df
    .withColumn("price",
        regexp_extract(
            regexp_replace(
                regexp_replace(
                    trim(col("price")),
                    "^\\$", ""  # Remove leading $ sign
                ),
                ",(?=\\d{3}(?:\\D|$))", ""  # Remove commas that separate groups of exactly three digits
            ),
            "^\\d+(\\.\\d+)?", 0  # Extract numeric part ("" when there is none)
        )
    )
    .withColumn("price",
        when(
            col("price").rlike("^\\d+(\\.\\d+)?$"),  # Validate numeric format before casting
            col("price").cast("double")
        ).otherwise(None)  # Non-numeric entries become null
    )
    .filter(col("price").isNotNull())
    .filter((col("price") > 0) & (col("price") <= 10000))  # Optional price range filter
    .filter((col("minimum_nights") > 0) & (col("minimum_nights") <= 365))
)

In [29]:
# Preview five cleaned listings without truncating cell contents.
listings_df.show(5,truncate=False)

+-----+----------------------------------+---------------------------------------------+---------------+--------------+-------+-----+-------------------+-------------------+
|id   |listing_url                       |name                                         |room_type      |minimum_nights|host_id|price|created_at         |updated_at         |
+-----+----------------------------------+---------------------------------------------+---------------+--------------+-------+-----+-------------------+-------------------+
|3176 |https://www.airbnb.com/rooms/3176 |Fabulous Flat in great Location              |Entire home/apt|62            |3718   |90.0 |2009-06-05 21:34:42|2009-06-05 21:34:42|
|7071 |https://www.airbnb.com/rooms/7071 |BrightRoom with sunny greenview!             |Private room   |1             |17391  |33.0 |2009-08-12 12:30:30|2009-08-12 12:30:30|
|14325|https://www.airbnb.com/rooms/14325|Apartment in Prenzlauer Berg                 |Entire home/apt|95            |55531  |70.

In [30]:
# Print the column types after cleaning (price is cast to double in the cell above).
listings_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- host_id: string (nullable = true)
 |-- price: double (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- updated_at: timestamp (nullable = true)



### Data Type Validation

### Separating  date and time with formatted date

In [31]:
# Split each timestamp into a calendar date and an HH:mm:ss time-of-day string
listings_df = (
    listings_df
    # created_at -> date_created / time_created
    .withColumn("date_created", to_date("created_at", "yyyy-MM-dd"))
    .withColumn("time_created", date_format("created_at", "HH:mm:ss"))
    # updated_at -> date_updated / time_updated
    .withColumn("date_updated", to_date("updated_at", "yyyy-MM-dd"))
    .withColumn("time_updated", date_format("updated_at", "HH:mm:ss"))
)

In [32]:
# Compare the source timestamps with the derived date/time columns.
selected_columns = ['created_at', 'updated_at','date_created','time_created', 'date_updated', 'time_updated']
listings_df.select(selected_columns).show(5,truncate=False)

+-------------------+-------------------+------------+------------+------------+------------+
|created_at         |updated_at         |date_created|time_created|date_updated|time_updated|
+-------------------+-------------------+------------+------------+------------+------------+
|2009-06-05 21:34:42|2009-06-05 21:34:42|2009-06-05  |21:34:42    |2009-06-05  |21:34:42    |
|2009-08-12 12:30:30|2009-08-12 12:30:30|2009-08-12  |12:30:30    |2009-08-12  |12:30:30    |
|2010-06-15 19:56:01|2010-06-15 19:56:01|2010-06-15  |19:56:01    |2010-06-15  |19:56:01    |
|2010-05-30 12:11:33|2010-05-30 12:11:33|2010-05-30  |12:11:33    |2010-05-30  |12:11:33    |
|2010-02-08 17:23:48|2010-02-08 17:23:48|2010-02-08  |17:23:48    |2010-02-08  |17:23:48    |
+-------------------+-------------------+------------+------------+------------+------------+
only showing top 5 rows



### 3. Reviews  Data Transformation

In [33]:
# Explicit schema for reviews.csv: the review date is read as a timestamp,
# every other column as a nullable string
reviews_schema = (
    StructType()
    .add("listing_id", StringType(), True)
    .add("date", TimestampType(), True)
    .add("reviewer_name", StringType(), True)
    .add("comments", StringType(), True)
    .add("sentiment", StringType(), True)
)

In [34]:
# Load reviews.csv with the explicit schema; the file's first line is a header row
reviews_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(reviews_schema)
    .load("../data/reviews.csv")
)

In [35]:
# Confirm the column types declared in reviews_schema were applied on read.
print("Reviews DataFrame Schema:")
reviews_df.printSchema()

Reviews DataFrame Schema:
root
 |-- listing_id: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- sentiment: string (nullable = true)



In [36]:
# Preview the first five reviews (long text cells are truncated by default).
reviews_df.show(5)

+----------+-------------------+-------------+--------------------+---------+
|listing_id|               date|reviewer_name|            comments|sentiment|
+----------+-------------------+-------------+--------------------+---------+
|      3176|2009-06-20 00:00:00|        Milan|excellent stay, i...| positive|
|      3176|2010-11-07 00:00:00|       George|Brittas apartment...| positive|
|      3176|2010-11-24 00:00:00|     Patricia|Fantastic, large ...| positive|
|      3176|2010-12-21 00:00:00|    Benedetta|Lappartamento di ...|  neutral|
|      3176|2011-01-04 00:00:00|         Aude|We went in Berlin...| positive|
+----------+-------------------+-------------+--------------------+---------+
only showing top 5 rows



###  Checking the number of rows in the data

In [37]:
# Total number of review rows.
num_rows = reviews_df.count()
print(f"Number of rows: {num_rows}")

Number of rows: 410284


### Check for missing values

In [38]:
# Null tally per column: when() is non-null only where the cell is null,
# and count() ignores nulls, so each aggregate equals that column's null count
missing_values = reviews_df.select(*[
    count(when(col(name).isNull(), name)).alias(name)
    for name in reviews_df.columns
])
missing_values.show()

[Stage 46:>                                                         (0 + 4) / 4]

+----------+----+-------------+--------+---------+
|listing_id|date|reviewer_name|comments|sentiment|
+----------+----+-------------+--------+---------+
|         0|   0|            0|     587|      385|
+----------+----+-------------+--------+---------+



                                                                                

- As shown above, the missing values are in non-critical columns (comments and sentiment).
- In this case the missing values will be replaced by a placeholder (e.g., "Unknown").

### Imputing unknown values where   the data is missing 

In [39]:
# Keep comments / sentiment only when they hold a non-empty value;
# null and empty-string cells are replaced by the "Unknown" placeholder
reviews_df = (
    reviews_df
    .withColumn(
        "comments",
        when(col("comments").isNotNull() & (col("comments") != ""), col("comments"))
        .otherwise(lit("Unknown")),
    )
    .withColumn(
        "sentiment",
        when(col("sentiment").isNotNull() & (col("sentiment") != ""), col("sentiment"))
        .otherwise(lit("Unknown")),
    )
)

In [40]:
# Re-run the per-column null tally to confirm the imputation left no nulls
missing_values = reviews_df.select(*[
    count(when(col(name).isNull(), name)).alias(name)
    for name in reviews_df.columns
])
missing_values.show()

+----------+----+-------------+--------+---------+
|listing_id|date|reviewer_name|comments|sentiment|
+----------+----+-------------+--------+---------+
|         0|   0|            0|       0|        0|
+----------+----+-------------+--------+---------+



                                                                                

### Splitting the date column into day and time

In [41]:
# Replace the timestamp column `date` with a pure date of the same name plus an
# HH:mm:ss `time` string. Both new columns are appended after the remaining
# columns, so the resulting order is: <other columns>, date, time.
reviews_df = reviews_df.select(
    *[name for name in reviews_df.columns if name != "date"],
    to_date(col("date"), "yyyy-MM-dd").alias("date"),
    date_format(col("date"), "HH:mm:ss").alias("time"),
)

In [42]:
# Preview five reviews to inspect the new date and time columns.
reviews_df.show(5)

+----------+-------------+--------------------+---------+----------+--------+
|listing_id|reviewer_name|            comments|sentiment|      date|    time|
+----------+-------------+--------------------+---------+----------+--------+
|      3176|        Milan|excellent stay, i...| positive|2009-06-20|00:00:00|
|      3176|       George|Brittas apartment...| positive|2010-11-07|00:00:00|
|      3176|     Patricia|Fantastic, large ...| positive|2010-11-24|00:00:00|
|      3176|    Benedetta|Lappartamento di ...|  neutral|2010-12-21|00:00:00|
|      3176|         Aude|We went in Berlin...| positive|2011-01-04|00:00:00|
+----------+-------------+--------------------+---------+----------+--------+
only showing top 5 rows



In [43]:
# Print the column types after the date/time split.
reviews_df.printSchema()


root
 |-- listing_id: string (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)



In [44]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, when, lit, regexp_replace, trim, 
    regexp_extract, isnan, count, date_format,
    to_date, to_timestamp
)
from pyspark.sql.types import *

def create_spark_session(app_name="DataProcessing"):
    """
    Return a SparkSession, reusing an existing one when available.

    Args:
        app_name (str): Application name passed to the session builder.

    Returns:
        SparkSession: The session produced by ``getOrCreate()``.
    """
    builder = SparkSession.builder.appName(app_name)
    session = builder.getOrCreate()
    return session

def read_csv_with_schema(spark, file_path, schema):
    """
    Load a CSV file that has a header row, applying an explicit schema.

    Args:
        spark (SparkSession): Session whose reader is used.
        file_path (str): Location of the CSV file.
        schema (StructType): Column names and types to enforce on read.

    Returns:
        DataFrame: The contents of the file typed according to ``schema``.
    """
    reader = spark.read.format("csv")
    reader = reader.option("header", True)  # first line holds column names
    reader = reader.schema(schema)
    return reader.load(file_path)

def check_missing_values(df):
    """
    Count null cells per column.

    Only nulls are counted; NaN values and empty strings are not treated as missing.

    Args:
        df (DataFrame): Frame to inspect.

    Returns:
        DataFrame: One row holding, under each original column name,
        the number of nulls in that column.
    """
    null_counts = []
    for name in df.columns:
        # when() is non-null only where the cell is null; count() skips nulls
        null_marker = when(col(name).isNull(), name)
        null_counts.append(count(null_marker).alias(name))
    return df.select(null_counts)

def handle_missing_values(df, strategy="drop"):
    """
    Handle missing values in a DataFrame.

    Args:
        df (DataFrame): The input DataFrame.
        strategy (str): Strategy to handle missing values:
            'drop' removes every row containing a null in any column,
            'retain' returns the DataFrame unchanged.

    Returns:
        DataFrame: The DataFrame after handling missing values.

    Raises:
        ValueError: If ``strategy`` is neither 'drop' nor 'retain'. An
            unrecognised value (e.g. a typo such as 'Drop') used to fall
            through silently and keep the nulls.
    """
    if strategy == "drop":
        return df.dropna()
    if strategy == "retain":
        return df
    raise ValueError(
        f"Unknown strategy {strategy!r}; expected 'drop' or 'retain'"
    )

def check_duplicates(df, id_column):
    """
    Count the rows that repeat an already-seen value of ``id_column``.

    Args:
        df (DataFrame): Frame to inspect.
        id_column (str): Column whose values should be unique.

    Returns:
        int: Total rows minus the rows left after de-duplicating on ``id_column``.
    """
    total_rows = df.count()
    distinct_rows = df.dropDuplicates([id_column]).count()
    return total_rows - distinct_rows

def transform_datetime_columns(df, timestamp_columns):
    """
    Add a date column and a time-of-day column for each timestamp column.

    For a source column ``x`` the result gains ``date_x`` (via ``to_date``) and
    ``time_x`` (an ``HH:mm:ss`` string via ``date_format``). The source column is kept.

    Args:
        df (DataFrame): Frame containing the timestamp columns.
        timestamp_columns (list): Names of the columns to split.

    Returns:
        DataFrame: ``df`` with two extra columns per listed name; returned
        unchanged when the list is empty.
    """
    result = df
    for source in timestamp_columns:
        result = result.withColumn(f"date_{source}", to_date(col(source), "yyyy-MM-dd"))
        result = result.withColumn(f"time_{source}", date_format(col(source), "HH:mm:ss"))
    return result

def clean_price_column(df):
    """
    Clean and validate the price column.

    The string ``price`` column is trimmed, stripped of a leading ``$`` and of
    thousands-separator commas, reduced to its leading numeric part and cast to
    double. Rows whose price is not numeric, is not positive, or exceeds 10000
    are removed.

    Thousands separators must be removed before extraction: the extract pattern
    stops at the first comma, so "$1,200.00" would otherwise become 1.0 and
    silently pass the range filter.

    Args:
        df (DataFrame): The input DataFrame; must contain a string ``price`` column.

    Returns:
        DataFrame: The DataFrame with ``price`` as a double in the range (0, 10000].
    """
    without_dollar = regexp_replace(trim(col("price")), "^\\$", "")
    # Only remove commas that separate groups of exactly three digits
    without_separators = regexp_replace(without_dollar, ",(?=\\d{3}(?:\\D|$))", "")
    return (df
        .withColumn("price",
            # Yields "" when the value does not start with a number
            regexp_extract(without_separators, "^\\d+(\\.\\d+)?", 0)
        )
        .withColumn("price",
            when(
                col("price").rlike("^\\d+(\\.\\d+)?$"),  # cast only well-formed numbers
                col("price").cast("double")
            ).otherwise(None)
        )
        .filter(col("price").isNotNull())
        .filter((col("price") > 0) & (col("price") <= 10000))
    )

def validate_listings(df):
    """
    Keep only listings with a plausible minimum-stay length.

    Args:
        df (DataFrame): Frame with a ``minimum_nights`` column.

    Returns:
        DataFrame: Rows where ``minimum_nights`` is greater than 0 and at most 365.
    """
    nights = col("minimum_nights")
    is_positive = nights > 0
    within_a_year = nights <= 365
    return df.filter(is_positive & within_a_year)

def convert_superhost_to_boolean(df):
    """
    Replace the textual ``is_superhost`` flag with a boolean.

    "t" becomes True, "f" becomes False, and any other value becomes null.

    Args:
        df (DataFrame): Frame with an ``is_superhost`` column.

    Returns:
        DataFrame: ``df`` with ``is_superhost`` overwritten by the boolean mapping.
    """
    flag = col("is_superhost")
    as_boolean = (
        when(flag == "t", True)
        .when(flag == "f", False)
        .otherwise(None)
    )
    return df.withColumn("is_superhost", as_boolean)

def impute_text_columns(df, columns, default_value="Unknown"):
    """
    Fill null or empty-string cells of the given columns with a placeholder.

    Args:
        df (DataFrame): Frame to fill.
        columns (list): Names of the text columns to process.
        default_value (str): Placeholder written into null / "" cells.

    Returns:
        DataFrame: ``df`` with the listed columns imputed; other columns untouched.
    """
    filled = df
    for name in columns:
        current = col(name)
        is_blank = current.isNull() | (current == "")
        replacement = when(is_blank, lit(default_value)).otherwise(current)
        filled = filled.withColumn(name, replacement)
    return filled

def process_hosts_data(spark, file_path):
    """
    Load and clean the hosts CSV.

    Steps, in order: read with an explicit schema, drop rows containing nulls,
    convert ``is_superhost`` to boolean, then add date/time columns derived from
    ``created_at`` and ``updated_at``.

    Args:
        spark (SparkSession): The Spark session.
        file_path (str): Path to the hosts CSV file.

    Returns:
        DataFrame: The processed host data.
    """
    field_types = [
        ("id", StringType()),
        ("name", StringType()),
        ("is_superhost", StringType()),
        ("created_at", TimestampType()),
        ("updated_at", TimestampType()),
    ]
    hosts_schema = StructType(
        [StructField(name, dtype, True) for name, dtype in field_types]
    )

    hosts = read_csv_with_schema(spark, file_path, hosts_schema)
    hosts = handle_missing_values(hosts)
    hosts = convert_superhost_to_boolean(hosts)
    hosts = transform_datetime_columns(hosts, ["created_at", "updated_at"])
    return hosts

def process_listings_data(spark, file_path):
    """
    Load and clean the listings CSV.

    Steps, in order: read with an explicit schema, drop rows containing nulls,
    clean and validate ``price``, filter on ``minimum_nights``, then add
    date/time columns derived from ``created_at`` and ``updated_at``.

    Args:
        spark (SparkSession): The Spark session.
        file_path (str): Path to the listings CSV file.

    Returns:
        DataFrame: The processed listing data.
    """
    field_types = [
        ("id", StringType()),
        ("listing_url", StringType()),
        ("name", StringType()),
        ("room_type", StringType()),
        ("minimum_nights", IntegerType()),
        ("host_id", StringType()),
        ("price", StringType()),  # raw text; converted by clean_price_column
        ("created_at", TimestampType()),
        ("updated_at", TimestampType()),
    ]
    listings_schema = StructType(
        [StructField(name, dtype, True) for name, dtype in field_types]
    )

    listings = read_csv_with_schema(spark, file_path, listings_schema)
    listings = handle_missing_values(listings)
    listings = clean_price_column(listings)
    listings = validate_listings(listings)
    listings = transform_datetime_columns(listings, ["created_at", "updated_at"])
    return listings

def process_reviews_data(spark, file_path):
    """
    Load and clean the reviews CSV.

    Steps, in order: read with an explicit schema, fill null/empty ``comments``
    and ``sentiment`` with the default placeholder, then add date/time columns
    derived from ``date`` (the original ``date`` column is kept).

    Args:
        spark (SparkSession): The Spark session.
        file_path (str): Path to the reviews CSV file.

    Returns:
        DataFrame: The processed review data.
    """
    field_types = [
        ("listing_id", StringType()),
        ("date", TimestampType()),
        ("reviewer_name", StringType()),
        ("comments", StringType()),
        ("sentiment", StringType()),
    ]
    reviews_schema = StructType(
        [StructField(name, dtype, True) for name, dtype in field_types]
    )

    reviews = read_csv_with_schema(spark, file_path, reviews_schema)
    reviews = impute_text_columns(reviews, ["comments", "sentiment"])
    reviews = transform_datetime_columns(reviews, ["date"])
    return reviews
