<a href="https://colab.research.google.com/github/aswin24012004/Data-Projects/blob/main/GtaViceCityETL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.types import IntegerType

In [2]:
def main():
    spark = SparkSession.builder \
        .appName("GtaViceCityETL") \
        .getOrCreate()

    try:
        print("Extracting data from GTA Vice_City.csv...")
        df = spark.read.csv('/content/GTA Vice_City.csv', header=True, inferSchema=True)
        print("Extraction complete.")

        # --- 2. TRANSFORM ---
        print("Transforming data...")
        # a. Select only the columns we need for our analysis.
        df_transformed = df.select(
            'id', 'language', 'review', 'created',
            'voted_up', 'votes_up', 'comment_count'
        )

        df_transformed = df_transformed.withColumn(
            'review',
            regexp_replace(col('review'), r'[^A-Za-z0-9\s]+', '')
        ).withColumn(
            'review',
            regexp_replace(col('review'), r'\s+', ' ')
        )

        # c. Cast the 'voted_up' column from boolean (or string) to an integer.
        df_transformed = df_transformed.withColumn(
            'voted_up',
            col('voted_up').cast(IntegerType())
        )
        print("Transformation complete.")

        print("\n--- Transformed Data (Top 20 Rows) ---")
        df_transformed.show()

    except Exception as e:
        if "Path does not exist" in str(e):
            print("Error: 'GTA Vice_City.csv' not found. Please ensure the file is in the same directory as the script.")
        else:
            print(f"An error occurred: {e}")

    finally:
        spark.stop()


In [4]:
if __name__ == "__main__":
    main()

Extracting data from GTA Vice_City.csv...
Extraction complete.
Transforming data...
Transformation complete.

--- Transformed Data (Top 20 Rows) ---
+--------------------+-------------------+--------------------+-------------------+--------+--------+-------------+
|                  id|           language|              review|            created|voted_up|votes_up|comment_count|
+--------------------+-------------------+--------------------+-------------------+--------+--------+-------------+
|           157337410|            english|Games good But Ro...|2024-02-01 16:00:22|    NULL|       0|            0|
|           157337371|            english|modders make it b...|2024-02-01 15:59:57|    NULL|       0|            0|
|           157337210|            english|          great game|2024-02-01 15:57:48|    NULL|       0|            0|
|           157336468|            english|                best|2024-02-01 15:47:51|    NULL|       0|            0|
|           157335380|            engli