## Install PySpark

In [1]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel, and pin the version so re-runs get the same PySpark release.
%pip install pyspark==3.5.2

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 2.8 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=bb3917a42031cc2863a57c06e481281d96d8b540fd03498d256aff244f0d14ae
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


## 1. Starting a PySpark Session

In [2]:
from pyspark.sql import SparkSession

# Build the Spark session used by the rest of the notebook; getOrCreate()
# hands back an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("WriteToParquet")
    .getOrCreate()
)


## 2. Generating a Sample Dataset

In [3]:
import random
import pandas as pd

def generate_data(n, seed=None):
    """Generate ``n`` random transaction records as a list of tuples.

    Each record is ``(customer_id, transaction_id, transaction_date, amount,
    product_category)``:

    * ``customer_id`` -- one of 100 IDs, ``'C00001'`` through ``'C00100'``.
    * ``transaction_id`` -- ``'T'`` followed by a number; unique within the
      returned list.
    * ``transaction_date`` -- a ``pd.Timestamp`` between 2023-01-01 and
      180 days later, inclusive.
    * ``amount`` -- a float drawn from ``random.uniform(5, 500)`` and rounded
      to two decimals.
    * ``product_category`` -- one of five fixed category names.

    Args:
        n: Number of records to generate. Zero or a negative value yields an
            empty list.
        seed: Optional seed for reproducible output. When ``None`` (the
            default) the module-level ``random`` generator is used, so a
            prior ``random.seed(...)`` call is still honoured.

    Returns:
        A list of ``n`` 5-tuples in the field order described above.
    """
    if n <= 0:
        return []

    # A dedicated generator keeps seeded runs independent of global state.
    rng = random if seed is None else random.Random(seed)

    customer_ids = [f'C{i:05d}' for i in range(1, 101)]
    product_categories = ['Electronics', 'Books', 'Clothing', 'Groceries', 'Furniture']
    start_date = pd.Timestamp('2023-01-01')  # loop-invariant, build it once

    # Independent draws from the 90,000 five-digit numbers collide frequently
    # (and must collide once n > 90,000), so sample without replacement and
    # widen the number only when five digits cannot supply n distinct values.
    digits = 5
    while 9 * 10 ** (digits - 1) < n:
        digits += 1
    id_numbers = rng.sample(range(10 ** (digits - 1), 10 ** digits), n)

    data = []
    for id_number in id_numbers:
        customer_id = rng.choice(customer_ids)
        transaction_date = start_date + pd.to_timedelta(rng.randint(0, 180), unit='d')
        amount = round(rng.uniform(5, 500), 2)
        product_category = rng.choice(product_categories)
        data.append((customer_id, f'T{id_number}', transaction_date, amount, product_category))

    return data


In [4]:
# Column names, in the same order as the fields of each generated tuple
columns = ['CustomerID', 'TransactionID', 'TransactionDate', 'Amount', 'ProductCategory']

# Build 100,000 synthetic transaction rows
data = generate_data(100_000)

# Stage the rows in a pandas DataFrame first
df = pd.DataFrame(data=data, columns=columns)

# Hand the pandas frame to Spark and preview the first five rows
spark_df = spark.createDataFrame(df)
spark_df.show(n=5)


+----------+-------------+-------------------+------+---------------+
|CustomerID|TransactionID|    TransactionDate|Amount|ProductCategory|
+----------+-------------+-------------------+------+---------------+
|    C00012|       T36462|2023-05-05 00:00:00| 90.91|      Furniture|
|    C00037|       T81031|2023-03-19 00:00:00|465.54|    Electronics|
|    C00092|       T98628|2023-02-25 00:00:00| 180.9|       Clothing|
|    C00050|       T46850|2023-04-16 00:00:00|494.67|      Furniture|
|    C00097|       T79766|2023-04-11 00:00:00|179.65|      Groceries|
+----------+-------------+-------------------+------+---------------+
only showing top 5 rows



## 3. Writing DataFrames to Parquet Files

In [5]:
# Destination path for the Parquet output
output_path = "transactions.parquet"

# Write the DataFrame in Parquet format. The writer's default save mode raises
# an error when the path already exists, which makes this cell fail on every
# re-run; "overwrite" replaces the previous output instead.
spark_df.write.mode("overwrite").parquet(output_path)


In [6]:
# List the working directory to confirm that transactions.parquet was created
! ls

sample_data  transactions.parquet


## 4. Writing Partitioned Parquet Files


In [7]:
# Write the DataFrame in Parquet format, partitioned by 'ProductCategory':
# each distinct category gets its own 'ProductCategory=<value>' subdirectory
# under the output path.
partitioned_output_path = "transactions_partitioned.parquet"

# "overwrite" keeps the cell re-runnable; the default save mode raises an
# error when the output path already exists.
spark_df.write.mode("overwrite").partitionBy("ProductCategory").parquet(partitioned_output_path)


In [8]:
# List the working directory again; the partitioned output should now appear as well
! ls

sample_data  transactions.parquet  transactions_partitioned.parquet


In [10]:
# Inspect the partitioned output: one 'ProductCategory=<value>' entry per category
! ls transactions_partitioned.parquet

'ProductCategory=Books'     'ProductCategory=Electronics'  'ProductCategory=Groceries'
'ProductCategory=Clothing'  'ProductCategory=Furniture'     _SUCCESS


## 5. Reading Parquet Files

In [11]:
# Load the Parquet data stored at `output_path` back into a Spark DataFrame
df_read = spark.read.parquet(output_path)

# Preview the first five rows of the loaded data
df_read.show(n=5)


+----------+-------------+-------------------+------+---------------+
|CustomerID|TransactionID|    TransactionDate|Amount|ProductCategory|
+----------+-------------+-------------------+------+---------------+
|    C00012|       T36462|2023-05-05 00:00:00| 90.91|      Furniture|
|    C00037|       T81031|2023-03-19 00:00:00|465.54|    Electronics|
|    C00092|       T98628|2023-02-25 00:00:00| 180.9|       Clothing|
|    C00050|       T46850|2023-04-16 00:00:00|494.67|      Furniture|
|    C00097|       T79766|2023-04-11 00:00:00|179.65|      Groceries|
+----------+-------------+-------------------+------+---------------+
only showing top 5 rows

