## Install PySpark

In [1]:
! pip3 install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 3.8 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=bbfd80a589ea8e2302f3938fd11b4434a84633b28244a0229ecf62245ae601d1
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


## 1. Start a PySpark Session


In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook, registered under the
# application name "ReadCSV"
spark = (
    SparkSession.builder
    .appName("ReadCSV")
    .getOrCreate()
)


## 2. Generate a Sample CSV File

In [3]:
import random
import pandas as pd

# Function to generate random transaction data
def generate_data(n, seed=None):
    """Generate ``n`` random retail transactions.

    Each transaction is a tuple
    ``(customer_id, transaction_id, transaction_date, amount, product_category)``:

    * ``customer_id``: one of ``'C00001'`` .. ``'C00100'``.
    * ``transaction_id``: ``'T'`` followed by a number; unique within the
      returned list.
    * ``transaction_date``: a ``pd.Timestamp`` from 2023-01-01 up to and
      including 180 days later.
    * ``amount``: a float drawn from the range 5 to 500, rounded to 2 decimals.
    * ``product_category``: one of the five category names listed below.

    Args:
        n: Number of transactions to generate. Values <= 0 yield an empty list.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used, so the result is reproducible and the global ``random``
            state is left untouched. When ``None`` (the default) the global
            ``random`` module is used, so a prior ``random.seed(...)`` call
            still takes effect.

    Returns:
        A list of ``n`` transaction tuples.
    """
    # The `random` module and a `random.Random` instance expose the same
    # choice/randint/uniform/sample API, so either can serve as the source.
    rng = random if seed is None else random.Random(seed)

    customer_ids = [f'C{i:05d}' for i in range(1, 101)]
    product_categories = ['Electronics', 'Books', 'Clothing', 'Groceries', 'Furniture']
    start_date = pd.Timestamp('2023-01-01')

    count = max(n, 0)

    # Draw the ID numbers without replacement so no two transactions share an
    # ID. The 5-digit range 10000..99999 is kept whenever it is large enough;
    # it is only widened when more than 90,000 rows are requested.
    id_stop = max(100_000, 10_000 + count)
    transaction_numbers = rng.sample(range(10_000, id_stop), count)

    data = []
    for number in transaction_numbers:
        customer_id = rng.choice(customer_ids)
        transaction_id = f'T{number}'
        transaction_date = start_date + pd.Timedelta(days=rng.randint(0, 180))
        amount = round(rng.uniform(5, 500), 2)
        product_category = rng.choice(product_categories)
        data.append((customer_id, transaction_id, transaction_date, amount, product_category))

    return data

# Seed Python's random module so that every run of the notebook produces the
# same sample data (and therefore the same downstream outputs)
SEED = 42
random.seed(SEED)

# Generate 10000 rows of transaction data
data = generate_data(10_000)

# Convert to a Pandas DataFrame
columns = ['CustomerID', 'TransactionID', 'TransactionDate', 'Amount', 'ProductCategory']
df = pd.DataFrame(data, columns=columns)

# Create the CSV file; index=False keeps the pandas row index out of the file
# so the CSV contains only the five data columns
csv_path = "sample_transactions.csv"
df.to_csv(csv_path, index=False)

print(f"Sample CSV file '{csv_path}' generated.")

Sample CSV file 'sample_transactions.csv' generated.


## 3. Read the CSV File into a PySpark DataFrame

In [4]:
# Load the CSV into a Spark DataFrame: the first line supplies the column
# names and Spark infers each column's type from the data
spark_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(csv_path)
)

# Preview the first 5 rows
spark_df.show(n=5)


+----------+-------------+---------------+------+---------------+
|CustomerID|TransactionID|TransactionDate|Amount|ProductCategory|
+----------+-------------+---------------+------+---------------+
|    C00006|       T58996|     2023-01-09| 17.02|      Furniture|
|    C00076|       T30519|     2023-02-28|459.67|          Books|
|    C00076|       T89246|     2023-06-10|404.95|       Clothing|
|    C00049|       T11436|     2023-06-05| 103.9|          Books|
|    C00049|       T18176|     2023-04-03|406.55|      Furniture|
+----------+-------------+---------------+------+---------------+
only showing top 5 rows



## 4. Explore the DataFrame

In [10]:
# Print the schema of the DataFrame: one line per column showing its name,
# its data type and whether it is nullable
spark_df.printSchema()


root
 |-- CustomerID: string (nullable = true)
 |-- TransactionID: string (nullable = true)
 |-- TransactionDate: date (nullable = true)
 |-- Amount: double (nullable = true)
 |-- ProductCategory: string (nullable = true)



In [11]:
from pyspark.sql.functions import col

# Keep only the transactions whose Amount exceeds 100
filtered_df = spark_df.where(col("Amount") > 100)

# Narrow the result down to the two identifier columns and the amount
selected_df = filtered_df.select(
    col("CustomerID"),
    col("TransactionID"),
    col("Amount"),
)

# Preview the first 5 matching rows
selected_df.show(n=5)

+----------+-------------+------+
|CustomerID|TransactionID|Amount|
+----------+-------------+------+
|    C00076|       T30519|459.67|
|    C00076|       T89246|404.95|
|    C00049|       T11436| 103.9|
|    C00049|       T18176|406.55|
|    C00096|       T31087|349.47|
+----------+-------------+------+
only showing top 5 rows

