# Create a data transformation pipeline with Pyspark

## Basic introduction

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SparkByExamples.com").getOrCreate()

print(f"Spark version: {spark.version}")

Spark version: 3.4.1


### Reading a CSV file

In [3]:
df = spark.read.option("header",True).csv("datasets/ratings.csv")
df.show(5)

+------------+-------+---------------+-------+
|       brand|  model|absorption_rate|comfort|
+------------+-------+---------------+-------+
|Diapers-R-Us|6months|              2|      3|
|     Nappy-k|2months|              3|      4|
|     Pampers|3months|              4|      4|
|     Huggies|newborn|              3|      5|
+------------+-------+---------------+-------+



### Defining a schema

In [5]:
# define the schema
from pyspark.sql.types import StructType, StructField, StringType, ByteType

schema = StructType([
    StructField("brand", StringType(), False),
    StructField("model", StringType(), False),
    StructField("absorption_rate", ByteType(), True),
    StructField("comfort", ByteType(), True),
])

better_df = spark.read.option("header", True).schema(schema).csv("datasets/ratings.csv")

better_df.dtypes

[('brand', 'string'),
 ('model', 'string'),
 ('absorption_rate', 'tinyint'),
 ('comfort', 'tinyint')]

## Cleaning data

### Removing invalid rows

In [7]:
rattings = spark.read.option(header='true', mode="DROPMALFORMED").csv("datasets/ratings_with_invalid_rows.csv")
ratings.show(5)

TypeError: option() got an unexpected keyword argument 'header'