# <u>Imports</u>

In [6]:
import pyspark
from pyspark.sql import Row
import pyspark.sql.types as T
import pyspark.sql.functions as F

# <u>Spark Session</u>

In [2]:
# Obtain the session through the builder: getOrCreate() hands back the
# already-active SparkSession if there is one, otherwise it starts a new one.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

# <u>Creating DataFrames</u>

## Defined Schema

In [9]:
# Explicit schema with one column per kind of type shown here: scalars,
# an array of strings, and a nested struct. All fields are nullable.
schema = (
    T.StructType()
    .add("string_field", T.StringType(), True)
    .add("integer_field", T.IntegerType(), True)
    .add("float_field", T.DoubleType(), True)
    .add("boolean_field", T.BooleanType(), True)
    .add("array_field", T.ArrayType(T.StringType()), True)
    .add(
        "struct_field",
        T.StructType().add("sub_field", T.StringType(), True),
        True,
    )
)

# Each inner list is one row, with values in schema order; the nested struct
# value is supplied as a dict keyed by the sub-field name.
df_1 = spark.createDataFrame(
    [
        ["a", 1, 1.1, True, ["b"], {"sub_field": "c"}],
        ["d", 2, 2.1, False, ["e", "f"], {"sub_field": "g"}],
    ],
    schema=schema,
)
display(df_1.toPandas())

Unnamed: 0,string_field,integer_field,float_field,boolean_field,array_field,struct_field
0,a,1,1.1,True,[b],"(c,)"
1,d,2,2.1,False,"[e, f]","(g,)"


## Inferred Schema

In [10]:
# No schema is passed here: column names and types are inferred from the
# Row objects themselves.
data = [
    Row(name=name, age=age, city=city)
    for name, age, city in (
        ("Alice", 25, "New York"),
        ("Bob", 30, "San Francisco"),
        ("Charlie", 35, "Los Angeles"),
    )
]
df_2 = spark.createDataFrame(data)
display(df_2.toPandas())

Unnamed: 0,name,age,city
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Los Angeles


# <u>Querying DataFrames</u>

## Select

In [14]:
# Project the DataFrame down to the two requested columns.
result = df_2.select(df_2["name"], df_2["age"])
result.show()

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



## Where

In [19]:
# Keep only the rows whose age is strictly greater than 30.
older_than_30 = df_2["age"] > 30
result = df_2.where(older_than_30)
result.show()

+-------+---+-----------+
|   name|age|       city|
+-------+---+-----------+
|Charlie| 35|Los Angeles|
+-------+---+-----------+



In [20]:
# Keep only the rows whose array column contains the element "e".
result = df_1.where(
    F.array_contains(df_1["array_field"], "e")
)
result.show()

+------------+-------------+-----------+-------------+-----------+------------+
|string_field|integer_field|float_field|boolean_field|array_field|struct_field|
+------------+-------------+-----------+-------------+-----------+------------+
|           d|            2|        2.1|        false|     [e, f]|         {g}|
+------------+-------------+-----------+-------------+-----------+------------+



In [21]:
# Substring match on a string column: keep cities containing "York".
result = df_2.where(
    df_2["city"].contains("York")
)
result.show()

+-----+---+--------+
| name|age|    city|
+-----+---+--------+
|Alice| 25|New York|
+-----+---+--------+



## OrderBy

In [16]:
# Sort by age, youngest first (ascending is the default order).
result = df_2.orderBy("age", ascending=True)
result.show()

+-------+---+-------------+
|   name|age|         city|
+-------+---+-------------+
|  Alice| 25|     New York|
|    Bob| 30|San Francisco|
|Charlie| 35|  Los Angeles|
+-------+---+-------------+



## Distinct

In [22]:
# Unique values of a single column: project first, then de-duplicate.
result = (
    df_2
    .select(df_2["city"])
    .distinct()
)
result.show()

+-------------+
|         city|
+-------------+
|     New York|
|San Francisco|
|  Los Angeles|
+-------------+



## Count

In [24]:
# count() returns the number of rows in the DataFrame as a plain Python
# value, so it is printed directly rather than shown with .show().
result = df_1.count()
print(result)

2


## Limit

In [25]:
# Cap the result at two rows.
result = df_2.limit(num=2)
result.show()

+-----+---+-------------+
| name|age|         city|
+-----+---+-------------+
|Alice| 25|     New York|
|  Bob| 30|San Francisco|
+-----+---+-------------+



## Retrieve Value

In [34]:
# Pull a single scalar out of the DataFrame: filter to Alice's row, keep
# only the city column, collect the rows to the driver, then index the
# first row and its first field.
result = (
    df_2
    .where(df_2["name"] == "Alice")
    .select("city")
    .collect()[0][0]
)
print(result)

New York


In [36]:
# Same lookup using first(): take the first matching Row and read its
# "city" field by name.
result = (
    df_2
    .where(df_2["name"] == "Alice")
    .first()["city"]
)
print(result)

New York


## ToPandas

In [26]:
# Convert the Spark DataFrame into a pandas DataFrame and print its
# plain-text representation.
result = df_1.toPandas()
print(result)

  string_field  integer_field  float_field  boolean_field array_field  \
0            a              1          1.1           True         [b]   
1            d              2          2.1          False      [e, f]   

  struct_field  
0         (c,)  
1         (g,)  


## ToJSON

In [32]:
# Serialize each row to a JSON string and collect them into a Python list
# (one string per row), then print the list.
result = df_1.toJSON().collect()
print(result)

['{"string_field":"a","integer_field":1,"float_field":1.1,"boolean_field":true,"array_field":["b"],"struct_field":{"sub_field":"c"}}', '{"string_field":"d","integer_field":2,"float_field":2.1,"boolean_field":false,"array_field":["e","f"],"struct_field":{"sub_field":"g"}}']


# <u>DataFrame Manipulations</u>

## WithColumn

## DropDuplicates

## Explode

## GroupBy

## Pivot

## Drop

# <u>DataFrame Operations</u>

## Join

## Union