In [1]:
# findspark has to run before the first pyspark import: init() locates the
# Spark installation and puts its Python libraries on sys.path, which is what
# makes `import pyspark` resolve against that installation.
import findspark
findspark.init()

from datetime import date, datetime

from pyspark.sql import Row, SparkSession

# Reuse the active session if one already exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Three sample records, one per month (January-March 2000). The explicit
# schema string fixes each column's type.
sample_rows = [
    Row(
        a=month,
        b=float(month + 1),
        c=f'string{month}',
        d=date(2000, month, 1),
        e=datetime(2000, month, 1, 12, 0),
    )
    for month in (1, 2, 3)
]
df = spark.createDataFrame(
    sample_rows,
    schema='a long, b double, c string, d date, e timestamp',
)

# One record per block (vertical layout), followed by the column types.
df.show(vertical=True)
df.printSchema()

-RECORD 0------------------
 a   | 1                   
 b   | 2.0                 
 c   | string1             
 d   | 2000-01-01          
 e   | 2000-01-01 12:00:00 
-RECORD 1------------------
 a   | 2                   
 b   | 3.0                 
 c   | string2             
 d   | 2000-02-01          
 e   | 2000-02-01 12:00:00 
-RECORD 2------------------
 a   | 3                   
 b   | 4.0                 
 c   | string3             
 d   | 2000-03-01          
 e   | 2000-03-01 12:00:00 

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [2]:
# List the DataFrame's column names.
df.columns

['a', 'b', 'c', 'd', 'e']

In [3]:
# Summary statistics for columns a, b and c.
(
    df.select('a', 'b', 'c')
      .describe()
      .show()
)

+-------+---+---+-------+
|summary|  a|  b|      c|
+-------+---+---+-------+
|  count|  3|  3|      3|
|   mean|2.0|3.0|   null|
| stddev|1.0|1.0|   null|
|    min|  1|2.0|string1|
|    max|  3|4.0|string3|
+-------+---+---+-------+



In [4]:
# Return the first row as a one-element list of Row objects.
df.take(1)

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))]

In [6]:
# Referencing a column yields a Column object, not the column's values.
df['a']

Column<'a'>

In [14]:
from pyspark.sql.functions import upper

# A plain column reference, a function applied to it, and a null check:
# each one is a Column expression.
(df['c'], upper(df['c']), df['c'].isNull())

(Column<'c'>, Column<'upper(c)'>, Column<'(c IS NULL)'>)

In [15]:
# Project the DataFrame down to column c and display it.
df.select(df['c']).show()

+-------+
|      c|
+-------+
|string1|
|string2|
|string3|
+-------+



In [17]:
# Add an upper-cased copy of column c. `df` is rebound, so every later cell
# sees the extra `upper_c` column.
df = df.withColumn('upper_c', upper(df['c']))
df.show()

+---+---+-------+----------+-------------------+-------+
|  a|  b|      c|         d|                  e|upper_c|
+---+---+-------+----------+-------------------+-------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|STRING1|
|  2|3.0|string2|2000-02-01|2000-02-01 12:00:00|STRING2|
|  3|4.0|string3|2000-03-01|2000-03-01 12:00:00|STRING3|
+---+---+-------+----------+-------------------+-------+



In [18]:
# Keep only the rows where column a equals 1.
df.filter(df['a'] == 1).show()

+---+---+-------+----------+-------------------+-------+
|  a|  b|      c|         d|                  e|upper_c|
+---+---+-------+----------+-------------------+-------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|STRING1|
+---+---+-------+----------+-------------------+-------+

