In [1]:
from pyspark.sql import SparkSession, Row
from datetime import datetime, date
import pandas as pd
import time

In [2]:
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/12 22:01:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Create PySpark dataframe from list of Rows

In [4]:
df = spark.createDataFrame([
    Row(a=1, b = 1., c='string1', d=date(2023, 1, 1), e=datetime(2023, 1, 1, 12, 0)),    
    Row(a=2, b = 2., c='string2', d=date(2023, 1, 2), e=datetime(2023, 1, 2, 12, 0)),    
    Row(a=3, b = 3., c='string3', d=date(2023, 1, 3), e=datetime(2023, 1, 3, 12, 0))
])
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|1.0|string1|2023-01-01|2023-01-01 12:00:00|
|  2|2.0|string2|2023-01-02|2023-01-02 12:00:00|
|  3|3.0|string3|2023-01-03|2023-01-03 12:00:00|
+---+---+-------+----------+-------------------+



Create PySpark dataframe with explicit schema

In [5]:
df = spark.createDataFrame([
    Row(a=1, b= 1., c= 'String1', d = date(2023, 1, 1), e=datetime(2023,1,1,12,0)),
    Row(a=2, b= 2., c= 'String2', d = date(2023, 2, 1), e=datetime(2023,1,2,12,0)),
    Row(a=3, b= 3., c= 'String3', d = date(2023, 3, 1), e=datetime(2023,1,3,12,0))],
    schema = 'a long, b double, c string, d date, e timestamp'
)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

Create PySpark dataframe with pandas dataframe

In [6]:
pandas_df = pd.DataFrame(
    {
        'a': [1,2,3],
        'b': [1., 2., 3.],
        'c': ['String1', 'String2', 'String3'],
        'd': [date(2023,1,1), date(2023,1,2), date(2023,1,3)],
        'e': [datetime(2023,1,1,12,0), datetime(2023,1,2,12,0), datetime(2023,1,3,12,0)]
    }
)

df = spark.createDataFrame(pandas_df)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [18]:
data = [[1, 1., 'String1', date(2023,1,1), datetime(2023,1,1,12,0)],
        [2, 2., 'String2', date(2023,1,2), datetime(2023,1,2,12,0)],
        [3, 3., 'String3', date(2023,1,3), datetime(2023,1,3,12,0)],
        [4, 4., 'String4', date(2023,1,4), datetime(2023,1,4,12,0)]]

column = ['a', 'b', 'c', 'd', 'e']
df = spark.createDataFrame(data, column)
df.show()


+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|1.0|String1|2023-01-01|2023-01-01 12:00:00|
|  2|2.0|String2|2023-01-02|2023-01-02 12:00:00|
|  3|3.0|String3|2023-01-03|2023-01-03 12:00:00|
|  4|4.0|String4|2023-01-04|2023-01-04 12:00:00|
+---+---+-------+----------+-------------------+



                                                                                

Dataframe display using show method

In [19]:
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|1.0|String1|2023-01-01|2023-01-01 12:00:00|
|  2|2.0|String2|2023-01-02|2023-01-02 12:00:00|
|  3|3.0|String3|2023-01-03|2023-01-03 12:00:00|
|  4|4.0|String4|2023-01-04|2023-01-04 12:00:00|
+---+---+-------+----------+-------------------+



In [20]:
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
df

a,b,c,d,e
1,1.0,String1,2023-01-01,2023-01-01 12:00:00
2,2.0,String2,2023-01-02,2023-01-02 12:00:00
3,3.0,String3,2023-01-03,2023-01-03 12:00:00
4,4.0,String4,2023-01-04,2023-01-04 12:00:00


In [21]:
df.show(1, vertical=True)

-RECORD 0------------------
 a   | 1                   
 b   | 1.0                 
 c   | String1             
 d   | 2023-01-01          
 e   | 2023-01-01 12:00:00 
only showing top 1 row



In [22]:
df.columns

['a', 'b', 'c', 'd', 'e']

In [23]:
df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [24]:
#summary of dataframe
df.select('a', 'b', 'c').describe().show()



+-------+------------------+------------------+-------+
|summary|                 a|                 b|      c|
+-------+------------------+------------------+-------+
|  count|                 4|                 4|      4|
|   mean|               2.5|               2.5|   null|
| stddev|1.2909944487358056|1.2909944487358056|   null|
|    min|                 1|               1.0|String1|
|    max|                 4|               4.0|String4|
+-------+------------------+------------------+-------+



                                                                                

DataFrame.collect() for collecting distributed data from the executors and bring it to the driver. Can throw out of memory exception when the dataset is too large to fit in the driver side, as all the data from the executors is collected.

In [25]:
df.collect()

[Row(a=1, b=1.0, c='String1', d=datetime.date(2023, 1, 1), e=datetime.datetime(2023, 1, 1, 12, 0)),
 Row(a=2, b=2.0, c='String2', d=datetime.date(2023, 1, 2), e=datetime.datetime(2023, 1, 2, 12, 0)),
 Row(a=3, b=3.0, c='String3', d=datetime.date(2023, 1, 3), e=datetime.datetime(2023, 1, 3, 12, 0)),
 Row(a=4, b=4.0, c='String4', d=datetime.date(2023, 1, 4), e=datetime.datetime(2023, 1, 4, 12, 0))]

In [27]:
df.take(2)
df.tail(1)

[Row(a=4, b=4.0, c='String4', d=datetime.date(2023, 1, 4), e=datetime.datetime(2023, 1, 4, 12, 0))]

toPandas() also takes all the data from executors and brings it to the driver. So this could also result in out of memory exception.

In [29]:
df

a,b,c,d,e
1,1.0,String1,2023-01-01,2023-01-01 12:00:00
2,2.0,String2,2023-01-02,2023-01-02 12:00:00
3,3.0,String3,2023-01-03,2023-01-03 12:00:00
4,4.0,String4,2023-01-04,2023-01-04 12:00:00


In [32]:
from pyspark.sql.functions import date_format

df_pandas = df.withColumn('e', date_format('e', 'yyyy-MM-dd HH:mm:ss')).toPandas()
df_pandas

                                                                                

Unnamed: 0,a,b,c,d,e
0,1,1.0,String1,2023-01-01,2023-01-01 12:00:00
1,2,2.0,String2,2023-01-02,2023-01-02 12:00:00
2,3,3.0,String3,2023-01-03,2023-01-03 12:00:00
3,4,4.0,String4,2023-01-04,2023-01-04 12:00:00


In [35]:
print(type(df))
print(type(df_pandas))

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


Selecting and accessing data

In [36]:
df.a

Column<'a'>

In [37]:
from pyspark.sql import Column
from pyspark.sql.functions import upper

In [38]:
df.select(df.c).show()

+-------+
|      c|
+-------+
|String1|
|String2|
|String3|
|String4|
+-------+



In [39]:
df.withColumn('upper_c', upper('c')).show()
df.withColumn('upper_c', upper('c')).show()

+---+---+-------+----------+-------------------+-------+
|  a|  b|      c|         d|                  e|upper_c|
+---+---+-------+----------+-------------------+-------+
|  1|1.0|String1|2023-01-01|2023-01-01 12:00:00|STRING1|
|  2|2.0|String2|2023-01-02|2023-01-02 12:00:00|STRING2|
|  3|3.0|String3|2023-01-03|2023-01-03 12:00:00|STRING3|
|  4|4.0|String4|2023-01-04|2023-01-04 12:00:00|STRING4|
+---+---+-------+----------+-------------------+-------+



In [41]:
df.filter(df.a == 2).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  2|2.0|String2|2023-01-02|2023-01-02 12:00:00|
+---+---+-------+----------+-------------------+

