In [1]:
from pyspark.sql import SparkSession, Row
from datetime import datetime, date
import pandas as pd
import time

In [2]:
# Get the SparkSession used by every cell below; getOrCreate() hands back an
# already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/10 19:18:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Create a PySpark DataFrame from a list of Rows

In [3]:
# Build a DataFrame from a list of Row objects. No schema is passed, so the
# column types are left to Spark's inference from the Python values.
df = spark.createDataFrame([
    Row(a=1, b=1., c='string1', d=date(2023, 1, 1), e=datetime(2023, 1, 1, 12, 0)),
    Row(a=2, b=2., c='string2', d=date(2023, 1, 2), e=datetime(2023, 1, 2, 12, 0)),
    Row(a=3, b=3., c='string3', d=date(2023, 1, 3), e=datetime(2023, 1, 3, 12, 0)),
])
# show() prints the table itself and returns None, so it must not be wrapped
# in print() -- that only appends a stray "None" line to the output.
df.show()

                                                                                

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|1.0|string1|2023-01-01|2023-01-01 12:00:00|
|  2|2.0|string2|2023-01-02|2023-01-02 12:00:00|
|  3|3.0|string3|2023-01-03|2023-01-03 12:00:00|
+---+---+-------+----------+-------------------+

None


Create a PySpark DataFrame with an explicit schema

In [4]:
# Same Row-based construction, but with the column types pinned by a DDL-style
# schema string instead of being inferred.
# NOTE(review): column d steps by month (Jan 1, Feb 1, Mar 1) while column e
# steps by day (Jan 1, 2, 3) -- confirm this difference is intended.
df = spark.createDataFrame(
    [
        Row(a=1, b=1., c='String1', d=date(2023, 1, 1), e=datetime(2023, 1, 1, 12, 0)),
        Row(a=2, b=2., c='String2', d=date(2023, 2, 1), e=datetime(2023, 1, 2, 12, 0)),
        Row(a=3, b=3., c='String3', d=date(2023, 3, 1), e=datetime(2023, 1, 3, 12, 0)),
    ],
    schema='a long, b double, c string, d date, e timestamp',
)
# Bare last expression: the notebook displays the DataFrame's repr.
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

Create a PySpark DataFrame from a pandas DataFrame

In [5]:
# Assemble the same three rows column-by-column in pandas first, then hand the
# pandas frame to Spark for conversion.
pandas_df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [1., 2., 3.],
    'c': ['String1', 'String2', 'String3'],
    # Jan 1st-3rd 2023, as plain dates and as noon timestamps.
    'd': [date(2023, 1, day) for day in range(1, 4)],
    'e': [datetime(2023, 1, day, 12, 0) for day in range(1, 4)],
})

df = spark.createDataFrame(pandas_df)
# Bare last expression: the notebook displays the DataFrame's repr.
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [10]:
# Build a DataFrame from plain Python lists plus a separate list of column
# names (types are not specified here).
#
# The result is bound to its own name instead of `df`: the cells further down
# display `df` as the three-row DataFrame created from pandas, and rebinding
# `df` to these four identical rows would change their output whenever the
# notebook is run top to bottom.
data = [
    [1, 1., 'String1', date(2023, 1, 1), datetime(2023, 1, 1, 12, 0)],
    [1, 1., 'String1', date(2023, 1, 1), datetime(2023, 1, 1, 12, 0)],
    [1, 1., 'String1', date(2023, 1, 1), datetime(2023, 1, 1, 12, 0)],
    [1, 1., 'String1', date(2023, 1, 1), datetime(2023, 1, 1, 12, 0)],
]

column = ['a', 'b', 'c', 'd', 'e']
df_from_lists = spark.createDataFrame(data, column)
df_from_lists.show()


                                                                                

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|1.0|String1|2023-01-01|2023-01-01 12:00:00|
|  1|1.0|String1|2023-01-01|2023-01-01 12:00:00|
|  1|1.0|String1|2023-01-01|2023-01-01 12:00:00|
|  1|1.0|String1|2023-01-01|2023-01-01 12:00:00|
+---+---+-------+----------+-------------------+



23/10/09 23:05:36 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 136600 ms exceeds timeout 120000 ms
23/10/09 23:05:36 WARN SparkContext: Killing executors is not supported by current scheduler.
23/10/09 23:05:36 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.B

Display the DataFrame using the show method

In [6]:
# Print the DataFrame's rows as a plain-text table.
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|1.0|String1|2023-01-01|2023-01-01 12:00:00|
|  2|2.0|String2|2023-01-02|2023-01-02 12:00:00|
|  3|3.0|String3|2023-01-03|2023-01-03 12:00:00|
+---+---+-------+----------+-------------------+



In [8]:
# Turn on eager evaluation for the REPL/notebook. With it enabled, evaluating
# `df` as the last expression of a cell renders its rows instead of only the
# `DataFrame[...]` schema repr shown by the earlier cells.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
df

a,b,c,d,e
1,1.0,String1,2023-01-01,2023-01-01 12:00:00
2,2.0,String2,2023-01-02,2023-01-02 12:00:00
3,3.0,String3,2023-01-03,2023-01-03 12:00:00


In [9]:
# Show only the first row, laid out vertically (one "column | value" line per
# field) rather than as a horizontal table.
df.show(n=1, vertical=True)

-RECORD 0------------------
 a   | 1                   
 b   | 1.0                 
 c   | String1             
 d   | 2023-01-01          
 e   | 2023-01-01 12:00:00 
only showing top 1 row



In [10]:
# List the DataFrame's column names.
df.columns

['a', 'b', 'c', 'd', 'e']

In [12]:
# Print the schema as a tree: each column's name, data type and nullability.
df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [13]:
# Summary statistics (count, mean, stddev, min, max) for columns a, b and c.
(
    df.select('a', 'b', 'c')
    .describe()
    .show()
)

[Stage 8:>                                                          (0 + 4) / 4]

+-------+---+---+-------+
|summary|  a|  b|      c|
+-------+---+---+-------+
|  count|  3|  3|      3|
|   mean|2.0|2.0|   null|
| stddev|1.0|1.0|   null|
|    min|  1|1.0|String1|
|    max|  3|3.0|String3|
+-------+---+---+-------+



                                                                                