In [0]:
# Build a one-column DataFrame of five consecutive ids and show its schema.
df1 = spark.range(0, 5)
df1.printSchema()

root
 |-- id: long (nullable = false)



In [0]:
from datetime import date, datetime

# Sample rows used throughout the notebook. Each tuple holds, in order:
# an int, a float, a string, a date and a datetime.
data_list = [
    (1, 2.0, "string1", date(2022, 1, 1), datetime(2022, 1, 1, 12, 0)),
    (2, 3.0, "string2", date(2022, 2, 1), datetime(2022, 1, 2, 12, 0)),
    (3, 4.0, "string3", date(2022, 3, 1), datetime(2022, 1, 2, 12, 0)),
]


# Create DF from a list with automatic schema detection

In [0]:
# No schema is supplied, so Spark infers the column types from the Python
# values in data_list; toDF then assigns the column names a..e.
df1 = (
    spark.createDataFrame(data_list)
    .toDF("a", "b", "c", "d", "e")
)
display(df1)

a,b,c,d,e
1,2.0,string1,2022-01-01,2022-01-01T12:00:00.000+0000
2,3.0,string2,2022-02-01,2022-01-02T12:00:00.000+0000
3,4.0,string3,2022-03-01,2022-01-02T12:00:00.000+0000


In [0]:
# Show the column types Spark inferred for df1 (no explicit schema was given).
df1.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)




# Enforce Schema

In [0]:
# Column names only: the types are still inferred from the data.
schema_1 = [
    "a",
    "b",
    "c",
    "d",
    "e",
]
# DDL-style string: fixes both the name and the type of every column.
schema_2 = (
    "a int, "
    "b double, "
    "c string, "
    "d date, "
    "e timestamp"
)

In [0]:
# A list of names sets the column names only; column a is still inferred as long.
spark.createDataFrame(data_list, schema=schema_1).printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [0]:
# The DDL string enforces the declared types, so column a becomes integer.
spark.createDataFrame(data_list, schema=schema_2).printSchema()

root
 |-- a: integer (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)




# Using Row

In [0]:
from pyspark.sql import Row

# Row objects carry their own field names, so no separate schema or toDF call
# is needed to name the columns a..e.
row_list = [
    Row(
        a=1, b=2.0, c="string1",
        d=date(2022, 1, 1), e=datetime(2022, 1, 1, 12, 0),
    ),
    Row(
        a=2, b=3.0, c="string2",
        d=date(2022, 2, 1), e=datetime(2022, 2, 1, 12, 0),
    ),
    Row(
        a=3, b=4.0, c="string3",
        d=date(2022, 3, 1), e=datetime(2022, 3, 1, 12, 0),
    ),
]

spark.createDataFrame(row_list).printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)




# Using Pandas DF

In [0]:
import pandas as pd

# Single integer column named 'a' with the values 1, 2, 3.
pd_df = pd.DataFrame(data=[1, 2, 3], columns=['a'])
# A pandas DataFrame can be passed directly; column names are taken from it.
spark.createDataFrame(data=pd_df).printSchema()

root
 |-- a: long (nullable = true)




# Using RDD

In [0]:
# createDataFrame needs an RDD whose elements are row-like (tuples, lists or
# Rows). The previous RDD held three bare scalars (1, 2., 'string1'), so no row
# schema could be inferred and the data could not match the five column names
# in schema_1. Reuse data_list, whose tuples each carry the five fields a..e.
rdd = spark.sparkContext.parallelize(data_list)
spark.createDataFrame(rdd, schema_1).printSchema()