In [8]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('RDD').getOrCreate()
# Creates Empty RDD
emptyRDD = spark.sparkContext.emptyRDD()
print(emptyRDD)

# Creates Empty RDD using parallelize
rdd2 = spark.sparkContext.parallelize([])
print(rdd2)
## Create Empty DataFrame with Schema (StructType)
# Create Schema
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
spark = SparkSession.builder.appName('DataFrame').getOrCreate()
schema1 = StructType([
StructField('firstname', StringType(), True),
StructField('middlename', StringType(), True),
StructField('lastname', StringType(), True),
StructField('age',IntegerType(),True)
])

# Create empty DataFrame from empty RDD
df = spark.createDataFrame(emptyRDD, schema)
df.printSchema()


# Inserting sample data
input1 = [('Ram','Krishna','M',30),
       ('Sita','Ram','M',23)]
df1 = spark.createDataFrame(data=input1, schema=schema1)
df1.printSchema()
df1.show()


EmptyRDD[26] at emptyRDD at NativeMethodAccessorImpl.java:0
ParallelCollectionRDD[27] at readRDDFromFile at PythonRDD.scala:274
root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- age: integer (nullable = true)

+---------+----------+--------+---+
|firstname|middlename|lastname|age|
+---------+----------+--------+---+
|      Ram|   Krishna|       M| 30|
|     Sita|       Ram|       M| 23|
+---------+----------+--------+---+



In [9]:
# Convert empty RDD to DataFrame
dfe = emptyRDD.toDF(schema)
dfe.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)



In [11]:
# Create empty DataFrame directly
df2 = spark.createDataFrame([], schema)
df2.printSchema()


# Create empty DataFrame with no schema (no columns)
df3 = spark.createDataFrame([], StructType([]))
df3.printSchema()


root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)

root



In [12]:
#create dataframe using RDD
dept = [("Finance", 10),("Marketing", 20), ("Sales", 30), ("IT", 40)]
rdd = spark.sparkContext.parallelize(dept)
df = rdd.toDF()
df.printSchema()
df.show(truncate = False)

In [15]:
#adding column names to data
deptColumns = ["dept_name", "dept_id"]
df2 = rdd.toDF(deptColumns)
df2.show(truncate = False)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



In [16]:
# Convert DataFrames to Pandas
data = [("James", "", "Smith", "36636", "M", 60000),
("Michael", "Rose", "", "40288", "M", 70000),
("Robert","", "Williams", "42114", "", 400000),
("Jen", "Mary", "Brown", "", "F", 0)]
columns = ["first_name", "middle_name", "last_name", "dob", "gender", "salary"]
df=spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show()

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|  dob|gender|salary|
+----------+-----------+---------+-----+------+------+
|     James|           |    Smith|36636|     M| 60000|
|   Michael|       Rose|         |40288|     M| 70000|
|    Robert|           | Williams|42114|      |400000|
|       Jen|       Mary|    Brown|     |     F|     0|
+----------+-----------+---------+-----+------+------+



In [18]:
conv = df.toPandas()
print(conv)

  first_name middle_name last_name    dob gender  salary
0      James                 Smith  36636      M   60000
1    Michael        Rose            40288      M   70000
2     Robert              Williams  42114         400000
3        Jen        Mary     Brown             F       0


In [19]:
# Converting Struct Schema to Pandas

dataStruct = [(("James", "", "Smith"), "36636", "M", "3000"), \
(("Michael", "Rose", ""), "40288", "M", "4000"), \
(("Robert", "", "Williams"), "42114", "M", "4000"), \
(("Marie", "Anne", "Jones"), "39192", "F", "4000"), \
(("Jen", "Mary", "Brown"), "", "F", "-1") \
]
schemaStruct = StructType([
StructField('name', StructType([
StructField('firstname', StringType(), True),
StructField('middlename', StringType(), True),
StructField('lastname', StringType(), True)
])),
StructField('dob', StringType(), True),
StructField('gender', StringType(), True),
StructField('salary', StringType(), True)
])
df = spark.createDataFrame(data = dataStruct, schema = schemaStruct)
df.printSchema()
pandasDF2 = df.toPandas()
print(pandasDF2)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)

                   name    dob gender salary
0      (James, , Smith)  36636      M   3000
1     (Michael, Rose, )  40288      M   4000
2  (Robert, , Williams)  42114      M   4000
3  (Marie, Anne, Jones)  39192      F   4000
4    (Jen, Mary, Brown)             F     -1
