In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [2]:
# Obtain the Spark session used by every cell below; the application is named 'basic'.
spark = (
    SparkSession.builder
    .appName('basic')
    .getOrCreate()
)

## Creating a DataFrame (empty or from in-memory data)

### The `parallelize` way

In [3]:
# Sample input: a list of tuples, where each tuple becomes one row.
data = [
    ("This", "is only", "a test!"),
    ("And this", "is", "too"),
]

# Parallelize the list into an RDD, convert it to a DataFrame and print it.
# No column names are supplied here, so the columns show up as _1, _2, _3.
(
    spark.sparkContext
    .parallelize(data)
    .toDF()
    .show()
)


+--------+-------+-------+
|      _1|     _2|     _3|
+--------+-------+-------+
|    This|is only|a test!|
|And this|     is|    too|
+--------+-------+-------+



In [4]:
# Column names to apply, listed in the same order as the tuple fields.
columns = [
    "colA",
    "colB",
    "colC",
]

# Same conversion as before, but toDF() now receives the explicit names.
(
    spark.sparkContext
    .parallelize(data)
    .toDF(columns)
    .show()
)

+--------+-------+-------+
|    colA|   colB|   colC|
+--------+-------+-------+
|    This|is only|a test!|
|And this|     is|    too|
+--------+-------+-------+



### The `createDataFrame` way

In [5]:
# At least an empty schema is required first (a StructType with no fields).
schema = StructType([])

# Combine it with an empty RDD: the resulting DataFrame has no columns and
# no rows, so it is not useful on its own.
spark.createDataFrame(
    spark.sparkContext.emptyRDD(),
    schema=schema,
).show()

++
||
++
++



In [11]:
# Schema for the sample rows in `data`: three string columns, each declared
# with nullable=False (the third StructField argument).
schema = StructType(
    [StructField(name, StringType(), False) for name in ("colA", "colB", "colC")]
)

# Build the DataFrame straight from the in-memory list with that schema and print it.
spark.createDataFrame(data, schema=schema).show()


+--------+-------+-------+
|    colA|   colB|   colC|
+--------+-------+-------+
|    This|is only|a test!|
|And this|     is|    too|
+--------+-------+-------+



In [12]:
# Minimal form: the rows are written inline (as lists this time) and combined
# with the `schema` defined above; the result is kept in `df`.
df = spark.createDataFrame(
    [
        ["This", "is only", "a test"],
        ["And this", "is", "too"],
    ],
    schema,
)
df.show()

+--------+-------+------+
|    colA|   colB|  colC|
+--------+-------+------+
|    This|is only|a test|
|And this|     is|   too|
+--------+-------+------+



## Dataframes from CSV

In [13]:
# First write the DataFrame stored in `df` out as CSV so there is something to read.
file='/users/hduser/spark-ref/dataframes/examples/example-001.csv'

# mode='overwrite' replaces whatever a previous run left at this path. With the
# default save mode the write raises an error when the path already exists, so
# the cell could only ever be executed once.
df.write.csv(file, sep=',', header=True, mode='overwrite')


# Read the CSV back from the same path (HDFS, per the original note), using
# the first line of the file as the column names.
spark.read.csv(file,
               sep=',',
               encoding='utf-8',
               header=True).show()

+--------+-------+------+
|    colA|   colB|  colC|
+--------+-------+------+
|    This|is only|a test|
|And this|     is|   too|
+--------+-------+------+

