# Basic Introduction to Spark DataFrames

In [4]:
# Import the SparkSession object
from pyspark.sql.session import SparkSession

In [2]:
# Obtain the SparkSession used by every later cell (created on first call, reused afterwards)
spark = (
    SparkSession.builder
    .appName('df_basics')
    .getOrCreate()
)

In [7]:
# Folder that holds the sample data files (explicit file:// URI)
file_path = "file:///var/lib/spark/jupyter/data/"
# Load people.json into a DataFrame; no schema is passed, so Spark infers one
df = spark.read.json(f"{file_path}people.json")

In [9]:
# Print the DataFrame's rows as a text table
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [10]:
# Print the schema (column name, type, nullability) that Spark inferred when reading the JSON
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [11]:
# Column names, returned as a plain Python list of strings
df.columns

['age', 'name']

In [12]:
# describe() on its own only builds a summary DataFrame; the cell displays its repr, not the statistics
df.describe()

DataFrame[summary: string, age: string, name: string]

In [13]:
# Chain show() onto describe() to actually print the summary statistics (count, mean, stddev, min, max)
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [31]:
# Bring in the type classes needed to declare an explicit schema.
# SparkSession is already imported and `spark` already created in the first cells.
# Calling SparkSession.builder.appName(...).getOrCreate() a second time would only
# hand back that same running session (the application is not renamed), so neither
# the import nor the session creation is repeated here.
from pyspark.sql.types import (StructType, StructField,
                               StringType, IntegerType)

In [36]:
# One StructField per column: (column name, Spark data type, whether nulls are allowed)
data_schema = [
    StructField('age', IntegerType(), True),
    StructField('name', StringType(), True),
]

In [37]:
# Wrap the list of fields in a StructType, the object passed to the reader as the schema
final_struct = StructType(fields=data_schema)

In [40]:
# Read the same JSON file again, this time imposing the explicit schema instead of
# letting Spark infer one.
# Reuse file_path so both reads in this notebook point at the same fully-qualified
# location rather than at a path relative to wherever the kernel was started.
new_df = spark.read.json(file_path + "people.json", schema=final_struct)

In [44]:
# Print the schema to confirm the declared types were applied (age is integer here, not the inferred long)
new_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [45]:
# Summary statistics for the explicitly-typed DataFrame
new_df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [46]:
# Print the rows of the explicitly-typed DataFrame
new_df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [48]:
# Indexing by column name returns a Column object rather than the column's values
df['age']

Column<'age'>

In [49]:
# Check the Python type of a column reference
type(df['age'])

pyspark.sql.column.Column

In [51]:
# select() returns a new DataFrame holding only the requested column; show() prints it
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [53]:
# The result of select() is itself a DataFrame, i.e. not a Column
type(df.select('age'))

pyspark.sql.dataframe.DataFrame

In [54]:
# head(n) returns the first n rows as a Python list of Row objects
df.head(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [55]:
# Index into the list returned by head() to get a single Row (here the first one)
df.head(2)[0]

Row(age=None, name='Michael')

In [57]:
# Check the Python type of a single row element
type(df.head(2)[0])

pyspark.sql.types.Row

In [58]:
# Project several columns at once by passing each column name to select()
df.select('age', 'name').show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [68]:
# Add a derived column with withColumn. This is NOT an in-place operation:
# it returns a new DataFrame and leaves df itself unchanged.
(
    df
    .withColumn('half_age', df['age'] / 2)
    .show()
)

+----+-------+--------+
| age|   name|half_age|
+----+-------+--------+
|null|Michael|    null|
|  30|   Andy|    15.0|
|  19| Justin|     9.5|
+----+-------+--------+



In [66]:
# Show df again: the original DataFrame still has only its two columns (no half_age)
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [67]:
# withColumnRenamed returns a new DataFrame with the column renamed; df itself keeps 'age'
df.withColumnRenamed('age', 'new_age').show()

+-------+-------+
|new_age|   name|
+-------+-------+
|   null|Michael|
|     30|   Andy|
|     19| Justin|
+-------+-------+



In [69]:
# Register the DataFrame as a temporary view named 'people' so it can be queried with SQL
df.createOrReplaceTempView('people')


In [70]:
# Run a SQL query against the view and keep the result (a DataFrame) in `result`.
# The view was registered as 'people' but is referenced as PEOPLE here; the lookup
# still resolves, as the rows printed from `result` show.
result = spark.sql('SELECT * FROM PEOPLE')

In [71]:
# Print the rows returned by the SQL query
result.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [74]:
# A richer query: keep only the rows with age 30 and rank them within each name
# (highest age first) using a window function.
# The SQL text is split across adjacent literals purely for readability;
# Python joins them into the same single string.
new_results = spark.sql(
    'SELECT name, age, '
    'rank() over(partition by name order by age desc) as rn '
    'FROM PEOPLE '
    'WHERE AGE = 30'
)

In [76]:
# Print the filtered, ranked rows
new_results.show()

+----+---+---+
|name|age| rn|
+----+---+---+
|Andy| 30|  1|
+----+---+---+

