In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName('App').getOrCreate()

23/06/05 23:00:34 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
df = spark.read.json('iris.json')
df.show()

+---------------+-----------+----------+-----------+----------+-------+
|_corrupt_record|petalLength|petalWidth|sepalLength|sepalWidth|species|
+---------------+-----------+----------+-----------+----------+-------+
|              [|       null|      null|       null|      null|   null|
|           null|        1.4|       0.2|        5.1|       3.5| setosa|
|           null|        1.4|       0.2|        4.9|       3.0| setosa|
|           null|        1.3|       0.2|        4.7|       3.2| setosa|
|           null|        1.5|       0.2|        4.6|       3.1| setosa|
|           null|        1.4|       0.2|        5.0|       3.6| setosa|
|           null|        1.7|       0.4|        5.4|       3.9| setosa|
|           null|        1.4|       0.3|        4.6|       3.4| setosa|
|           null|        1.5|       0.2|        5.0|       3.4| setosa|
|           null|        1.4|       0.2|        4.4|       2.9| setosa|
|           null|        1.5|       0.1|        4.9|       3.1| 

In [6]:
df.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- petalLength: double (nullable = true)
 |-- petalWidth: double (nullable = true)
 |-- sepalLength: double (nullable = true)
 |-- sepalWidth: double (nullable = true)
 |-- species: string (nullable = true)



In [8]:
df.columns

['_corrupt_record',
 'petalLength',
 'petalWidth',
 'sepalLength',
 'sepalWidth',
 'species']

In [10]:
df.describe().show()

23/06/05 23:18:52 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+---------------+------------------+------------------+------------------+-------------------+---------+
|summary|_corrupt_record|       petalLength|        petalWidth|       sepalLength|         sepalWidth|  species|
+-------+---------------+------------------+------------------+------------------+-------------------+---------+
|  count|              2|               150|               150|               150|                150|      150|
|   mean|           null|3.7580000000000027| 1.199333333333334| 5.843333333333335|  3.057333333333334|     null|
| stddev|           null|1.7652982332594662|0.7622376689603467|0.8280661279778637|0.43586628493669793|     null|
|    min|              [|               1.0|               0.1|               4.3|                2.0|   setosa|
|    max|              ]|               6.9|               2.5|               7.9|                4.4|virginica|
+-------+---------------+------------------+------------------+------------------+--------------

In [11]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [13]:
data_schema = [StructField('petalLength', IntegerType(),True),
               StructField('petalWidth', IntegerType(),True),
               StructField('sepalLength', IntegerType(),True),
               StructField('sepalWidth', IntegerType(),True),
               StructField('species', StringType(),True),]

In [14]:
final_struct = StructType(fields = data_schema)

In [15]:
new_df = spark.read.json('iris.json', schema=final_struct)
new_df.printSchema()

root
 |-- petalLength: integer (nullable = true)
 |-- petalWidth: integer (nullable = true)
 |-- sepalLength: integer (nullable = true)
 |-- sepalWidth: integer (nullable = true)
 |-- species: string (nullable = true)



In [16]:
new_df.select('species').show()

+-------+
|species|
+-------+
|   null|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
| setosa|
+-------+
only showing top 20 rows



In [19]:
df.withColumn('doubleSepalLength', df['sepalLength']*2).show()

+---------------+-----------+----------+-----------+----------+-------+-----------------+
|_corrupt_record|petalLength|petalWidth|sepalLength|sepalWidth|species|doubleSepalLength|
+---------------+-----------+----------+-----------+----------+-------+-----------------+
|              [|       null|      null|       null|      null|   null|             null|
|           null|        1.4|       0.2|        5.1|       3.5| setosa|             10.2|
|           null|        1.4|       0.2|        4.9|       3.0| setosa|              9.8|
|           null|        1.3|       0.2|        4.7|       3.2| setosa|              9.4|
|           null|        1.5|       0.2|        4.6|       3.1| setosa|              9.2|
|           null|        1.4|       0.2|        5.0|       3.6| setosa|             10.0|
|           null|        1.7|       0.4|        5.4|       3.9| setosa|             10.8|
|           null|        1.4|       0.3|        4.6|       3.4| setosa|              9.2|
|         

In [21]:
df.createOrReplaceTempView('iris')

sepal_greater_5 = spark.sql("SELECT * from iris WHERE sepalLength > 5")
sepal_greater_5.show()

+---------------+-----------+----------+-----------+----------+-------+
|_corrupt_record|petalLength|petalWidth|sepalLength|sepalWidth|species|
+---------------+-----------+----------+-----------+----------+-------+
|           null|        1.4|       0.2|        5.1|       3.5| setosa|
|           null|        1.7|       0.4|        5.4|       3.9| setosa|
|           null|        1.5|       0.2|        5.4|       3.7| setosa|
|           null|        1.2|       0.2|        5.8|       4.0| setosa|
|           null|        1.5|       0.4|        5.7|       4.4| setosa|
|           null|        1.3|       0.4|        5.4|       3.9| setosa|
|           null|        1.4|       0.3|        5.1|       3.5| setosa|
|           null|        1.7|       0.3|        5.7|       3.8| setosa|
|           null|        1.5|       0.3|        5.1|       3.8| setosa|
|           null|        1.7|       0.2|        5.4|       3.4| setosa|
|           null|        1.5|       0.4|        5.1|       3.7| 