In [1]:
# !pip install findspark

In [2]:
# Locate the local Spark installation first, so that the pyspark import
# below can be resolved from this kernel.
import findspark
findspark.init()
import pyspark

# Last expression of the cell: display the Spark home that findspark resolved.
findspark.find()

'C:\\apache-spark\\spark-3.0.0-bin-hadoop2.7'

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Start (or reuse) the Spark session that every later cell works through.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [11]:
# Read the JSON file into a DataFrame; no schema is given, so Spark infers one.
df = spark.read.json('./datasets/people.json')

In [13]:
df.show()  # print the rows as a text table

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [14]:
df.printSchema()  # column names, inferred types and nullability

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [15]:
  df.columns  # column names as a plain Python list

['age', 'name']

In [16]:
df.describe()  # returns a new DataFrame of summary statistics; nothing is displayed yet

DataFrame[summary: string, age: string, name: string]

In [17]:
df.describe().show()  # count, mean, stddev, min and max for each column

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



*********** 

To manually set the DataFrame schema:

In [45]:
# Import the type classes needed to describe a schema by hand
from pyspark.sql.types import (StructField,StringType,
                               IntegerType,StructType)

In [46]:
# One StructField per column: (column name, data type, nullable)
dataSchema = [
    StructField('age', IntegerType(), True),
    StructField('name', StringType(), True),
]

In [47]:
# Wrap the field list in a StructType, the object the reader accepts as a schema
finalStruct = StructType(dataSchema)

In [48]:
# Re-read the same file, this time applying the hand-built schema
# instead of letting Spark infer the types.
df = spark.read.json(
    './datasets/people.json',
    schema=finalStruct,
)

In [49]:
df  # the repr lists column names and types: age is now int

DataFrame[age: int, name: string]

In [50]:
df.printSchema()  # confirm the explicit schema was applied

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



**** 

To get a column (indexing returns a Column object; use select() to get a DataFrame):

In [51]:
df['age']  # indexing returns a Column object, not a DataFrame, so there are no rows to show

Column<b'age'>

In [52]:
type(df['age'])  # confirms that indexing gives a Column object

pyspark.sql.column.Column

In [53]:
df.select('age')  # select() returns a DataFrame with one column; call .show() to see the rows

DataFrame[age: int]

In [54]:
df.select('age').show()  # display the single-column DataFrame

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [55]:
df.head(2)  # first two rows, returned as a list of Row objects

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [56]:
df.head(2)[0]  # index into the list to get a single Row

Row(age=None, name='Michael')

In [57]:
type(df.head(2)[0])  # each element of the list is a Row

pyspark.sql.types.Row

In [58]:
df.select(['age','name'])  # pass a list of names to select several columns; the result is a DataFrame

DataFrame[age: int, name: string]

In [59]:
df.select(['age','name']).show()  # display the selected columns

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [60]:
# withColumn returns a new DataFrame with the extra column; it is not an in-place operation
df.withColumn('newage',df['age']*2).show()

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    60|
|  19| Justin|    38|
+----+-------+------+



In [61]:
df.show()  # df itself is unchanged: it still has no 'newage' column

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [62]:
# Keep the result by binding the returned DataFrame to a name
v = df.withColumn('newage', df['age'] * 2)

In [63]:
v.show()  # v carries the extra 'newage' column

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    60|
|  19| Justin|    38|
+----+-------+------+



In [64]:
# Rename a column: withColumnRenamed(existing name, new name) returns a new DataFrame
v.withColumnRenamed('newage','doubleage').show()

+----+-------+---------+
| age|   name|doubleage|
+----+-------+---------+
|null|Michael|     null|
|  30|   Andy|       60|
|  19| Justin|       38|
+----+-------+---------+



In [66]:
v.show()  # v still has 'newage': the rename is not an in-place operation

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    60|
|  19| Justin|    38|
+----+-------+------+



In [67]:
# Register the DataFrame as a temporary view named 'people' so it can be queried with SQL
df.createOrReplaceTempView('people')

In [68]:
# Run a SQL query against the temp view; the result comes back as a DataFrame
results = spark.sql("Select * from people")

In [69]:
results.show()  # same rows as df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [70]:
# Filter with a SQL WHERE clause; the row whose age is null is dropped
# because `null > 18` does not evaluate to true.
result2 = spark.sql("Select * from people where age > 18")

In [71]:
result2.show()  # only the rows with a non-null age above 18 remain

+---+------+
|age|  name|
+---+------+
| 30|  Andy|
| 19|Justin|
+---+------+

