In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the notebook's Spark session under the application name 'Basics'.
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [4]:
# Load the sample JSON file into a DataFrame; no schema is supplied here,
# so the column types are whatever Spark derives from the file.
df = spark.read.json(path='data/people.json')

In [7]:
# Render the DataFrame as a text table to inspect the loaded rows.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [12]:
# Print each column's name, data type, and nullability for df.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [8]:
######## Creating a custom schema (if Spark does not infer the data types)
from pyspark.sql.types import StructField,IntegerType,StringType,StructType

In [11]:
# One StructField per column: column name, Spark data type, and whether
# null values are allowed.
data_schema = [
    StructField('age', IntegerType(), nullable=True),
    StructField('name', StringType(), nullable=True),
]

# Wrap the field list in a StructType so it can be handed to a reader.
final_schema = StructType(data_schema)
print(final_schema)

StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))


In [13]:
# Read the same file again, this time with the explicit schema attached to
# the reader, then confirm the resulting column types.
df_new = spark.read.schema(final_schema).json('data/people.json')
df_new.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [17]:
############ Selecting columns and rows from DataFrames
### Columns
# 1) Extract a column: indexing by name gives a Column object
col_age = df['age']
print(type(col_age), col_age, sep='\n')

# 2) Extract a column as a DataFrame (more useful): select() gives a
#    DataFrame that can be shown directly
df_age = df.select('age')
print(type(df_age))
df_age.show()

<class 'pyspark.sql.column.Column'>
Column<'age'>
<class 'pyspark.sql.dataframe.DataFrame'>
+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [18]:
# selecting multiple columns
# df_mul = df.select(['age','name'])
# df_all = df.select('*')

In [21]:
#### Rows
# head(2) gives the first two rows back as a plain Python list of Row objects.
rows_ = df.head(n=2)

# Show the container type, the whole list, and then its first Row.
print(type(rows_), rows_, rows_[0], sep='\n')

<class 'list'>
[Row(age=None, name='Michael'), Row(age=30, name='Andy')]
Row(age=None, name='Michael')


In [23]:
####### Column ops
# 1) Creating a new column: 'new_age' holds twice the value of 'age'.
#    The result is bound to df1; df is not reassigned.
df1 = df.withColumn(
    colName='new_age',
    col=df['age'] * 2,
)
df1.show()

+----+-------+-------+
| age|   name|new_age|
+----+-------+-------+
|null|Michael|   null|
|  30|   Andy|     60|
|  19| Justin|     38|
+----+-------+-------+



In [24]:
# 2) Renaming columns: 'new_age' is displayed as 'double_age'.
#    The renamed result is only shown, not assigned, so df1 is not rebound.
df1.withColumnRenamed(
    existing='new_age',
    new='double_age',
).show()

+----+-------+----------+
| age|   name|double_age|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        60|
|  19| Justin|        38|
+----+-------+----------+



In [29]:
############### SPARK SQL #############
# Register df under the view name 'people' so SQL text can refer to it.
df.createOrReplaceTempView(name='people')

# Run the query through the session; the result comes back as a DataFrame.
res = spark.sql(sqlQuery="SELECT * FROM people WHERE age=30")
res.show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

