In [4]:
# Bare expression: the notebook displays the repr of `sc`.
# NOTE(review): `sc` is not defined in the visible cells — presumably the SparkContext
# pre-created by the PySpark kernel/shell; confirm before running on a plain kernel.
sc

In [5]:
# Bare expression: the notebook displays the repr of `spark`, the object used below
# for `spark.read` and `spark.sql`.
# NOTE(review): `spark` is not defined in the visible cells — presumably the SparkSession
# pre-created by the PySpark kernel/shell; confirm before running on a plain kernel.
spark

In [6]:
# Load the People dataset from the local filesystem as JSON; no schema is supplied
# here, so the column types are left to the reader to infer.
people_df = spark.read.format("json").load("file:///home/hadoop/Downloads/People.json")

In [7]:
# Preview the first 5 rows of the freshly loaded DataFrame.
people_df.show(n=5)

+---------+-----------+----------+------+---+---------+------+
|     city|    country|first_name|gender| id|last_name|salary|
+---------+-----------+----------+------+---+---------+------+
|Mulyosari|  Indonesia|     Valma|Female|  1|     Sans|983107|
|  Niihama|      Japan|     Paolo|  Male|  2|   Kiddie|649173|
|Dū Qal‘ah|Afghanistan|    Miltie|  Male|  3| De Zuani|352898|
|   Iberia|       Peru|    Jarrid|  Male|  4| Dalziell|170398|
| La Ronge|     Canada| Reinaldos|  Male|  5|   Keeffe|440989|
+---------+-----------+----------+------+---+---------+------+
only showing top 5 rows



In [8]:
# Print each column's name, type and nullability for the DataFrame read without an explicit schema.
people_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: long (nullable = true)



### 1. Create a User-Defined Schema for the DataFrame Fields

In [9]:
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType,FloatType,StringType,StructType,StructField

In [10]:
# Explicit schema for the People dataset: fixes the column order and declares
# `id` as integer and `salary` as float; every field is nullable.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", FloatType(), True)
    .add("city", StringType(), True)
    .add("country", StringType(), True)
)

In [11]:
# Re-read the same file, this time applying the explicit `schema` instead of
# letting Spark infer one. This rebinds `people_df`.
people_df = spark.read.json("file:///home/hadoop/Downloads/People.json", schema=schema)

In [12]:
# Print the schema again to confirm the explicit `schema` (column order and types) was applied.
people_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)



In [13]:
# Preview the first 5 rows read with the explicit schema.
people_df.show(n=5)

+---+----------+---------+------+--------+---------+-----------+
| id|first_name|last_name|gender|  salary|     city|    country|
+---+----------+---------+------+--------+---------+-----------+
|  1|     Valma|     Sans|Female|983107.0|Mulyosari|  Indonesia|
|  2|     Paolo|   Kiddie|  Male|649173.0|  Niihama|      Japan|
|  3|    Miltie| De Zuani|  Male|352898.0|Dū Qal‘ah|Afghanistan|
|  4|    Jarrid| Dalziell|  Male|170398.0|   Iberia|       Peru|
|  5| Reinaldos|   Keeffe|  Male|440989.0| La Ronge|     Canada|
+---+----------+---------+------+--------+---------+-----------+
only showing top 5 rows



In [14]:
# multiLine=True lets the JSON reader parse records that span several lines
# (without it, each line of the file must hold one complete JSON record).
bank_data = (
    spark.read
    .option("multiLine", True)
    .json("file:///home/hadoop/Downloads/bank_edited.json")
)

In [15]:
# Preview the first 5 rows of the bank dataset.
bank_data.show(n=5)

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan|marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no|married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no| single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes|married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no|married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown|  no| si

In [16]:
# Print each column's name, type and nullability for the bank dataset (read without an explicit schema).
bank_data.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



### 2. Type-Casting a Column

In [17]:
# Cast `age` to int. withColumn returns a new DataFrame and nothing is assigned here,
# so this cell only displays the result; `bank_data` is not rebound.
bank_data.withColumn("age", bank_data["age"].cast("int"))

DataFrame[age: int, balance: bigint, campaign: bigint, contact: string, day: bigint, default: string, duration: bigint, education: string, housing: string, job: string, loan: string, marital: string, month: string, pdays: bigint, poutcome: string, previous: bigint, y: string]

### 3. Creating a New Column from Two String Columns

In [18]:
from pyspark.sql.functions import concat

# Column expression: first name, a single space, then last name.
full_name = concat(col('first_name'), lit(" "), col('last_name'))

# Only displayed here; assign the withColumn(...) result back to a variable
# if the new column should be kept.
people_df.withColumn('Full_Name', full_name).show(5)


+---+----------+---------+------+--------+---------+-----------+----------------+
| id|first_name|last_name|gender|  salary|     city|    country|       Full_Name|
+---+----------+---------+------+--------+---------+-----------+----------------+
|  1|     Valma|     Sans|Female|983107.0|Mulyosari|  Indonesia|      Valma Sans|
|  2|     Paolo|   Kiddie|  Male|649173.0|  Niihama|      Japan|    Paolo Kiddie|
|  3|    Miltie| De Zuani|  Male|352898.0|Dū Qal‘ah|Afghanistan| Miltie De Zuani|
|  4|    Jarrid| Dalziell|  Male|170398.0|   Iberia|       Peru| Jarrid Dalziell|
|  5| Reinaldos|   Keeffe|  Male|440989.0| La Ronge|     Canada|Reinaldos Keeffe|
+---+----------+---------+------+--------+---------+-----------+----------------+
only showing top 5 rows



### 4.Renaming Existing Column

In [19]:
# Rename the `salary` column to `income`; the result is assigned back to `people_df`,
# so later cells see the new column name.
people_df = people_df.withColumnRenamed(existing="salary", new="income")

In [20]:
# Bare expression: displays the DataFrame's repr (column names and types) to confirm the rename.
people_df

DataFrame[id: int, first_name: string, last_name: string, gender: string, income: float, city: string, country: string]

### 5.Limit

In [21]:
# limit(5) returns a new DataFrame with at most 5 rows; show() then prints it.
first_five = people_df.limit(5)
first_five.show()

+---+----------+---------+------+--------+---------+-----------+
| id|first_name|last_name|gender|  income|     city|    country|
+---+----------+---------+------+--------+---------+-----------+
|  1|     Valma|     Sans|Female|983107.0|Mulyosari|  Indonesia|
|  2|     Paolo|   Kiddie|  Male|649173.0|  Niihama|      Japan|
|  3|    Miltie| De Zuani|  Male|352898.0|Dū Qal‘ah|Afghanistan|
|  4|    Jarrid| Dalziell|  Male|170398.0|   Iberia|       Peru|
|  5| Reinaldos|   Keeffe|  Male|440989.0| La Ronge|     Canada|
+---+----------+---------+------+--------+---------+-----------+



### 6.OrderBy()
* Arrange data by ascending & descending order.

In [22]:
# Sort by income, lowest first, and show the first 5 rows.
people_df.orderBy(people_df["income"].asc()).show(n=5)

+---+----------+---------+------+-------+------------+---------+
| id|first_name|last_name|gender| income|        city|  country|
+---+----------+---------+------+-------+------------+---------+
| 93|      Cory|    Prigg|  Male|12876.0|     Gondang|Indonesia|
|590|      Flem| Tumielli|  Male|13347.0| Debre Zeyit| Ethiopia|
|192|       Odo|  Conyers|  Male|15555.0|  Raffingora| Zimbabwe|
|407|  Barbabas|Ballingal|  Male|18598.0|Beringinjaya|Indonesia|
|297|     Daron|   Melato|Female|19881.0|      Phayao| Thailand|
+---+----------+---------+------+-------+------------+---------+
only showing top 5 rows



In [23]:
# Sort by country A→Z, and within each country by income from highest to lowest.
by_country_then_income = people_df.orderBy(
    people_df["country"].asc(),
    people_df["income"].desc(),
)
by_country_then_income.show()

+---+----------+------------+------+--------+------------------+--------------+
| id|first_name|   last_name|gender|  income|              city|       country|
+---+----------+------------+------+--------+------------------+--------------+
|490|  Cathlene|    Gatfield|Female|981605.0|           Mīrābād|   Afghanistan|
|448|      Yuri|     Duggary|  Male|414107.0|     Sang-e Māshah|   Afghanistan|
|  3|    Miltie|    De Zuani|  Male|352898.0|         Dū Qal‘ah|   Afghanistan|
|155|    Guntar|    Langmuir|  Male|290613.0|             Khōst|   Afghanistan|
|983|      Tiff|     Dreakin|Female|208548.0|             Āsmār|   Afghanistan|
|290|     Myles|      Britch|  Male|191508.0|         Dū Laīnah|   Afghanistan|
|419|   Ezekiel|   Fleetwood|  Male|163113.0|      Barakī Barak|   Afghanistan|
|701|    Gerrie|      Heigho|  Male|503327.0|             Föglö| Aland Islands|
|674|    Ludwig|    Bothwell|  Male|825171.0|         Martanesh|       Albania|
|421|    Hamnet|     Maruska|  Male|1296

### 7. Temporary View
* createOrReplaceTempView() - registers the DataFrame as a temporary view (it is not materialized) so that it can be queried with Spark SQL.

In [24]:
# Register `bank_data` under the view name 'bankdata' so the spark.sql(...) queries below can select from it.
bank_data.createOrReplaceTempView('bankdata')

In [25]:
# Query the temporary view with plain SQL and preview the first 5 rows.
all_bank_rows = spark.sql("select * from bankdata")
all_bank_rows.show(5)

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan|marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no|married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no| single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes|married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no|married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown|  no| si

### Show the Top 10 Youngest Employees with Maximum Balance

In [44]:
# For each age, take the maximum balance; keep the 10 lowest ages.
youngest_max_balance = spark.sql("SELECT age, MAX(balance) FROM bankdata GROUP BY age ORDER BY age ASC LIMIT 10")
youngest_max_balance.show()

+---+------------+
|age|max(balance)|
+---+------------+
| 18|        1944|
| 19|        5368|
| 20|        8860|
| 21|        8278|
| 22|       10971|
| 23|       19690|
| 24|       23878|
| 25|       16874|
| 26|       24299|
| 27|       24025|
+---+------------+



### Show the Worst 5 Job Types by Minimum Balance

In [53]:
# For each job type, take the minimum balance; keep the 5 job types with the lowest minimum.
# The view name is spelled exactly as registered ('bankdata'): the previous 'bankData'
# spelling only resolved because Spark SQL identifiers are case-insensitive by default
# (spark.sql.caseSensitive=false).
spark.sql("SELECT job, MIN(balance) as min_bal FROM bankdata GROUP BY job ORDER BY min_bal ASC LIMIT 5").show()

+-------------+-------+
|          job|min_bal|
+-------------+-------+
|  blue-collar|  -8019|
|   management|  -6847|
|self-employed|  -3313|
|   technician|  -2827|
|     services|  -2122|
+-------------+-------+

