In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Echo the SparkSession handle as this cell's output.
# NOTE(review): `spark` is never created in the visible notebook — presumably it is
# injected by the notebook runtime; confirm before running elsewhere.
spark

In [0]:
# Load the employee CSV into a DataFrame.
#   header      -> use the first line of the file as column names
#   inferschema -> let Spark derive column types from the data instead of all-string
#   mode        -> PERMISSIVE keeps malformed rows rather than failing the read
# NOTE(review): the preview below shows a column literally named `nominee\` and two
# rows whose nominee is `India` — looks like the file has a stray backslash and
# unquoted commas in the address field; confirm against the raw CSV.
employee_df = (
    spark.read
    .format("csv")
    .options(header="true", inferschema="true", mode="PERMISSIVE")
    .load("/FileStore/tables/employee.csv")
)

# Preview the first five rows.
employee_df.show(5)

+---+--------+---+------+------------+---------+
| id|    name|age|salary|     address| nominee\|
+---+--------+---+------+------------+---------+
|  1|  Manish| 26| 75000|       bihar|nominee1\|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2\|
|  3|  Pritam| 22|150000|   Bangalore|    India|
|  4|Prantosh| 17|200000|     Kolkata|    India|
|  5|  Vikash| 31|300000|        null| nominee5|
+---+--------+---+------+------------+---------+



In [0]:
# Print the DataFrame's schema as a tree (column name, type, nullability).
# The integer types shown come from the `inferschema` option set on the read;
# note the last column's name carries a trailing backslash (`nominee\`).
employee_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee\: string (nullable = true)



## Multiple ways to select a column

### 1. String method

In [0]:
# Simplest form: refer to the column by its name as a plain string.
employee_df.select("name").show()

+--------+
|    name|
+--------+
|  Manish|
|  Nikita|
|  Pritam|
|Prantosh|
|  Vikash|
+--------+



### 2. Column method

In [0]:
# Same projection, but passing a Column object built with col()
# (col is in scope via the star import from pyspark.sql.functions).
employee_df.select(col("name")).show()

+--------+
|    name|
+--------+
|  Manish|
|  Nikita|
|  Pritam|
|Prantosh|
|  Vikash|
+--------+



## Operations on a column

In [0]:
# Column objects support arithmetic: adding a literal yields a derived column,
# which Spark names after the expression ("(id + 5)").
employee_df.select(col("id") + 5).show()

+--------+
|(id + 5)|
+--------+
|       6|
|       7|
|       8|
|       9|
|      10|
+--------+



### Select multiple columns

In [0]:
# Several columns at once: pass each name as its own string argument.
# The output columns follow the argument order, not the source schema order.
employee_df.select("name", "salary", "age").show()

+--------+------+---+
|    name|salary|age|
+--------+------+---+
|  Manish| 75000| 26|
|  Nikita|100000| 23|
|  Pritam|150000| 22|
|Prantosh|200000| 17|
|  Vikash|300000| 31|
+--------+------+---+



In [0]:
# Multiple columns again, this time each one wrapped in col().
employee_df.select(col("name"), col("age")).show()

+--------+---+
|    name|age|
+--------+---+
|  Manish| 26|
|  Nikita| 23|
|  Pritam| 22|
|Prantosh| 17|
|  Vikash| 31|
+--------+---+



In [0]:
# Four different column-reference styles mixed in a single select():
#   "name"                 -> plain string
#   col("age")             -> Column built with col()
#   employee_df['salary']  -> bracket indexing on the DataFrame
#   employee_df.address    -> attribute access on the DataFrame
employee_df.select("name", col("age"), employee_df['salary'], employee_df.address).show()

+--------+---+------+------------+
|    name|age|salary|     address|
+--------+---+------+------------+
|  Manish| 26| 75000|       bihar|
|  Nikita| 23|100000|uttarpradesh|
|  Pritam| 22|150000|   Bangalore|
|Prantosh| 17|200000|     Kolkata|
|  Vikash| 31|300000|        null|
+--------+---+------+------------+



### Expression

In [0]:
# expr() takes the expression as a SQL string instead of Python operators.
# The uneven spacing inside the string does not matter: the result column is
# still named "(id + 5)", the same as the col("id")+5 cell earlier.
employee_df.select(expr("id+ 5 ")).show()

+--------+
|(id + 5)|
+--------+
|       6|
|       7|
|       8|
|       9|
|      10|
+--------+



In [0]:
# Aliasing inside the SQL string: `as` renames the output column to employee_ID.
employee_df.select(expr("id as employee_ID ")).show()

+-----------+
|employee_ID|
+-----------+
|          1|
|          2|
|          3|
|          4|
|          5|
+-----------+



## Spark SQL

In [0]:
# Register the DataFrame under the view name "emp_table" (replacing any existing
# view of that name) so it can be referenced from SQL queries.
employee_df.createOrReplaceTempView("emp_table")

In [0]:
# Query the registered view with plain SQL. The output is the same table that
# employee_df.show(5) printed after the load.
spark.sql("""
          select * from emp_table
          """).show()

+---+--------+---+------+------------+---------+
| id|    name|age|salary|     address| nominee\|
+---+--------+---+------+------------+---------+
|  1|  Manish| 26| 75000|       bihar|nominee1\|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2\|
|  3|  Pritam| 22|150000|   Bangalore|    India|
|  4|Prantosh| 17|200000|     Kolkata|    India|
|  5|  Vikash| 31|300000|        null| nominee5|
+---+--------+---+------+------------+---------+

