In [1]:
from pyspark.sql import SparkSession
# Reuse the active SparkSession if one exists; otherwise start one named 'practice'.
spark = (
    SparkSession.builder
    .appName('practice')
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType
from datetime import datetime

In [3]:
from pyspark.sql.functions import col,expr

In [4]:
# Explicit schema for the employee rows. Only emp_id and name are mandatory;
# the remaining fields may be null.
employee_schema = (
    StructType()
    .add("emp_id", IntegerType(), nullable=False)
    .add("name", StringType(), nullable=False)
    .add("department", StringType(), nullable=True)
    .add("salary", FloatType(), nullable=True)
    .add("join_date", DateType(), nullable=True)
)


In [6]:
# Sample rows as (emp_id, name, department, salary, join_date) tuples.
# emp_id is the 1-based position of the record; join_date is built from a
# (year, month, day) triple.
employee_data = [
    (emp_id, name, department, salary, datetime(*joined))
    for emp_id, (name, department, salary, joined) in enumerate(
        [
            ("John Doe", "Engineering", 75000.0, (2020, 6, 1)),
            ("Jane Smith", "HR", 55000.0, (2018, 9, 15)),
            ("Alice Johnson", "Engineering", 80000.0, (2021, 1, 12)),
            ("Bob Brown", "Marketing", 62000.0, (2019, 3, 20)),
            ("Charlie Black", "Engineering", 72000.0, (2022, 5, 5)),
        ],
        start=1,
    )
]


In [7]:
# Build the employee DataFrame from the in-memory rows, enforcing the explicit schema.
df = spark.createDataFrame(data=employee_data, schema=employee_schema)

# show() is an action: it triggers evaluation and prints the rows.
df.show()


+------+-------------+-----------+-------+----------+
|emp_id|         name| department| salary| join_date|
+------+-------------+-----------+-------+----------+
|     1|     John Doe|Engineering|75000.0|2020-06-01|
|     2|   Jane Smith|         HR|55000.0|2018-09-15|
|     3|Alice Johnson|Engineering|80000.0|2021-01-12|
|     4|    Bob Brown|  Marketing|62000.0|2019-03-20|
|     5|Charlie Black|Engineering|72000.0|2022-05-05|
+------+-------------+-----------+-------+----------+



In [8]:
from pyspark.sql.functions import col,cast

In [9]:
# Project the identifying columns and convert salary from float to double;
# join_date is intentionally left out of the selection.
salary_as_double = col("salary").cast("double")
dff = df.select("emp_id", "name", "department", salary_as_double)
dff.show()


+------+-------------+-----------+-------+
|emp_id|         name| department| salary|
+------+-------------+-----------+-------+
|     1|     John Doe|Engineering|75000.0|
|     2|   Jane Smith|         HR|55000.0|
|     3|Alice Johnson|Engineering|80000.0|
|     4|    Bob Brown|  Marketing|62000.0|
|     5|Charlie Black|Engineering|72000.0|
+------+-------------+-----------+-------+



In [10]:
# Inspect the column types of dff to confirm that salary is now a double.
dff.printSchema()


root
 |-- emp_id: integer (nullable = false)
 |-- name: string (nullable = false)
 |-- department: string (nullable = true)
 |-- salary: double (nullable = true)



In [11]:
# expr() parses a SQL expression string, so the computation and its alias
# ("tax") are declared together.
tax_expr = expr("salary*0.2 as tax")
de = df.select("salary", tax_expr)

In [12]:
# Display salary next to the derived tax column.
de.show()


+-------+-------+
| salary|    tax|
+-------+-------+
|75000.0|15000.0|
|55000.0|11000.0|
|80000.0|16000.0|
|62000.0|12400.0|
|72000.0|14400.0|
+-------+-------+



In [13]:
# Same tax calculation through the Column API; withColumn keeps every
# existing column and appends "tax" at the end.
tax_amount = col("salary") * 0.2
dfe = df.withColumn("tax", tax_amount)
dfe.show()

+------+-------------+-----------+-------+----------+-------+
|emp_id|         name| department| salary| join_date|    tax|
+------+-------------+-----------+-------+----------+-------+
|     1|     John Doe|Engineering|75000.0|2020-06-01|15000.0|
|     2|   Jane Smith|         HR|55000.0|2018-09-15|11000.0|
|     3|Alice Johnson|Engineering|80000.0|2021-01-12|16000.0|
|     4|    Bob Brown|  Marketing|62000.0|2019-03-20|12400.0|
|     5|Charlie Black|Engineering|72000.0|2022-05-05|14400.0|
+------+-------------+-----------+-------+----------+-------+



In [14]:
from pyspark.sql.functions import lit

In [15]:
# lit() wraps a Python constant as a Column, so every row receives the same value.
emp_new_cols = (
    df
    .withColumn("colone", lit(1))
    .withColumn("coltwo", lit("two"))
)
emp_new_cols.show()



+------+-------------+-----------+-------+----------+------+------+
|emp_id|         name| department| salary| join_date|colone|coltwo|
+------+-------------+-----------+-------+----------+------+------+
|     1|     John Doe|Engineering|75000.0|2020-06-01|     1|   two|
|     2|   Jane Smith|         HR|55000.0|2018-09-15|     1|   two|
|     3|Alice Johnson|Engineering|80000.0|2021-01-12|     1|   two|
|     4|    Bob Brown|  Marketing|62000.0|2019-03-20|     1|   two|
|     5|Charlie Black|Engineering|72000.0|2022-05-05|     1|   two|
+------+-------------+-----------+-------+----------+------+------+



In [None]:
# Rename emp_id to employee_id. The result is a new DataFrame; df keeps its
# original column name.
emp_1 = df.withColumnRenamed(existing="emp_id", new="employee_id")
emp_1.show()

In [17]:
# In Spark, a column name can contain spaces,
# but this is not recommended in a production environment.
#emp=df.drop("coltwo","colone")

In [None]:
# Keep only the employees whose salary exceeds 70000.
# filter() and where() are the same operation; the condition is a SQL string.
sd = df.filter("salary>70000")
sd.show()

In [None]:
# Filter first, then cap the result at five rows before displaying it.
# limit(5) produces a new DataFrame with at most five rows, whereas show(5)
# would only restrict how many rows get printed.
sds = df.filter("salary>70000")
s = sds.limit(5)
s.show()

In [18]:
# Adding several columns at once: map each new column name to the Column
# expression that produces it.
columns = dict(
    tax=col("salary") * 0.2,
    oneNumber=lit(1),
    columnTwo=lit("two"),
)

In [19]:
# Re-display df: the transformations above each returned a new DataFrame,
# so df still holds only its original five columns.
df.show()

+------+-------------+-----------+-------+----------+
|emp_id|         name| department| salary| join_date|
+------+-------------+-----------+-------+----------+
|     1|     John Doe|Engineering|75000.0|2020-06-01|
|     2|   Jane Smith|         HR|55000.0|2018-09-15|
|     3|Alice Johnson|Engineering|80000.0|2021-01-12|
|     4|    Bob Brown|  Marketing|62000.0|2019-03-20|
|     5|Charlie Black|Engineering|72000.0|2022-05-05|
+------+-------------+-----------+-------+----------+



In [21]:
# Add every entry of the `columns` mapping (name -> Column expression) to df
# in a single withColumns call, then display the widened DataFrame.
fin=df.withColumns(columns)
fin.show()

+------+-------------+-----------+-------+----------+-------+---------+---------+
|emp_id|         name| department| salary| join_date|    tax|oneNumber|columnTwo|
+------+-------------+-----------+-------+----------+-------+---------+---------+
|     1|     John Doe|Engineering|75000.0|2020-06-01|15000.0|        1|      two|
|     2|   Jane Smith|         HR|55000.0|2018-09-15|11000.0|        1|      two|
|     3|Alice Johnson|Engineering|80000.0|2021-01-12|16000.0|        1|      two|
|     4|    Bob Brown|  Marketing|62000.0|2019-03-20|12400.0|        1|      two|
|     5|Charlie Black|Engineering|72000.0|2022-05-05|14400.0|        1|      two|
+------+-------------+-----------+-------+----------+-------+---------+---------+

