In [0]:
# Ways of transformation:
#     RDDS - Quicker
#     Dataframe/Dataset - Easier

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark

In [0]:
%fs ls dbfs:///FileStore/tables/



path,name,size,modificationTime
dbfs:/FileStore/tables/Address-1.xlsx,Address-1.xlsx,151315,1706079867000
dbfs:/FileStore/tables/Address-2.xlsx,Address-2.xlsx,151315,1706158276000
dbfs:/FileStore/tables/Address-3.xlsx,Address-3.xlsx,151315,1706158828000
dbfs:/FileStore/tables/Address-4.xlsx,Address-4.xlsx,151315,1706174679000
dbfs:/FileStore/tables/Address.xlsx,Address.xlsx,151315,1706076884000
dbfs:/FileStore/tables/Detail-1.csv,Detail-1.csv,208476,1706074082000
dbfs:/FileStore/tables/Detail-2.csv,Detail-2.csv,208476,1706174692000
dbfs:/FileStore/tables/Detail.csv,Detail.csv,208476,1706073658000
dbfs:/FileStore/tables/Project_1.xlsx,Project_1.xlsx,422501,1706173236000
dbfs:/FileStore/tables/contactinfo-1.txt,contactinfo-1.txt,49969,1706174684000


In [0]:
df_csv= spark.read.format("csv").load("dbfs:/FileStore/tables/csv/batch.csv")

In [0]:
type(df_csv)

Out[3]: pyspark.sql.dataframe.DataFrame

In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
|_c0|  _c1|       _c2| _c3|   _c4|       _c5|
+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_csv = spark.read.format("csv").option("header", True).load("dbfs:///FileStore/tables/csv/batch.csv")

In [0]:
df_csv.show()


+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_csv.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- department: string (nullable = true)



In [0]:
from pyspark.sql.types  import StructType, StructField, IntegerType,StringType,DateType

In [0]:
schema=StructType([
    StructField("id",IntegerType()),
    StructField("name",StringType()),
    StructField("dob",DateType()),
    StructField("age",IntegerType()),
    StructField("salary",IntegerType()),
    StructField("department",StringType()),
])

In [0]:
df_csv = spark.read.format("csv").schema(schema).option("header", True).load("dbfs:///FileStore/tables/csv/batch.csv")

In [0]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_csv.show()

+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
df_csv = spark.read.format("csv").schema(schema).option("header", True).option("inferSchema",True).load("dbfs:///FileStore/tables/csv/batch.csv")

In [0]:
df_csv.show()


+---+-----+----------+----+------+----------+
| id| name|       dob| age|salary|department|
+---+-----+----------+----+------+----------+
|  1| John|1992-05-12|  30| 70000|        IT|
|  2|Alice|1997-02-28|  25| 60000|        HR|
|  3|  Bob|      null|null| 80000|        IT|
|  4|Emily|1994-11-22|  28| 65000|   Finance|
+---+-----+----------+----+------+----------+



In [0]:
# Extract Json 
df_json= spark.read.format("json").load("dbfs:///FileStore/tables/json")

In [0]:
df_json.show()

+----+----------+----------+---+------+------+
| age|department|       dob| id|  name|salary|
+----+----------+----------+---+------+------+
|  30|        IT|1992-05-12|  1|  John| 70000|
|  25|        HR|1997-02-28|  2| Alice| 60000|
|null|        IT|      null|  3|   Bob| 80000|
|  28|   Finance|1994-11-22|  4| Emily| 65000|
|  41|        HR|1981-12-18|  5| David| 90000|
|  33|   Finance|1989-07-05|  6| Susan| 75000|
|  46|        IT|1976-03-15|  7|  Mike| 95000|
|  30|   Finance|1992-06-30| 10|Sophie| 62000|
|  25|   Finance|1997-02-28|  2| Alice| 90000|
|  28|   Finance|1994-11-22|  4| Emily| 70000|
|  39|        IT|1983-10-14|  9| James| 87000|
|  30|        IT|1992-05-12|  1|  John| 70000|
|  27|        HR|1995-08-20|  8|  Lisa| 58000|
+----+----------+----------+---+------+------+



In [0]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df_json.printSchema()

root
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
# Union Data
df=df_csv.union(df_json)

In [0]:
df.show()

+----+-------+----------+----+------+----------+
|  id|   name|       dob| age|salary|department|
+----+-------+----------+----+------+----------+
|   1|   John|1992-05-12|  30| 70000|        IT|
|   2|  Alice|1997-02-28|  25| 60000|        HR|
|   3|    Bob|      null|null| 80000|        IT|
|   4|  Emily|1994-11-22|  28| 65000|   Finance|
|  30|     IT|1992-05-12|   1|  John|     70000|
|  25|     HR|1997-02-28|   2| Alice|     60000|
|null|     IT|      null|   3|   Bob|     80000|
|  28|Finance|1994-11-22|   4| Emily|     65000|
|  41|     HR|1981-12-18|   5| David|     90000|
|  33|Finance|1989-07-05|   6| Susan|     75000|
|  46|     IT|1976-03-15|   7|  Mike|     95000|
|  30|Finance|1992-06-30|  10|Sophie|     62000|
|  25|Finance|1997-02-28|   2| Alice|     90000|
|  28|Finance|1994-11-22|   4| Emily|     70000|
|  39|     IT|1983-10-14|   9| James|     87000|
|  30|     IT|1992-05-12|   1|  John|     70000|
|  27|     HR|1995-08-20|   8|  Lisa|     58000|
+----+-------+------

In [0]:
print(df_csv.columns,df_json.columns)

['id', 'name', 'dob', 'age', 'salary', 'department'] ['age', 'department', 'dob', 'id', 'name', 'salary']


In [0]:
# Colums of both dataset are not in sme order
df_json = df_json.select(df_csv.columns)

In [0]:
df_json.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
print(df_csv.columns,df_json.columns)

['id', 'name', 'dob', 'age', 'salary', 'department'] ['id', 'name', 'dob', 'age', 'salary', 'department']


In [0]:
df= df_csv.union(df_json)

In [0]:
print(df)

DataFrame[id: bigint, name: string, dob: string, age: bigint, salary: bigint, department: string]


In [0]:
df.show()

+---+------+----------+----+------+----------+
| id|  name|       dob| age|salary|department|
+---+------+----------+----+------+----------+
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  2| Alice|1997-02-28|  25| 60000|        HR|
|  3|   Bob|      null|null| 80000|        IT|
|  4| Emily|1994-11-22|  28| 65000|   Finance|
|  5| David|1981-12-18|  41| 90000|        HR|
|  6| Susan|1989-07-05|  33| 75000|   Finance|
|  7|  Mike|1976-03-15|  46| 95000|        IT|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|
|  2| Alice|1997-02-28|  25| 90000|   Finance|
|  4| Emily|1994-11-22|  28| 70000|   Finance|
|  9| James|1983-10-14|  39| 87000|        IT|
|  1|  John|1992-05-12|  30| 70000|        IT|
|  8|  Lisa|1995-08-20|  27| 58000|        HR|
+---+------+----------+----+------+----------+



In [0]:
df.json = spark.read.format("json").schema(schema).load("dbfs:/FileStore/tables/json").orderBy("id")

In [0]:
df_json.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: long (nullable = true)
 |-- department: string (nullable = true)



In [0]:
# Union and UnionALL
# Union doesnot duplicate
# But in pyspark union() and unionAll() same.. use union()-- keeps duplicate

In [0]:
df = df.dropDuplicates()
# Removes the duplicate rows

In [0]:
print(df.count())

12


In [0]:
# Select Function

from pyspark.sql import functions as F #F.col(),F.max(),F.min()
from pyspark.sql.functions import *
from pyspark.sql.functions import col,max,min


In [0]:
df.select(
    "salary"

).show()

+------+
|salary|
+------+
| 70000|
| 65000|
| 60000|
| 80000|
| 75000|
| 95000|
| 90000|
| 62000|
| 90000|
| 70000|
| 87000|
| 58000|
+------+



In [0]:
df.select(
    ["salary"]

).show()

+------+
|salary|
+------+
| 70000|
| 65000|
| 60000|
| 80000|
| 75000|
| 95000|
| 90000|
| 62000|
| 90000|
| 70000|
| 87000|
| 58000|
+------+



In [0]:
df.select(
    df.salary,
    df.age,

).show()
df.select(
    F.col("salary"),
    F.col("age")
).show()

+------+----+
|salary| age|
+------+----+
| 70000|  30|
| 65000|  28|
| 60000|  25|
| 80000|null|
| 75000|  33|
| 95000|  46|
| 90000|  41|
| 62000|  30|
| 90000|  25|
| 70000|  28|
| 87000|  39|
| 58000|  27|
+------+----+

+------+----+
|salary| age|
+------+----+
| 70000|  30|
| 65000|  28|
| 60000|  25|
| 80000|null|
| 75000|  33|
| 95000|  46|
| 90000|  41|
| 62000|  30|
| 90000|  25|
| 70000|  28|
| 87000|  39|
| 58000|  27|
+------+----+



In [0]:
df.select(
    df.salary+ 0.5 * df.salary,
    F.year(F.current_timestamp()) - F.year("dob"),
    F.year(F.current_timestamp()) - F.year(F.col("dob")),
).show()


+-------------------------+---------------------------------------+---------------------------------------+
|(salary + (salary * 0.5))|(year(current_timestamp()) - year(dob))|(year(current_timestamp()) - year(dob))|
+-------------------------+---------------------------------------+---------------------------------------+
|                 105000.0|                                     32|                                     32|
|                  97500.0|                                     30|                                     30|
|                  90000.0|                                     27|                                     27|
|                 120000.0|                                   null|                                   null|
|                 112500.0|                                     35|                                     35|
|                 142500.0|                                     48|                                     48|
|                 135000.0| 

In [0]:
df.select(
    df.salary+ 0.5 * df.salary,
    (df.salary+ 0.5 * df.salary),
  F.expr("salary + .05 * salary").alias("salary_raise"),
  (F.year(F.current_timestamp())- F.year (F.col ("dob"))).alias ("age")
).show()


+-------------------------+-------------------------+------------+----+
|(salary + (salary * 0.5))|(salary + (salary * 0.5))|salary_raise| age|
+-------------------------+-------------------------+------------+----+
|                 105000.0|                 105000.0|    73500.00|  32|
|                  97500.0|                  97500.0|    68250.00|  30|
|                  90000.0|                  90000.0|    63000.00|  27|
|                 120000.0|                 120000.0|    84000.00|null|
|                 112500.0|                 112500.0|    78750.00|  35|
|                 142500.0|                 142500.0|    99750.00|  48|
|                 135000.0|                 135000.0|    94500.00|  43|
|                  93000.0|                  93000.0|    65100.00|  32|
|                 135000.0|                 135000.0|    94500.00|  27|
|                 105000.0|                 105000.0|    73500.00|  30|
|                 130500.0|                 130500.0|    91350.0

In [0]:
df = df.withColumn(
    "salary_raise",
    (F.col("salary") + 0.5 * F.col("salary"))
).withColumn(
    "salary-raise_10_perc",
    (F.col("salary") + 1 * F.col("salary"))
)
df.show()

+---+------+----------+----+------+----------+------------+--------------------+
| id|  name|       dob| age|salary|department|salary_raise|salary-raise_10_perc|
+---+------+----------+----+------+----------+------------+--------------------+
|  1|  John|1992-05-12|  30| 70000|        IT|    105000.0|              140000|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     97500.0|              130000|
|  2| Alice|1997-02-28|  25| 60000|        HR|     90000.0|              120000|
|  3|   Bob|      null|null| 80000|        IT|    120000.0|              160000|
|  6| Susan|1989-07-05|  33| 75000|   Finance|    112500.0|              150000|
|  7|  Mike|1976-03-15|  46| 95000|        IT|    142500.0|              190000|
|  5| David|1981-12-18|  41| 90000|        HR|    135000.0|              180000|
| 10|Sophie|1992-06-30|  30| 62000|   Finance|     93000.0|              124000|
|  2| Alice|1997-02-28|  25| 90000|   Finance|    135000.0|              180000|
|  4| Emily|1994-11-22|  28|

In [0]:
df = df.withColumn(
    "salary_raise",
    (F.col("salary") +0.5 *F.col("salary"))
).withColumn(
    "salary_raise_10_pers",
    (F.col("salary") +.1 *F.col("salary"))
).withColumn(
    "salary_raise_7_pers",
    (F.col("salary") +.7 *F.col("salary"))
)
df.show()

+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+
| id|  name|       dob| age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|
+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+
|  1|  John|1992-05-12|  30| 70000|        IT|    105000.0|              140000|             77000.0|           119000.0|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     97500.0|              130000|             71500.0|           110500.0|
|  2| Alice|1997-02-28|  25| 60000|        HR|     90000.0|              120000|             66000.0|           102000.0|
|  3|   Bob|      null|null| 80000|        IT|    120000.0|              160000|             88000.0|           136000.0|
|  6| Susan|1989-07-05|  33| 75000|   Finance|    112500.0|              150000|             82500.0|           127500.0|
|  7|  Mike|1976-03-15| 

In [0]:
df = df.withColumn(
    "age_group",
    F.when(
        F.col("age") <=20,
        "Upto 20"
    ).when(
        (
            (F.col("age") >20) &
            (F.col("age") <=30)
        ),
        "21 to 30"
    ).when (
        (
            (F.col("age") > 30) &
            (F.col("age") <= 40)
        ),
        "31 to 40"
    ).otherwise(
        "More than 40"
    )
)

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+
| id|  name|       dob| age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|
+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+
|  1|  John|1992-05-12|  30| 70000|        IT|    105000.0|              140000|             77000.0|           119000.0|    21 to 30|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     97500.0|              130000|             71500.0|           110500.0|    21 to 30|
|  2| Alice|1997-02-28|  25| 60000|        HR|     90000.0|              120000|             66000.0|           102000.0|    21 to 30|
|  3|   Bob|      null|null| 80000|        IT|    120000.0|              160000|             88000.0|           136000.0|More than 40|
|  6| Susan|1989-07-05|  33| 75000|   Finance|    11250

In [0]:
df =df.withColumns(
    {
        "company": F.lit("Abascus Insights")
    }
)

In [0]:
df.select(
    # "*",
    F.col("*"),
    df.salary +.05 *df.salary,
    (df.salary +.05 *df.salary).alias("salary_raise"),
    F.expr("salary +.05 * salary").alias("salary"),
    (
        F.year(F.current_timestamp())- F.year(F.col("dob"))
    ).alias("age"),
).show()

+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+--------------------------+------------+--------+----+
| id|  name|       dob| age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|         company|(salary + (salary * 0.05))|salary_raise|  salary| age|
+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+--------------------------+------------+--------+----+
|  1|  John|1992-05-12|  30| 70000|        IT|    105000.0|              140000|             77000.0|           119000.0|    21 to 30|Abascus Insights|                   73500.0|     73500.0|73500.00|  32|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     97500.0|              130000|             71500.0|           110500.0|    21 to 30|Abascus Insights|                   68250.

In [0]:
df.select(
    df.salary +.05 *df.salary,
    (df.salary +.05 *df.salary).alias("salary_raise"),
    F.expr("salary +.05 * salary").alias("salary"),
    (
        F.year(F.current_timestamp())- F.year(F.col("dob"))
    ).alias("age"),
).show()

+--------------------------+------------+--------+----+
|(salary + (salary * 0.05))|salary_raise|  salary| age|
+--------------------------+------------+--------+----+
|                   73500.0|     73500.0|73500.00|  32|
|                   68250.0|     68250.0|68250.00|  30|
|                   63000.0|     63000.0|63000.00|  27|
|                   84000.0|     84000.0|84000.00|null|
|                   78750.0|     78750.0|78750.00|  35|
|                   99750.0|     99750.0|99750.00|  48|
|                   94500.0|     94500.0|94500.00|  43|
|                   65100.0|     65100.0|65100.00|  32|
|                   94500.0|     94500.0|94500.00|  27|
|                   73500.0|     73500.0|73500.00|  30|
|                   91350.0|     91350.0|91350.00|  41|
|                   60900.0|     60900.0|60900.00|  29|
+--------------------------+------------+--------+----+



In [0]:
df = df.withColumn(
    "salary_raise",
    (F.col("salary") +0.5 *F.col("salary"))
).withColumn(
    "salary_raise_10_pers",
    (F.col("salary") +.1 *F.col("salary"))
).withColumn(
    "salary_raise_7_pers",
    (F.col("salary") +.7 *F.col("salary"))
)
df.show()

+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
| id|  name|       dob| age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|         company|
+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
|  1|  John|1992-05-12|  30| 70000|        IT|    105000.0|              140000|             77000.0|           119000.0|    21 to 30|Abascus Insights|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     97500.0|              130000|             71500.0|           110500.0|    21 to 30|Abascus Insights|
|  2| Alice|1997-02-28|  25| 60000|        HR|     90000.0|              120000|             66000.0|           102000.0|    21 to 30|Abascus Insights|
|  3|   Bob|      null|null| 80000|        IT|    120000.0|              160000|        

In [0]:
df = df.withColumn(
    "salary_raise",
    (F.col("salary") +0.5 *F.col("salary"))
).withColumn(
    "salary_raise_10_pers",
    (F.col("salary") +.1 *F.col("salary"))
).withColumn(
    "salary_raise_7_pers",
    (F.col("salary") +.7 *F.col("salary"))
)
df.show()

+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
| id|  name|       dob| age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|         company|
+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
|  1|  John|1992-05-12|  30| 70000|        IT|    105000.0|              140000|             77000.0|           119000.0|    21 to 30|Abascus Insights|
|  4| Emily|1994-11-22|  28| 65000|   Finance|     97500.0|              130000|             71500.0|           110500.0|    21 to 30|Abascus Insights|
|  2| Alice|1997-02-28|  25| 60000|        HR|     90000.0|              120000|             66000.0|           102000.0|    21 to 30|Abascus Insights|
|  3|   Bob|      null|null| 80000|        IT|    120000.0|              160000|        

In [0]:

df = df.withColumns(
    {
        "salary_raise": F.col("salary") + .05*F.col("salary"),
        "age":   F.year(F.current_timestamp())- F.year(F.col("dob"))
    }
)

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
| id|  name|       dob| age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|         company|
+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
|  1|  John|1992-05-12|  32| 70000|        IT|     73500.0|              140000|             77000.0|           119000.0|    21 to 30|Abascus Insights|
|  4| Emily|1994-11-22|  30| 65000|   Finance|     68250.0|              130000|             71500.0|           110500.0|    21 to 30|Abascus Insights|
|  2| Alice|1997-02-28|  27| 60000|        HR|     63000.0|              120000|             66000.0|           102000.0|    21 to 30|Abascus Insights|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|              160000|        

In [0]:
df.filter(
    F.col("salary_raise")>=75000
).show()

+---+-----+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
| id| name|       dob| age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|         company|
+---+-----+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
|  3|  Bob|      null|null| 80000|        IT|     84000.0|              160000|             88000.0|           136000.0|More than 40|Abascus Insights|
|  6|Susan|1989-07-05|  35| 75000|   Finance|     78750.0|              150000|             82500.0|           127500.0|    31 to 40|Abascus Insights|
|  7| Mike|1976-03-15|  48| 95000|        IT|     99750.0|              190000|            104500.0|           161500.0|More than 40|Abascus Insights|
|  5|David|1981-12-18|  43| 90000|        HR|     94500.0|              180000|             99

In [0]:

df.where(
    F.col("salary_raise")>=75000
).show()

+---+-----+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
| id| name|       dob| age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|         company|
+---+-----+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
|  3|  Bob|      null|null| 80000|        IT|     84000.0|              160000|             88000.0|           136000.0|More than 40|Abascus Insights|
|  6|Susan|1989-07-05|  35| 75000|   Finance|     78750.0|              150000|             82500.0|           127500.0|    31 to 40|Abascus Insights|
|  7| Mike|1976-03-15|  48| 95000|        IT|     99750.0|              190000|            104500.0|           161500.0|More than 40|Abascus Insights|
|  5|David|1981-12-18|  43| 90000|        HR|     94500.0|              180000|             99

In [0]:
df = df.withColumn(
    "age_group",
    F.when(
        F.col("age") <=20,
        "Upto 20"
    ).when(
        (
            (F.col("age") >20) &
            (F.col("age") <=30)
        ),
        "21 to 30"
    ).when (
        (
            (F.col("age") > 30) &
            (F.col("age") <= 40)
        ),
        "31 to 40"
    ).otherwise(
        "More than 40"
    )
)

In [0]:
df = df.withColumn(
    "age_group",
    F.when(
        F.col("age") <=20,
        "Upto 20"
    ).when(
        (
            (F.col("age") >20) &
            (F.col("age") <=30)
        ),
        "21 to 30"
    ).when (
        (
            (F.col("age") > 30) &
            (F.col("age") <= 40)
        ),
        F.col("name") #col value can be assign instead of contant value
    ).otherwise(
        "More than 40"
    )
)

In [0]:
df.show()

+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
| id|  name|       dob| age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|         company|
+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
|  1|  John|1992-05-12|  32| 70000|        IT|     73500.0|              140000|             77000.0|           119000.0|        John|Abascus Insights|
|  4| Emily|1994-11-22|  30| 65000|   Finance|     68250.0|              130000|             71500.0|           110500.0|    21 to 30|Abascus Insights|
|  2| Alice|1997-02-28|  27| 60000|        HR|     63000.0|              120000|             66000.0|           102000.0|    21 to 30|Abascus Insights|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|              160000|        

In [0]:
# lit


In [0]:
df.withColumns(
    {
        "company": F.lit("Abascus Insights")
    }
).show()

+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
| id|  name|       dob| age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|         company|
+---+------+----------+----+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+
|  1|  John|1992-05-12|  32| 70000|        IT|     73500.0|              140000|             77000.0|           119000.0|        John|Abascus Insights|
|  4| Emily|1994-11-22|  30| 65000|   Finance|     68250.0|              130000|             71500.0|           110500.0|    21 to 30|Abascus Insights|
|  2| Alice|1997-02-28|  27| 60000|        HR|     63000.0|              120000|             66000.0|           102000.0|    21 to 30|Abascus Insights|
|  3|   Bob|      null|null| 80000|        IT|     84000.0|              160000|        

In [0]:
df = df.withColumns(
    {
        "age": F.coalesce(
            F.year(F.current_timestamp()) -F.year(F.col("dob")),
            F.lit(-1),
        ),
        "has_dob_1" :~(F.isnull("dob")),
        "has_dob_2" : F.col("dob").isNotNull(),
    }
)

In [0]:
df.show()

+---+------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+---------+---------+
| id|  name|       dob|age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|         company|has_dob_1|has_dob_2|
+---+------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+----------------+---------+---------+
|  1|  John|1992-05-12| 32| 70000|        IT|     73500.0|              140000|             77000.0|           119000.0|        John|Abascus Insights|     true|     true|
|  4| Emily|1994-11-22| 30| 65000|   Finance|     68250.0|              130000|             71500.0|           110500.0|    21 to 30|Abascus Insights|     true|     true|
|  2| Alice|1997-02-28| 27| 60000|        HR|     63000.0|              120000|             66000.0|           102000.0|    21 to 30|Abascus Insi

In [0]:
df = df.drop ("company", "has_dob_2")

In [0]:
df.show()

+---+------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+
| id|  name|       dob|age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|has_dob_1|
+---+------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+
|  1|  John|1992-05-12| 32| 70000|        IT|     73500.0|              140000|             77000.0|           119000.0|        John|     true|
|  4| Emily|1994-11-22| 30| 65000|   Finance|     68250.0|              130000|             71500.0|           110500.0|    21 to 30|     true|
|  2| Alice|1997-02-28| 27| 60000|        HR|     63000.0|              120000|             66000.0|           102000.0|    21 to 30|     true|
|  3|   Bob|      null| -1| 80000|        IT|     84000.0|              160000|             88000.0|           136000.0|More than 40|   

In [0]:
# With Column Renamed

In [0]:
df = df.withColumnRenamed("name","first_name")

In [0]:
df.show()

+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+
| id|first_name|       dob|age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|has_dob_1|
+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|              140000|             77000.0|           119000.0|        John|     true|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|              130000|             71500.0|           110500.0|    21 to 30|     true|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|              120000|             66000.0|           102000.0|    21 to 30|     true|
|  3|       Bob|      null| -1| 80000|        IT|     84000.0|              160000|             88000.0|        

In [0]:
from pyspark.sql import Window

In [0]:
window = Window.partitionBy("age_group")
df.withColumn(
    "min_age",
    F.min("age").over(window)
).show()

+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+-------+
| id|first_name|       dob|age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|has_dob_1|min_age|
+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+-------+
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|              130000|             71500.0|           110500.0|    21 to 30|     true|     27|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|              120000|             66000.0|           102000.0|    21 to 30|     true|     27|
|  2|     Alice|1997-02-28| 27| 90000|   Finance|     94500.0|              180000|             99000.0|           153000.0|    21 to 30|     true|     27|
|  4|     Emily|1994-11-22| 30| 70000|   Finance|     73500.0|  

In [0]:
# partition by - retains the no of column
# group by - can reduce column


# 2 task to be completed use partiionby

In [0]:
# load parquet 

ref_df = spark.read.format("parquet").load("dbfs:///FileStore/tables/parquet")

In [0]:
ref_df.show()

+----------+-------+--------+
|department|manager|    lead|
+----------+-------+--------+
|   Finance|  Megan|   Molly|
|        HR|   Brad|   Brian|
|        IT|  Chris|Chandler|
|  Delivery|   Leon|  Louise|
+----------+-------+--------+



In [0]:
df.join(
    ref_df,
    "department",
    "left"
).show()

+----------+---+----------+----------+---+------+------------+--------------------+--------------------+-------------------+------------+---------+-------+--------+
|department| id|first_name|       dob|age|salary|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|has_dob_1|manager|    lead|
+----------+---+----------+----------+---+------+------------+--------------------+--------------------+-------------------+------------+---------+-------+--------+
|        IT|  1|      John|1992-05-12| 32| 70000|     73500.0|              140000|             77000.0|           119000.0|        John|     true|  Chris|Chandler|
|   Finance|  4|     Emily|1994-11-22| 30| 65000|     68250.0|              130000|             71500.0|           110500.0|    21 to 30|     true|  Megan|   Molly|
|        HR|  2|     Alice|1997-02-28| 27| 60000|     63000.0|              120000|             66000.0|           102000.0|    21 to 30|     true|   Brad|   Brian|
|        I

In [0]:
df = df.alias("main")
ref_df =ref_df.alias("references")

In [0]:
df.join(
    ref_df,
    F.col("main.department") == F.col("references.department"),
    "left",
).show()

+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+----------+-------+--------+
| id|first_name|       dob|age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|has_dob_1|department|manager|    lead|
+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+----------+-------+--------+
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|              140000|             77000.0|           119000.0|        John|     true|        IT|  Chris|Chandler|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|              130000|             71500.0|           110500.0|    21 to 30|     true|   Finance|  Megan|   Molly|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|              120000|             66000.0|           10200

In [0]:
df.join(
    ref_df.select(
        "manager",
        "lead",
        F.col("department").alias("reference_department")
    ).alias("reference"),
    F.col("main.department") == F.col("reference.reference_department"),
    "left",
).show()

+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+-------+--------+--------------------+
| id|first_name|       dob|age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|has_dob_1|manager|    lead|reference_department|
+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+-------+--------+--------------------+
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|              140000|             77000.0|           119000.0|        John|     true|  Chris|Chandler|                  IT|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|              130000|             71500.0|           110500.0|    21 to 30|     true|  Megan|   Molly|             Finance|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|        

In [0]:
df.join(
    ref_df.select(
        "manager",
        "lead",
        F.col("department").alias("reference_department")
    ).alias("reference"),
    F.col("main.department") == F.col("reference.reference_department"),
    "left",
).show()

+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+-------+--------+--------------------+
| id|first_name|       dob|age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|has_dob_1|manager|    lead|reference_department|
+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+-------+--------+--------------------+
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|              140000|             77000.0|           119000.0|        John|     true|  Chris|Chandler|                  IT|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|              130000|             71500.0|           110500.0|    21 to 30|     true|  Megan|   Molly|             Finance|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|        

In [0]:
# Load (Save)Data
df.write.mode("overwrite").format("csv").save("dbfs/FileStore/tables/final")

In [0]:
%fs ls dbfs/FileStore/tables/final

path,name,size,modificationTime
dbfs:/dbfs/FileStore/tables/final/_SUCCESS,_SUCCESS,0,1708500344000
dbfs:/dbfs/FileStore/tables/final/_committed_4252851249095360416,_committed_4252851249095360416,212,1708499696000
dbfs:/dbfs/FileStore/tables/final/_committed_7155410902225821073,_committed_7155410902225821073,201,1708500344000
dbfs:/dbfs/FileStore/tables/final/_committed_7859855311738685609,_committed_7859855311738685609,113,1707804436000
dbfs:/dbfs/FileStore/tables/final/_committed_vacuum3434845972709880296,_committed_vacuum3434845972709880296,96,1708499697000
dbfs:/dbfs/FileStore/tables/final/_started_4252851249095360416,_started_4252851249095360416,0,1708499696000
dbfs:/dbfs/FileStore/tables/final/_started_7155410902225821073,_started_7155410902225821073,0,1708500344000
dbfs:/dbfs/FileStore/tables/final/part-00000-tid-7155410902225821073-2e0f3439-39be-4bcb-82ec-1a14a1057a7b-327-1-c000.csv,part-00000-tid-7155410902225821073-2e0f3439-39be-4bcb-82ec-1a14a1057a7b-327-1-c000.csv,944,1708500344000


In [0]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [0]:

%fs rm -r dbfs:/user/hive/warehouse/final

In [0]:
df.write.mode("overwrite").format("delta").saveAsTable("final")

In [0]:
df.join(
    ref_df.select(
        "manager",
        "lead",
        F.col("main.department").alias("references.department"),
    ).alias("references"),
    F.col("main.department") == F.col("references.department"),
    "left",
).show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-1136089594033333>:2[0m
[1;32m      1[0m df[38;5;241m.[39mjoin(
[0;32m----> 2[0m     [43mref_df[49m[38;5;241;43m.[39;49m[43mselect[49m[43m([49m
[1;32m      3[0m [43m        [49m[38;5;124;43m"[39;49m[38;5;124;43mmanager[39;49m[38;5;124;43m"[39;49m[43m,[49m
[1;32m      4[0m [43m        [49m[38;5;124;43m"[39;49m[38;5;124;43mlead[39;49m[38;5;124;43m"[39;49m[43m,[49m
[1;32m      5[0m [43m        [49m[43mF[49m[38;5;241;43m.[39;49m[43mcol[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mmain.department[39;49m[38;5;124;43m"[39;49m[43m)[49m[38;5;241;43m.[39;49m[43malias[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mreferences.department[39;49m[38;5;124;43m"[39;49m[43m)[49m[43m,[49m
[1;32m      6[0m [43m    [49m[43m)[49m[38;5;2

In [0]:
df.alias("main").join(
    ref_df.alias("reference"),
    F.col("main.department") == F.col("reference.department"),
    "left",
).select( 
        F.col("main.*"),
        F.col("reference.*")
).show()


+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+----------+-------+--------+
| id|first_name|       dob|age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|has_dob_1|department|manager|    lead|
+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+----------+-------+--------+
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|              140000|             77000.0|           119000.0|        John|     true|        IT|  Chris|Chandler|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|              130000|             71500.0|           110500.0|    21 to 30|     true|   Finance|  Megan|   Molly|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|              120000|             66000.0|           10200

In [0]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default|    final|      false|
+--------+---------+-----------+



In [0]:
spark.sql("drop table final")

Out[87]: DataFrame[]

In [0]:
df.write.mode("overwrite").format("delta").saveAsTable("final")

In [0]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default|    final|      false|
+--------+---------+-----------+



In [0]:
%fs ls dbfs:/user/hive/warehouse/final/

path,name,size,modificationTime
dbfs:/user/hive/warehouse/final/_delta_log/,_delta_log/,0,0
dbfs:/user/hive/warehouse/final/part-00000-3616f3e2-f704-4220-a4e5-cc31d4b36d6c-c000.snappy.parquet,part-00000-3616f3e2-f704-4220-a4e5-cc31d4b36d6c-c000.snappy.parquet,4491,1708500552000


In [0]:
%sql
desc table extended default.final

col_name,data_type,comment
id,bigint,
first_name,string,
dob,string,
age,int,
salary,bigint,
department,string,
salary_raise,double,
salary-raise_10_perc,bigint,
salary_raise_10_pers,double,
salary_raise_7_pers,double,


In [0]:
%sql
desc detail default.final

format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics
delta,c55a3ae0-597b-44bf-af3d-53c08dbfef72,spark_catalog.default.final,,dbfs:/user/hive/warehouse/final,2024-02-21T07:29:09.706+0000,2024-02-21T07:29:14.000+0000,List(),1,4491,Map(),1,2,"List(appendOnly, invariants)",Map()


In [0]:

%sql
SELECT * from default.final

id,first_name,dob,age,salary,department,salary_raise,salary-raise_10_perc,salary_raise_10_pers,salary_raise_7_pers,age_group,has_dob_1
1,John,1992-05-12,32,70000,IT,73500.0,140000,77000.0,119000.0,John,True
4,Emily,1994-11-22,30,65000,Finance,68250.0,130000,71500.0,110500.0,21 to 30,True
2,Alice,1997-02-28,27,60000,HR,63000.0,120000,66000.0,102000.0,21 to 30,True
3,Bob,,-1,80000,IT,84000.0,160000,88000.0,136000.0,More than 40,False
6,Susan,1989-07-05,35,75000,Finance,78750.0,150000,82500.0,127500.0,Susan,True
7,Mike,1976-03-15,48,95000,IT,99750.0,190000,104500.0,161500.0,More than 40,True
5,David,1981-12-18,43,90000,HR,94500.0,180000,99000.0,153000.0,More than 40,True
10,Sophie,1992-06-30,32,62000,Finance,65100.0,124000,68200.0,105400.0,Sophie,True
2,Alice,1997-02-28,27,90000,Finance,94500.0,180000,99000.0,153000.0,21 to 30,True
4,Emily,1994-11-22,30,70000,Finance,73500.0,140000,77000.0,119000.0,21 to 30,True


In [0]:
%sql
select * except(id, dob, age) from default.final;

first_name,salary,department,salary_raise,salary-raise_10_perc,salary_raise_10_pers,salary_raise_7_pers,age_group,has_dob_1
John,70000,IT,73500.0,140000,77000.0,119000.0,John,True
Emily,65000,Finance,68250.0,130000,71500.0,110500.0,21 to 30,True
Alice,60000,HR,63000.0,120000,66000.0,102000.0,21 to 30,True
Bob,80000,IT,84000.0,160000,88000.0,136000.0,More than 40,False
Susan,75000,Finance,78750.0,150000,82500.0,127500.0,Susan,True
Mike,95000,IT,99750.0,190000,104500.0,161500.0,More than 40,True
David,90000,HR,94500.0,180000,99000.0,153000.0,More than 40,True
Sophie,62000,Finance,65100.0,124000,68200.0,105400.0,Sophie,True
Alice,90000,Finance,94500.0,180000,99000.0,153000.0,21 to 30,True
Emily,70000,Finance,73500.0,140000,77000.0,119000.0,21 to 30,True


In [0]:
table_df = spark.table("final")

In [0]:
table_df.show()

+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+
| id|first_name|       dob|age|salary|department|salary_raise|salary-raise_10_perc|salary_raise_10_pers|salary_raise_7_pers|   age_group|has_dob_1|
+---+----------+----------+---+------+----------+------------+--------------------+--------------------+-------------------+------------+---------+
|  1|      John|1992-05-12| 32| 70000|        IT|     73500.0|              140000|             77000.0|           119000.0|        John|     true|
|  4|     Emily|1994-11-22| 30| 65000|   Finance|     68250.0|              130000|             71500.0|           110500.0|    21 to 30|     true|
|  2|     Alice|1997-02-28| 27| 60000|        HR|     63000.0|              120000|             66000.0|           102000.0|    21 to 30|     true|
|  3|       Bob|      null| -1| 80000|        IT|     84000.0|              160000|             88000.0|        

In [0]:
display(table_df)

id,first_name,dob,age,salary,department,salary_raise,salary-raise_10_perc,salary_raise_10_pers,salary_raise_7_pers,age_group,has_dob_1
1,John,1992-05-12,32,70000,IT,73500.0,140000,77000.0,119000.0,John,True
4,Emily,1994-11-22,30,65000,Finance,68250.0,130000,71500.0,110500.0,21 to 30,True
2,Alice,1997-02-28,27,60000,HR,63000.0,120000,66000.0,102000.0,21 to 30,True
3,Bob,,-1,80000,IT,84000.0,160000,88000.0,136000.0,More than 40,False
6,Susan,1989-07-05,35,75000,Finance,78750.0,150000,82500.0,127500.0,Susan,True
7,Mike,1976-03-15,48,95000,IT,99750.0,190000,104500.0,161500.0,More than 40,True
5,David,1981-12-18,43,90000,HR,94500.0,180000,99000.0,153000.0,More than 40,True
10,Sophie,1992-06-30,32,62000,Finance,65100.0,124000,68200.0,105400.0,Sophie,True
2,Alice,1997-02-28,27,90000,Finance,94500.0,180000,99000.0,153000.0,21 to 30,True
4,Emily,1994-11-22,30,70000,Finance,73500.0,140000,77000.0,119000.0,21 to 30,True
