# Prerequisites

## Load employee.csv into a DataFrame

In [0]:
# Location of the employee CSV file and the reader options used to parse it.
EMPLOYEE_CSV_PATH = "/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/employee.csv"
employee_csv_options = dict(
    header=True,       # first row holds the column names
    inferSchema=True,  # let Spark infer the column types
    sep="|",           # fields are pipe-delimited
    quote="'",         # single quote is the quoting character
)

# Keep at most 10 rows so the rest of the notebook works on a small sample.
df = spark.read.csv(path=EMPLOYEE_CSV_PATH, **employee_csv_options).limit(10)
df.display()

# Array Type

## Create an array type column

In [0]:
from pyspark.sql.functions import split

# Split the comma-separated text columns into arrays; the salary pair is
# additionally cast to an array of integers.
skills_array = split("col_skills", ",")
salary_pair_array = split("col_current_expected_salary", ",").cast("array<int>")

result_df = df.withColumn("skills", skills_array).withColumn(
    "current_expected_salary", salary_pair_array
)
result_df.display()

In [0]:
# The original text columns are no longer needed once the array columns exist.
source_text_columns = ("col_skills", "col_current_expected_salary")
result_df = result_df.drop(*source_text_columns)
result_df.display()

## Accessing an array column

In [0]:
from pyspark.sql.functions import col

# Show the whole array next to its first element, fetched two equivalent ways:
# via getItem(0) and via index syntax [0].
skills_col = col("skills")
result_df.select(skills_col, skills_col.getItem(0), skills_col[0]).display()

## Problem Statement

Derive two new columns, 'current_salary' and 'expected_salary', from the 'current_expected_salary' array column, where:

1. The first element of the array represents current_salary, and
2. The second element of the array represents expected_salary.

In [0]:
from pyspark.sql.functions import col

# Index 0 of the array is exposed as current_salary, index 1 as expected_salary.
salary_pair = col("current_expected_salary")
result_df.select(
    salary_pair,
    salary_pair[0].alias("current_salary"),
    salary_pair[1].alias("expected_salary"),
).display()

## Problem Statement

Fetch employee names whose current_salary is greater than expected_salary

In [0]:
from pyspark.sql.functions import when

# Compare the first array element (current) against the second (expected)
# and keep only the names of the rows where current exceeds expected.
current_salary = col("current_expected_salary")[0]
expected_salary = col("current_expected_salary")[1]

employee_names = result_df.where(current_salary > expected_salary).select("name")

employee_names.display()

# Applying different functions

## Applying different functions on array column

In [0]:
from pyspark.sql.functions import size, array_distinct, array_contains
 
# One column per array function: element count, membership test for
# "PySpark", and the de-duplicated skills array.
skill_overview_columns = [
    col("name"),
    col("skills"),
    size("skills"),
    array_contains("skills", "PySpark"),
    array_distinct("skills").alias("distinct_skills"),
]
result_df.select(*skill_overview_columns).display()

## Problem Statement

Fetch the names of all employees who have the skill 'PySpark'

In [0]:
from pyspark.sql.functions import array_contains

# Keep rows whose skills array contains "PySpark", then show only the names.
has_pyspark_skill = array_contains("skills", "PySpark")
result_df.where(has_pyspark_skill).select("name").display()

## Problem Statement

Add 30% to expected salary if the employee has the skill PySpark

In [0]:
# Import every function this cell uses so it does not depend on imports
# made by earlier, unrelated cells.
from pyspark.sql.functions import array_contains, col, when

# Second element of the salary array is the expected salary.
expected_salary = col("current_expected_salary")[1]

# Preview only: the result is displayed, not assigned back to result_df.
# Rows whose skills contain "PySpark" get the expected salary multiplied by
# 1.3; all other rows keep the expected salary unchanged.
result_df.withColumn(
    "base_salary",
    when(
        array_contains("skills", "PySpark"), expected_salary * 1.3
    ).otherwise(expected_salary),
).display()

In [0]:
from pyspark.sql.functions import col, array_contains, when
 
# Building blocks for the base_salary column; both outcomes are cast to
# decimal(18,2) so the column has a single type.
expected_salary = col("current_expected_salary")[1]
has_pyspark_skill = array_contains(col("skills"), "PySpark")
raised_salary = (expected_salary * 1.3).cast("decimal(18,2)")
unchanged_salary = expected_salary.cast("decimal(18,2)")

# Persist the new column on result_df (unlike the preview in the previous cell).
result_df = result_df.withColumn(
    "base_salary",
    when(has_pyspark_skill, raised_salary).otherwise(unchanged_salary),
)

result_df.display()

## Checking skill frequency across employees

In [0]:
from pyspark.sql.functions import explode
# Produce one row per array element, then count how often each value occurs.
skill_occurrences = result_df.select(explode("skills").alias("words"))
skill_occurrences.groupBy("words").count().display()

# Struct Type

In [0]:
# Location of the product information JSON file.
PRODUCT_JSON_PATH = "/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/product_Information_001.json"

# NOTE: this rebinds `df`, which earlier in the notebook held the employee CSV data.
# multiLine=True is passed so records spanning several lines are parsed.
df = spark.read.json(path=PRODUCT_JSON_PATH, multiLine=True)
df.printSchema()

In [0]:
from pyspark.sql.functions import col
# Show the name next to the nested details.screen.size field.
screen_size_fields = ["name", "details.screen.size"]
df.select(*screen_size_fields).display()

In [0]:
from pyspark.sql.functions import col
# Show the name next to the nested details.graphics.brand field.
graphics_brand_fields = ["name", "details.graphics.brand"]
df.select(*graphics_brand_fields).display()

In [0]:
from pyspark.sql.functions import col
# Selecting details.memory.size and details.screen.size without aliases would
# yield two output columns both named "size"; alias each nested field so every
# column in the displayed result has a distinct, self-describing name.
df.select(
    col("name"),
    col("product_id"),
    col("details.memory.size").alias("memory_size"),
    col("details.storage.capacity").alias("storage_capacity"),
    col("details.screen.size").alias("screen_size"),
).display()