## Pre-requisites

In [0]:
# Load the employee CSV: pipe-separated, single-quote quoted, with a header row and
# inferred column types. The frame is capped at 10 rows to keep the demo output small.
employee_csv_path = "/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/employee.csv"
df = (
    spark.read
    .options(inferSchema=True, header=True, sep="|", quote="'")
    .csv(employee_csv_path)
    .limit(10)
)
df.display()

## Array Type

### Create a new array type column

In [0]:
from pyspark.sql.functions import split

# Split each delimited string column on commas to obtain array columns.
# The salary pieces are additionally cast from strings to integers.
skills_array = split("col_skills", ",")
salary_array = split("col_current_expected_salary", ",").cast("array<int>")

results_df = (
    df
    .withColumn("skills", skills_array)
    .withColumn("salary", salary_array)
)
results_df.display()
results_df.printSchema()

In [0]:
# from pyspark.sql.functions import split
# results_df  = df.withColumn("salary", split("col_current_expected_salary", ",").cast("array<int>"))
# results_df.display()  

In [0]:
# DataFrame.drop returns a new DataFrame and leaves `results_df` untouched, so the
# bare call had no visible effect. Render the trimmed view (without the raw string
# columns) explicitly; `results_df` itself still carries every column.
results_df.drop("col_skills", "col_current_expected_salary").display()

### Accessing Array columns

In [0]:
# Print the schema of results_df to check the types of the derived `skills` and
# `salary` array columns before indexing into them.
results_df.printSchema()

In [0]:
from pyspark.sql.functions import col

# Show the whole `skills` array next to its first element (arrays are 0-indexed).
results_df.select(
    col("skills"),
    col("skills")[0],
).display()

### Problem statement 1

Derive two new columns, 'current salary' and 'expected salary', from the current_expected salary array column, where:
1. The first element of the array represents current_salary, and
2. The second element of the array represents expected_salary.

In [0]:
# Element 0 of `salary` is the current salary and element 1 the expected salary.
results_df.select(
    col("salary"),
    col("salary")[0].alias("Current Salary"),
    col("salary")[1].alias("Expected Salary"),
).display()


### Problem Statement 2

Fetch employee names whose current salary is more than 20000000

In [0]:
# Materialise the two array elements as their own columns so that later cells can
# refer to them by name instead of by array index.
results_df1 = (
    results_df
    .withColumn("Current Salary", col("salary")[0])
    .withColumn("Expected Salary", col("salary")[1])
)
results_df1.display()

In [0]:
# Problem Statement 2 asks for employees whose *current* salary exceeds a fixed
# amount, so compare against that threshold rather than against the expected salary.
CURRENT_SALARY_THRESHOLD = 20000000

results_df1.filter(
    col("Current Salary") > CURRENT_SALARY_THRESHOLD
).select(col("name")).display()

In [0]:
# Print results_df1's schema, which includes the derived "Current Salary" and
# "Expected Salary" columns alongside the array columns.
results_df1.printSchema()

In [0]:
# Keep only the rows where the current salary is strictly greater than the expected one.
results_df1.where(
    col("Current Salary") > col("Expected Salary")
).display()

### Applying different functions on Arrays

In [0]:
from pyspark.sql.functions import size, array_contains, array_distinct

# Per employee: the skills array, its length, whether it contains 'Hadoop',
# and the array with duplicate entries removed.
results_df1.select(
    col('name'),
    col('skills'),
    size('skills'),
    array_contains('skills', 'Hadoop'),
    array_distinct('skills').alias('distinct skills'),
).display()

### Problem Statement 3
Get the list of all employees who have PySpark as one of their skills

In [0]:
# Filter on `skills` first and only then project down to `name`, so the predicate
# references a column that is still present in the frame it is applied to.
results_df1.filter(array_contains('skills', 'PySpark')).select(col('name')).display()

### Problem Statement 4

If the employee has PySpark as one of their skills, increase the salary by 30%

In [0]:
# Rows whose `skills` array contains the exact value 'PySpark'.
results_df1.where(
    array_contains('skills', 'PySpark')
).display()

In [0]:
# Restrict to PySpark employees first, then add the salary raised by 30%.
(
    results_df1
    .where(array_contains('skills', 'PySpark'))
    .withColumn("Increment Salary", col("Current Salary") * 1.3)
    .display()
)

In [0]:
# Same result built in the opposite order: compute the raised salary for every
# row, then keep only the PySpark employees.
(
    results_df1
    .withColumn("Increment Salary", col("Current Salary") * 1.3)
    .where(array_contains('skills', 'PySpark'))
    .display()
)

In [0]:
from pyspark.sql.functions import when

# Keep every employee: PySpark employees get the current salary raised by 30%,
# everyone else keeps the current salary unchanged.
results_df1.withColumn(
    "Increment Salary",
    when(
        array_contains("skills", 'PySpark'),
        col("Current Salary") * 1.3,
    ).otherwise(col("Current Salary")),
).display()

In [0]:
from pyspark.sql.functions import array_contains, col, when

# The original cell referenced `result_df` and a `current_expected_salary` column,
# neither of which exists in this notebook; the frame is `results_df1` and the
# array column produced by the split/cast step is named `salary`.
# NOTE(review): index 1 is the *expected* salary per Problem statement 1 — confirm
# that this, rather than the current salary at index 0, is the intended base.
results_df1.withColumn(
    "base_salary",
    when(
        array_contains("skills", "PySpark"), col("salary")[1] * 1.3
    ).otherwise(col("salary")[1]),
).display()

In [0]:
from pyspark.sql.functions import array_contains, col, when

results_df2 = results_df1

# Shared building blocks: element 1 of `salary` and the PySpark-skill predicate.
second_salary = col("salary")[1]
has_pyspark = array_contains(col("skills"), "PySpark")

# Variant 1: cast to double before applying the 30% raise; display only.
results_df2.withColumn(
    "base_salary",
    when(has_pyspark, second_salary.cast("double") * 1.3).otherwise(second_salary),
).display()

# Variant 2: apply the raise, then cast both branches to decimal(18,2) so the
# column has a fixed two-decimal representation; keep the result as results_df3.
raised_salary = (second_salary * 1.3).cast("decimal(18,2)")
unchanged_salary = second_salary.cast("decimal(18,2)")
results_df3 = results_df2.withColumn(
    "base_salary",
    when(has_pyspark, raised_salary).otherwise(unchanged_salary),
)

results_df3.display()
 
 

In [0]:
from pyspark.sql.functions import explode

# One output row per array element, then count how often each value occurs.
(
    results_df1
    .select(explode("skills").alias("words"))
    .groupBy("words")
    .count()
    .display()
)

## Struct type

In [0]:
# Load the product information JSON file (multi-line JSON) and inspect its nested
# schema. Note that this rebinds `df`, replacing the employee frame loaded earlier.
product_json_path = "/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/product_Information_001.json"
df = spark.read.json(path=product_json_path, multiLine=True)
df.printSchema()

In [0]:
from pyspark.sql.functions import col

# Nested struct fields are addressed with dot notation: parent.child.field.
df.select(
    col("name"),
    col('details.screen.size'),
).display()

In [0]:
# Selecting a nested field yields a column named after the leaf only, so
# `details.memory.size` and `details.screen.size` would both come out as `size`.
# Alias the nested fields so every output column has a unique, descriptive name.
df.select(
    col("name"),
    col('product_id'),
    col('details.memory.size').alias('memory_size'),
    col('details.storage.capacity').alias('storage_capacity'),
    col('details.screen.size').alias('screen_size'),
).display()