Task 1 :create and read custome Schema 

In [0]:
data = ''' ID , NAME , AGE,SALARY
1,John,30,50000
2,Jane,25,60000
3,Mark,35,70000
4,Sara,40,80000
5,Peter,45,90000'''

In [0]:
with open("sample_csv_dataset.csv","w") as f:
    f.write(data)

In [0]:
#creating spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("custom_Schema").getOrCreate()

In [0]:
#defining schema 
from pyspark.sql.types import *
custom_schema = StructType([
    StructField("ID",IntegerType(),True),
    StructField("NAME",StringType(),True),
    StructField("AGE",IntegerType(),True),
    StructField("SALARY",DoubleType(),True)
])

In [0]:
df = spark.read.csv("/Workspace/Users/sbhavani@s3services.in/sample_csv_dataset.csv",header=True,inferSchema=False)
df.show()

+----+------+----+------+
| ID | NAME | AGE|SALARY|
+----+------+----+------+
|   1|  John|  30| 50000|
|   2|  Jane|  25| 60000|
|   3|  Mark|  35| 70000|
|   4|  Sara|  40| 80000|
|   5| Peter|  45| 90000|
+----+------+----+------+



In [0]:
df.printSchema()

root
 |--  ID : string (nullable = true)
 |--  NAME : string (nullable = true)
 |--  AGE: string (nullable = true)
 |-- SALARY: string (nullable = true)



In [0]:
df = spark.read.csv("/Workspace/Users/sbhavani@s3services.in/sample_csv_dataset.csv",header=True,inferSchema=False,schema=custom_schema)
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- NAME: string (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- SALARY: double (nullable = true)



DateTime & Numeric Functions Working 

In [0]:
data = [("This is a very very long string value",)]
df = spark.createDataFrame(data, ["text"])

df.show()


+--------------------+
|                text|
+--------------------+
|This is a very ve...|
+--------------------+



In [0]:
df.show(truncate=False)


+-------------------------------------+
|text                                 |
+-------------------------------------+
|This is a very very long string value|
+-------------------------------------+



In [0]:
from pyspark.sql.functions import *
data = [
    (1, "2025-12-01", "2025-12-01 10:30:00", -12.567, 16),
    (2, "2025-11-15", "2025-11-20 18:45:30", 25.432, 25),
    (3, "2025-10-10", "2025-10-12 08:15:45", -7.999, 9)
]

df = spark.createDataFrame(
    data,
    ["id", "date_str", "timestamp_str", "num_decimal", "num_int"]
)

df.show(truncate=False)


+---+----------+-------------------+-----------+-------+
|id |date_str  |timestamp_str      |num_decimal|num_int|
+---+----------+-------------------+-----------+-------+
|1  |2025-12-01|2025-12-01 10:30:00|-12.567    |16     |
|2  |2025-11-15|2025-11-20 18:45:30|25.432     |25     |
|3  |2025-10-10|2025-10-12 08:15:45|-7.999     |9      |
+---+----------+-------------------+-----------+-------+



In [0]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- date_str: string (nullable = true)
 |-- timestamp_str: string (nullable = true)
 |-- num_decimal: double (nullable = true)
 |-- num_int: long (nullable = true)



In [0]:
df_dt = df.select(
    "id",

    # Convert string to Date & Timestamp
    to_date("date_str", "yyyy-MM-dd").alias("date"),
    to_timestamp("timestamp_str", "yyyy-MM-dd HH:mm:ss").alias("timestamp")
)
df_dt.show(truncate=False)
df_dt.printSchema()

+---+----------+-------------------+
|id |date      |timestamp          |
+---+----------+-------------------+
|1  |2025-12-01|2025-12-01 10:30:00|
|2  |2025-11-15|2025-11-20 18:45:30|
|3  |2025-10-10|2025-10-12 08:15:45|
+---+----------+-------------------+

root
 |-- id: long (nullable = true)
 |-- date: date (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [0]:
df_dt = df.select(
    "id",
    # Current date(today's date ) & timestamp(system date and time )
    current_date().alias("current_date"),
    current_timestamp().alias("current_timestamp")
)
df_dt.show(truncate=False)

+---+------------+--------------------------+
|id |current_date|current_timestamp         |
+---+------------+--------------------------+
|1  |2025-12-15  |2025-12-15 05:10:38.514001|
|2  |2025-12-15  |2025-12-15 05:10:38.514001|
|3  |2025-12-15  |2025-12-15 05:10:38.514001|
+---+------------+--------------------------+



In [0]:
df_dt = df.select(
    "id","timestamp_str",
    
    # Date formatting
    date_format("timestamp_str", "dd-MM-yyyy HH:mm").alias("formatted_ts")
)
df_dt.show(truncate=False)

+---+-------------------+----------------+
|id |timestamp_str      |formatted_ts    |
+---+-------------------+----------------+
|1  |2025-12-01 10:30:00|01-12-2025 10:30|
|2  |2025-11-20 18:45:30|20-11-2025 18:45|
|3  |2025-10-12 08:15:45|12-10-2025 08:15|
+---+-------------------+----------------+



In [0]:
df_dt = df.select(
    "id","date_str",
    # Date difference -Returns the number of days between two dates. datediff(end_date, start_date)

    datediff(current_date(), to_date("date_str")).alias("days_diff")
)
df_dt.show(truncate=False)

+---+----------+---------+
|id |date_str  |days_diff|
+---+----------+---------+
|1  |2025-12-01|14       |
|2  |2025-11-15|30       |
|3  |2025-10-10|66       |
+---+----------+---------+



In [0]:
df_dt = df.select(
    "id","date_str",
    # Month difference Returns the number of months between two dates (can be decimal). months_between(date1, date2)

    months_between(current_date(), to_date("date_str")).alias("months_diff"))
df_dt.show(truncate=False)

+---+----------+-----------+
|id |date_str  |months_diff|
+---+----------+-----------+
|1  |2025-12-01|0.4516129  |
|2  |2025-11-15|1.0        |
|3  |2025-10-10|2.16129032 |
+---+----------+-----------+



In [0]:
''' add_months(date_col, number_of_months) : Adds (or subtracts) months from a date.
        date_add(date_col, number_of_days) : Adds days to a date.
        date_sub(date_col, days):sub days to a date
    '''
df_dt = df.select(
    "id","date_str",
    # Date calculations 

    add_months(to_date("date_str"), 2).alias("add_2_months"),
    date_add(to_date("date_str"), 10).alias("add_10_days"),
    date_sub(to_date("date_str"), 5).alias("sub_5_days")
)
df_dt.show(truncate=False)

+---+----------+------------+-----------+----------+
|id |date_str  |add_2_months|add_10_days|sub_5_days|
+---+----------+------------+-----------+----------+
|1  |2025-12-01|2026-02-01  |2025-12-11 |2025-11-26|
|2  |2025-11-15|2026-01-15  |2025-11-25 |2025-11-10|
|3  |2025-10-10|2025-12-10  |2025-10-20 |2025-10-05|
+---+----------+------------+-----------+----------+



In [0]:
df_dt = df.select(
    "id",
    "date_str",
    # Extract parts
    year("date_str").alias("year"),
    month("date_str").alias("month"),
    dayofmonth("date_str").alias("day"),
    weekofyear("date_str").alias("week_no")
)

df_dt.show(truncate=False)


+---+----------+----+-----+---+-------+
|id |date_str  |year|month|day|week_no|
+---+----------+----+-----+---+-------+
|1  |2025-12-01|2025|12   |1  |49     |
|2  |2025-11-15|2025|11   |15 |46     |
|3  |2025-10-10|2025|10   |10 |41     |
+---+----------+----+-----+---+-------+



In [0]:
df_num = df.select(
    "id",
    "num_decimal",
    "num_int",

    # Absolute value
    abs("num_decimal").alias("abs_value"))
df_num.show(truncate=False)

+---+-----------+-------+---------+
|id |num_decimal|num_int|abs_value|
+---+-----------+-------+---------+
|1  |-12.567    |16     |12.567   |
|2  |25.432     |25     |25.432   |
|3  |-7.999     |9      |7.999    |
+---+-----------+-------+---------+



In [0]:
df_num = df.select(
    "id",
    "num_decimal",
    "num_int",

    # Rounding
    round("num_decimal", 2).alias("rounded"),
    floor("num_decimal").alias("floor_val"),
    ceil("num_decimal").alias("ceil_val")
)
df_num.show(truncate=False)

+---+-----------+-------+-------+---------+--------+
|id |num_decimal|num_int|rounded|floor_val|ceil_val|
+---+-----------+-------+-------+---------+--------+
|1  |-12.567    |16     |-12.57 |-13      |-12     |
|2  |25.432     |25     |25.43  |25       |26      |
|3  |-7.999     |9      |-8.0   |-8       |-7      |
+---+-----------+-------+-------+---------+--------+



In [0]:
df_num = df.select(
    "id",
    "num_decimal",
    "num_int",

    # Power & square root
    pow("num_int", 2).alias("power"),
    pow("num_int", 3).alias("cube"),
    sqrt("num_int").alias("sqrt"))
df_num.show(truncate=False)


+---+-----------+-------+-----+-------+----+
|id |num_decimal|num_int|power|cube   |sqrt|
+---+-----------+-------+-----+-------+----+
|1  |-12.567    |16     |256.0|4096.0 |4.0 |
|2  |25.432     |25     |625.0|15625.0|5.0 |
|3  |-7.999     |9      |81.0 |729.0  |3.0 |
+---+-----------+-------+-----+-------+----+



In [0]:
df_num = df.select(
    "id",
    "num_decimal",
    "num_int",

    # Greatest & Least
    greatest("num_decimal", "num_int").alias("greatest_val"),
    least("num_decimal", "num_int").alias("least_val")
)

df_num.show(truncate=False)


+---+-----------+-------+------------+---------+
|id |num_decimal|num_int|greatest_val|least_val|
+---+-----------+-------+------------+---------+
|1  |-12.567    |16     |16.0        |-12.567  |
|2  |25.432     |25     |25.432      |25.0     |
|3  |-7.999     |9      |9.0         |-7.999   |
+---+-----------+-------+------------+---------+



Array Functions 

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("ArrayFunctionsDemo").getOrCreate()

data = [
    (1, ["spark", "hadoop", "spark", "sql"], [80, 90, 85],[2,3,4]),
    (2, ["python", "pyspark", "sql"], [75, 88, 92],[1,2]),
    (3, ["java", "spark"], [70, 65],[4])
]

df = spark.createDataFrame(
    data,
    ["id", "skills", "scores","ranking"]
)

df.show(truncate=False)



+---+---------------------------+------------+---------+
|id |skills                     |scores      |ranking  |
+---+---------------------------+------------+---------+
|1  |[spark, hadoop, spark, sql]|[80, 90, 85]|[2, 3, 4]|
|2  |[python, pyspark, sql]     |[75, 88, 92]|[1, 2]   |
|3  |[java, spark]              |[70, 65]    |[4]      |
+---+---------------------------+------------+---------+



In [0]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- scores: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- ranking: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [0]:
#Creates an array from multiple columns.
df_array = df.select(
    "id",
    array("skills", "scores").alias("combined_array")
)

df_array.show(truncate=False)


[0;31m---------------------------------------------------------------------------[0m
[0;31mNumberFormatException[0m                     Traceback (most recent call last)
File [0;32m<command-8458228082054899>, line 7[0m
[1;32m      1[0m [38;5;66;03m#Creates an array from multiple columns.[39;00m
[1;32m      2[0m df_array [38;5;241m=[39m df[38;5;241m.[39mselect(
[1;32m      3[0m     [38;5;124m"[39m[38;5;124mid[39m[38;5;124m"[39m,
[1;32m      4[0m     array([38;5;124m"[39m[38;5;124mskills[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mscores[39m[38;5;124m"[39m)[38;5;241m.[39malias([38;5;124m"[39m[38;5;124mcombined_array[39m[38;5;124m"[39m)
[1;32m      5[0m )
[0;32m----> 7[0m df_array[38;5;241m.[39mshow(truncate[38;5;241m=[39m[38;5;28;01mFalse[39;00m)

File [0;32m/databricks/python/lib/python3.12/site-packages/pyspark/sql/connect/dataframe.py:1123[0m, in [0;36mDataFrame.show[0;34m(self, n, truncate, vertical)[0m
[1;32m   1122[0m 

In [0]:
#Creates an array from multiple columns.
df_array = df.select(
    "id",
    array("ranking", "scores").alias("combined_array")
)

df_array.show(truncate=False)

+---+-------------------------+
|id |combined_array           |
+---+-------------------------+
|1  |[[2, 3, 4], [80, 90, 85]]|
|2  |[[1, 2], [75, 88, 92]]   |
|3  |[[4], [70, 65]]          |
+---+-------------------------+



In [0]:
df.select(
    "id",
    arrays_zip("skills", "scores").alias("skills_scores")
).show(truncate=False)


+---+-----------------------------------------------------+
|id |skills_scores                                        |
+---+-----------------------------------------------------+
|1  |[{spark, 80}, {hadoop, 90}, {spark, 85}, {sql, NULL}]|
|2  |[{python, 75}, {pyspark, 88}, {sql, 92}]             |
|3  |[{java, 70}, {spark, 65}]                            |
+---+-----------------------------------------------------+



In [0]:
#Check Element Existence
df.select(
    "skills",
    array_contains("skills", "spark").alias("has_spark")
).show(truncate=False)


+---------------------------+---------+
|skills                     |has_spark|
+---------------------------+---------+
|[spark, hadoop, spark, sql]|true     |
|[python, pyspark, sql]     |false    |
|[java, spark]              |true     |
+---------------------------+---------+



In [0]:
#Flatten Array into Rows
df_explode = df.select(
    "id",
    explode("skills").alias("skill")
)

df_explode.show(truncate=False)


+---+-------+
|id |skill  |
+---+-------+
|1  |spark  |
|1  |hadoop |
|1  |spark  |
|1  |sql    |
|2  |python |
|2  |pyspark|
|2  |sql    |
|3  |java   |
|3  |spark  |
+---+-------+



In [0]:
#Count Elements in Array
df.select(
    "skills",
    size("skills").alias("skill_count")
).show(truncate=False)


+---------------------------+-----------+
|skills                     |skill_count|
+---------------------------+-----------+
|[spark, hadoop, spark, sql]|4          |
|[python, pyspark, sql]     |3          |
|[java, spark]              |2          |
+---------------------------+-----------+



In [0]:
df.select(
    sort_array("skills").alias("sorted_skills"),
    sort_array("scores").alias("sorted_scores")
).show(truncate=False)


+---------------------------+-------------+
|sorted_skills              |sorted_scores|
+---------------------------+-------------+
|[hadoop, spark, spark, sql]|[80, 85, 90] |
|[pyspark, python, sql]     |[75, 88, 92] |
|[java, spark]              |[65, 70]     |
+---------------------------+-------------+



In [0]:
df.select("skills",
    array_distinct("skills").alias("unique_skills")
).show(truncate=False)


+---------------------------+----------------------+
|skills                     |unique_skills         |
+---------------------------+----------------------+
|[spark, hadoop, spark, sql]|[spark, hadoop, sql]  |
|[python, pyspark, sql]     |[python, pyspark, sql]|
|[java, spark]              |[java, spark]         |
+---------------------------+----------------------+

