In [1]:
import findspark

In [2]:
findspark.init("/opt/manual/spark/")

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [4]:
spark = (SparkSession.builder
        .appName("udf")
        .master("local[2]")
         .getOrCreate()     
        
        ) 

In [7]:
! wget -P ~/datasets https://github.com/erkansirin78/datasets/raw/master/Fire_Incidents.csv.gz

--2023-08-14 15:29:07--  https://github.com/erkansirin78/datasets/raw/master/Fire_Incidents.csv.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/erkansirin78/datasets/master/Fire_Incidents.csv.gz [following]
--2023-08-14 15:29:07--  https://raw.githubusercontent.com/erkansirin78/datasets/master/Fire_Incidents.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41002480 (39M) [application/octet-stream]
Saving to: ‘/home/train/datasets/Fire_Incidents.csv.gz’


2023-08-14 15:29:28 (1.91 MB/s) - ‘/home/train/datasets/Fire_Incidents.csv.gz’ saved [41002480/41002480]



In [8]:
! ls ~/datasets

201508_trip_data.csv  churn-telecom	     Hotel_Reviews.csv.gz  u.data
Advertising.csv       Fire_Incidents.csv.gz  iris.csv		   u.item
Churn_Modelling.csv   flo100k.csv	     retail_db		   Wine.csv


In [9]:
df = spark.read \
.option("header",True) \
.option("inferSchema",True) \
.option("compression","gzip") \
.csv("file:///home/train/datasets/Fire_Incidents.csv.gz")

In [10]:
df.count()

533598

In [11]:
len(df.columns)

80

In [23]:
# date columns
ts_cols = ["Incident Date", "Alarm DtTm", "Arrival DtTm", "Close DtTm"]
df.select(ts_cols).show(n=4, truncate=False)

+-------------+----------------------+----------------------+----------------------+
|Incident Date|Alarm DtTm            |Arrival DtTm          |Close DtTm            |
+-------------+----------------------+----------------------+----------------------+
|06/05/2018   |06/05/2018 06:38:01 PM|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|
|08/29/2019   |08/29/2019 08:09:25 PM|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|
|06/14/2018   |06/14/2018 08:37:56 PM|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|
|12/30/2005   |12/30/2005 10:40:27 PM|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|
+-------------+----------------------+----------------------+----------------------+
only showing top 4 rows



In [24]:
df.select(ts_cols).printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)



In [27]:
df.select("Alarm DtTm").show(4)

+--------------------+
|          Alarm DtTm|
+--------------------+
|06/05/2018 06:38:...|
|08/29/2019 08:09:...|
|06/14/2018 08:37:...|
|12/30/2005 10:40:...|
+--------------------+
only showing top 4 rows



In [28]:
# from string to timestamp

df.select(ts_cols).withColumn("Alarm DtTm",F.to_timestamp(F.col("Alarm DtTm"),"MM/dd/yyyy hh:mm:ss a")) \
.show(n=4, truncate=False)

+-------------+-------------------+----------------------+----------------------+
|Incident Date|Alarm DtTm         |Arrival DtTm          |Close DtTm            |
+-------------+-------------------+----------------------+----------------------+
|06/05/2018   |2018-06-05 18:38:01|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|
|08/29/2019   |2019-08-29 20:09:25|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|
|06/14/2018   |2018-06-14 20:37:56|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|
|12/30/2005   |2005-12-30 22:40:27|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|
+-------------+-------------------+----------------------+----------------------+
only showing top 4 rows



In [29]:
df2 = df.select(ts_cols).withColumn("Alarm DtTm",F.to_timestamp(F.col("Alarm DtTm"),"MM/dd/yyyy hh:mm:ss a")) 

In [30]:
df2.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: timestamp (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)



In [31]:
# unix timestamp

df.select(ts_cols).withColumn("Alarm DtTm_Ut",F.unix_timestamp(F.col("Alarm DtTm"),"MM/dd/yyyy hh:mm:ss a")).show(n=4, truncate=False)

+-------------+----------------------+----------------------+----------------------+-------------+
|Incident Date|Alarm DtTm            |Arrival DtTm          |Close DtTm            |Alarm DtTm_Ut|
+-------------+----------------------+----------------------+----------------------+-------------+
|06/05/2018   |06/05/2018 06:38:01 PM|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|1528213081   |
|08/29/2019   |08/29/2019 08:09:25 PM|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|1567098565   |
|06/14/2018   |06/14/2018 08:37:56 PM|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|1528997876   |
|12/30/2005   |12/30/2005 10:40:27 PM|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|1135975227   |
+-------------+----------------------+----------------------+----------------------+-------------+
only showing top 4 rows



In [32]:
df3 = df.select(ts_cols).withColumn("Alarm DtTm_Ut",F.unix_timestamp(F.col("Alarm DtTm"),"MM/dd/yyyy hh:mm:ss a"))

In [33]:
#from unix timestamp to timestamp

df3.withColumn("From_DtTm_Ut", F.to_timestamp(F.col("Alarm DtTm_Ut"))).show(n=4, truncate=False)

+-------------+----------------------+----------------------+----------------------+-------------+-------------------+
|Incident Date|Alarm DtTm            |Arrival DtTm          |Close DtTm            |Alarm DtTm_Ut|From_DtTm_Ut       |
+-------------+----------------------+----------------------+----------------------+-------------+-------------------+
|06/05/2018   |06/05/2018 06:38:01 PM|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|1528213081   |2018-06-05 18:38:01|
|08/29/2019   |08/29/2019 08:09:25 PM|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|1567098565   |2019-08-29 20:09:25|
|06/14/2018   |06/14/2018 08:37:56 PM|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|1528997876   |2018-06-14 20:37:56|
|12/30/2005   |12/30/2005 10:40:27 PM|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|1135975227   |2005-12-30 22:40:27|
+-------------+----------------------+----------------------+----------------------+-------------+-------------------+
only showing top 4 rows



In [34]:
df4 = df3.withColumn("From_DtTm_Ut", F.to_timestamp(F.col("Alarm DtTm_Ut")))

In [35]:
df4.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)
 |-- Alarm DtTm_Ut: long (nullable = true)
 |-- From_DtTm_Ut: timestamp (nullable = true)



In [37]:
# from string to date and Date Format

df.select(ts_cols).withColumn("Incident_Date_to_Date_Format", F.to_date(F.col("Incident Date"),"MM/dd/yyyy")).show(n=4, truncate=False)

+-------------+----------------------+----------------------+----------------------+----------------------------+
|Incident Date|Alarm DtTm            |Arrival DtTm          |Close DtTm            |Incident_Date_to_Date_Format|
+-------------+----------------------+----------------------+----------------------+----------------------------+
|06/05/2018   |06/05/2018 06:38:01 PM|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|2018-06-05                  |
|08/29/2019   |08/29/2019 08:09:25 PM|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|2019-08-29                  |
|06/14/2018   |06/14/2018 08:37:56 PM|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|2018-06-14                  |
|12/30/2005   |12/30/2005 10:40:27 PM|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|2005-12-30                  |
+-------------+----------------------+----------------------+----------------------+----------------------------+
only showing top 4 rows



In [42]:
df5 = df.select(ts_cols).withColumn("Incident_Date_D", F.to_date(F.col("Incident Date"),"MM/dd/yyyy"))

In [43]:
df5.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)
 |-- Incident_Date_D: date (nullable = true)



In [45]:
df5.withColumn("Incident_Date_D", F.date_format(F.col("Incident_Date_D"),"yyyy:MM:dd HH:ss")).show(n=4, truncate=False)

+-------------+----------------------+----------------------+----------------------+----------------+
|Incident Date|Alarm DtTm            |Arrival DtTm          |Close DtTm            |Incident_Date_D |
+-------------+----------------------+----------------------+----------------------+----------------+
|06/05/2018   |06/05/2018 06:38:01 PM|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|2018:06:05 00:00|
|08/29/2019   |08/29/2019 08:09:25 PM|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|2019:08:29 00:00|
|06/14/2018   |06/14/2018 08:37:56 PM|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|2018:06:14 00:00|
|12/30/2005   |12/30/2005 10:40:27 PM|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|2005:12:30 00:00|
+-------------+----------------------+----------------------+----------------------+----------------+
only showing top 4 rows



In [46]:
df5.withColumn("Incident_Date_D", F.date_format(F.col("Incident_Date_D"),"yyyy-MM-dd HH:ss")).show(n=4, truncate=False)

+-------------+----------------------+----------------------+----------------------+----------------+
|Incident Date|Alarm DtTm            |Arrival DtTm          |Close DtTm            |Incident_Date_D |
+-------------+----------------------+----------------------+----------------------+----------------+
|06/05/2018   |06/05/2018 06:38:01 PM|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|2018-06-05 00:00|
|08/29/2019   |08/29/2019 08:09:25 PM|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|2019-08-29 00:00|
|06/14/2018   |06/14/2018 08:37:56 PM|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|2018-06-14 00:00|
|12/30/2005   |12/30/2005 10:40:27 PM|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|2005-12-30 00:00|
+-------------+----------------------+----------------------+----------------------+----------------+
only showing top 4 rows



In [47]:
# get year from ts

df2 = df.select(ts_cols).withColumn("Alarm DtTm", F.to_timestamp(F.col("Alarm DtTm"),"MM/dd/yyyy hh:mm:ss a"))

In [49]:
df3 = df2.withColumn("Alarm_year", F.year(F.col("Alarm DtTm")))
df3.show(n=4,truncate=False)

+-------------+-------------------+----------------------+----------------------+----------+
|Incident Date|Alarm DtTm         |Arrival DtTm          |Close DtTm            |Alarm_year|
+-------------+-------------------+----------------------+----------------------+----------+
|06/05/2018   |2018-06-05 18:38:01|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|2018      |
|08/29/2019   |2019-08-29 20:09:25|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|2019      |
|06/14/2018   |2018-06-14 20:37:56|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|2018      |
|12/30/2005   |2005-12-30 22:40:27|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|2005      |
+-------------+-------------------+----------------------+----------------------+----------+
only showing top 4 rows



In [50]:
# get month from ts

df4 = df2.withColumn("Alarm_month", F.month(F.col("Alarm DtTm")))
df4.show(n=4,truncate=False)

+-------------+-------------------+----------------------+----------------------+-----------+
|Incident Date|Alarm DtTm         |Arrival DtTm          |Close DtTm            |Alarm_month|
+-------------+-------------------+----------------------+----------------------+-----------+
|06/05/2018   |2018-06-05 18:38:01|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|6          |
|08/29/2019   |2019-08-29 20:09:25|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|8          |
|06/14/2018   |2018-06-14 20:37:56|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|6          |
|12/30/2005   |2005-12-30 22:40:27|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|12         |
+-------------+-------------------+----------------------+----------------------+-----------+
only showing top 4 rows



In [51]:
# get day of month
df5 = df2.withColumn("Alarm_month", F.dayofmonth(F.col("Alarm DtTm")))
df5.show(n=4,truncate=False)

+-------------+-------------------+----------------------+----------------------+-----------+
|Incident Date|Alarm DtTm         |Arrival DtTm          |Close DtTm            |Alarm_month|
+-------------+-------------------+----------------------+----------------------+-----------+
|06/05/2018   |2018-06-05 18:38:01|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|5          |
|08/29/2019   |2019-08-29 20:09:25|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|29         |
|06/14/2018   |2018-06-14 20:37:56|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|14         |
|12/30/2005   |2005-12-30 22:40:27|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|30         |
+-------------+-------------------+----------------------+----------------------+-----------+
only showing top 4 rows



In [52]:
# get day name
df6 = df2.withColumn("Alarm_month", F.date_format(F.col("Alarm DtTm"), "E"))
df6.show(n=4,truncate=False)

+-------------+-------------------+----------------------+----------------------+-----------+
|Incident Date|Alarm DtTm         |Arrival DtTm          |Close DtTm            |Alarm_month|
+-------------+-------------------+----------------------+----------------------+-----------+
|06/05/2018   |2018-06-05 18:38:01|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|Tue        |
|08/29/2019   |2019-08-29 20:09:25|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|Thu        |
|06/14/2018   |2018-06-14 20:37:56|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|Thu        |
|12/30/2005   |2005-12-30 22:40:27|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|Fri        |
+-------------+-------------------+----------------------+----------------------+-----------+
only showing top 4 rows



In [53]:
spark.stop()