In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ReadPostgres") \
    .master("spark://spark-master:7077") \
    .config("spark.jars", "/home/jovyan/jars/postgresql-42.7.2.jar") \
    .getOrCreate()



In [5]:
from pyspark.sql import functions as F

In [6]:
import pandas as pd

# Read Excel file
df_pd = pd.read_excel("/data/ITSM/raw/Sample Data file for Analysis_Jan'25.xlsx", engine='openpyxl')

# Convert to PySpark DataFrame
df_itsm = spark.createDataFrame(df_pd)

In [7]:
df_itsm.show(5, truncate=False)
df_itsm.printSchema()

+---------------------------------------------+-----------------------+----------+------------+-----------+-------------------+-------------------+--------------------------+---------+---------------------------------------------+-----------------+-------------------------------------------------------------------------+-----------------------+-----------------+-------------------------------+
|inc_business_service                         |inc_category           |inc_number|inc_priority|inc_sla_due|inc_sys_created_on |inc_resolved_at    |inc_assigned_to           |inc_state|inc_cmdb_ci                                  |inc_caller_id    |inc_short_description                                                    |inc_assignment_group   |inc_close_code   |inc_close_notes                |
+---------------------------------------------+-----------------------+----------+------------+-----------+-------------------+-------------------+--------------------------+---------+----------------------

In [8]:
df_itsm.describe().show()

+-------+--------------------+----------------+-----------+------------+-----------+--------------------+---------+-----------+------------------+---------------------+--------------------+---------------+--------------------+
|summary|inc_business_service|    inc_category| inc_number|inc_priority|inc_sla_due|     inc_assigned_to|inc_state|inc_cmdb_ci|     inc_caller_id|inc_short_description|inc_assignment_group| inc_close_code|     inc_close_notes|
+-------+--------------------+----------------+-----------+------------+-----------+--------------------+---------+-----------+------------------+---------------------+--------------------+---------------+--------------------+
|  count|                4999|            4999|       4999|        4999|       4999|                4999|     4999|       4999|              4999|                 4999|                4999|           4999|                4999|
|   mean|                NULL|            NULL|       NULL|        NULL|       NULL|        

In [9]:
df_itsm.groupBy('inc_priority').count().show()

+------------+-----+
|inc_priority|count|
+------------+-----+
|3 - Moderate| 2658|
|     4 - Low| 1768|
|    2 - High|  554|
|1 - Critical|   19|
+------------+-----+



In [10]:
df_itsm.groupBy('inc_category').count().show()

+--------------------+-----+
|        inc_category|count|
+--------------------+-----+
|    Business Service|   14|
|             Finance|  104|
|Hardware-Infrastr...| 1154|
|            Software| 3488|
|             Network|   39|
|         Workstation|  184|
|       Collaboration|   10|
|          Facilities|    4|
|            Hardware|    2|
+--------------------+-----+



In [11]:
df_itsm.dtypes

[('inc_business_service', 'string'),
 ('inc_category', 'string'),
 ('inc_number', 'string'),
 ('inc_priority', 'string'),
 ('inc_sla_due', 'string'),
 ('inc_sys_created_on', 'timestamp'),
 ('inc_resolved_at', 'timestamp'),
 ('inc_assigned_to', 'string'),
 ('inc_state', 'string'),
 ('inc_cmdb_ci', 'string'),
 ('inc_caller_id', 'string'),
 ('inc_short_description', 'string'),
 ('inc_assignment_group', 'string'),
 ('inc_close_code', 'string'),
 ('inc_close_notes', 'string')]

In [12]:
conditions = []
for c, dtype in df_itsm.dtypes:
    if dtype in ["double", "float"]:
        conditions.append(~(F.col(c).isNull() | F.col(c).isnan()))
    elif dtype == "string":
        conditions.append(~(F.col(c).isNull() | (F.col(c) == "") | (F.lower(F.col(c)) == "nan")))
    else:  # timestamp, int, etc.
        conditions.append(~(F.col(c).isNull()))

# Combine all with AND by starting with 0
final_condition = conditions[0]
for cond in conditions[1:]:
    final_condition = final_condition & cond

df_cleaned = df_itsm.filter(final_condition)
df_cleaned.show(5)


+--------------------+----------------+----------+------------+-----------+-------------------+-------------------+--------------------+---------+--------------------+-----------------+---------------------+--------------------+---------------+--------------------+
|inc_business_service|    inc_category|inc_number|inc_priority|inc_sla_due| inc_sys_created_on|    inc_resolved_at|     inc_assigned_to|inc_state|         inc_cmdb_ci|    inc_caller_id|inc_short_description|inc_assignment_group| inc_close_code|     inc_close_notes|
+--------------------+----------------+----------+------------+-----------+-------------------+-------------------+--------------------+---------+--------------------+-----------------+---------------------+--------------------+---------------+--------------------+
|GRP56.34 PRXX (Di...|Business Service| I68793614|3 - Moderate|    UNKNOWN|2018-10-03 00:00:36|2018-10-03 10:00:39|sn int-test-svc F...|   Closed|GRP56.34 PRXX (Di...|Event Management | Monitor status i

In [13]:
df_cleaned.describe().show()

+-------+--------------------+----------------+-----------+------------+-----------+--------------------+---------+-----------+------------------+---------------------+--------------------+---------------+--------------------+
|summary|inc_business_service|    inc_category| inc_number|inc_priority|inc_sla_due|     inc_assigned_to|inc_state|inc_cmdb_ci|     inc_caller_id|inc_short_description|inc_assignment_group| inc_close_code|     inc_close_notes|
+-------+--------------------+----------------+-----------+------------+-----------+--------------------+---------+-----------+------------------+---------------------+--------------------+---------------+--------------------+
|  count|                3881|            3881|       3881|        3881|       3881|                3881|     3881|       3881|              3881|                 3881|                3881|           3881|                3881|
|   mean|                NULL|            NULL|       NULL|        NULL|       NULL|        

In [14]:
df_cleaned_built = df_itsm.dropna()
df_cleaned_built.describe().show()

+-------+--------------------+----------------+-----------+------------+-----------+--------------------+---------+-----------+------------------+---------------------+--------------------+---------------+--------------------+
|summary|inc_business_service|    inc_category| inc_number|inc_priority|inc_sla_due|     inc_assigned_to|inc_state|inc_cmdb_ci|     inc_caller_id|inc_short_description|inc_assignment_group| inc_close_code|     inc_close_notes|
+-------+--------------------+----------------+-----------+------------+-----------+--------------------+---------+-----------+------------------+---------------------+--------------------+---------------+--------------------+
|  count|                4886|            4886|       4886|        4886|       4886|                4886|     4886|       4886|              4886|                 4886|                4886|           4886|                4886|
|   mean|                NULL|            NULL|       NULL|        NULL|       NULL|        

### Extract Year, Month, and Day from the Created Date.

In [15]:
df_cleaned = df_cleaned.withColumn("Year", F.year(F.col("inc_sys_created_on"))) \
                    .withColumn("Month", F.month(F.col("inc_sys_created_on"))) \
                    .withColumn("Day", F.day(F.col("inc_sys_created_on")))

In [16]:
df_cleaned.show(5)

+--------------------+----------------+----------+------------+-----------+-------------------+-------------------+--------------------+---------+--------------------+-----------------+---------------------+--------------------+---------------+--------------------+----+-----+---+
|inc_business_service|    inc_category|inc_number|inc_priority|inc_sla_due| inc_sys_created_on|    inc_resolved_at|     inc_assigned_to|inc_state|         inc_cmdb_ci|    inc_caller_id|inc_short_description|inc_assignment_group| inc_close_code|     inc_close_notes|Year|Month|Day|
+--------------------+----------------+----------+------------+-----------+-------------------+-------------------+--------------------+---------+--------------------+-----------------+---------------------+--------------------+---------------+--------------------+----+-----+---+
|GRP56.34 PRXX (Di...|Business Service| I68793614|3 - Moderate|    UNKNOWN|2018-10-03 00:00:36|2018-10-03 10:00:39|sn int-test-svc F...|   Closed|GRP56.34 PR

### Calculate the average resolution time per Category and Priority.

In [17]:
df_cleaned = df_cleaned.withColumn("resoultion hours", 
                                   ((F.unix_timestamp("inc_resolved_at")) - F.unix_timestamp("inc_sys_created_on"))/3600)

In [18]:
df_cleaned.show(5)

+--------------------+----------------+----------+------------+-----------+-------------------+-------------------+--------------------+---------+--------------------+-----------------+---------------------+--------------------+---------------+--------------------+----+-----+---+------------------+
|inc_business_service|    inc_category|inc_number|inc_priority|inc_sla_due| inc_sys_created_on|    inc_resolved_at|     inc_assigned_to|inc_state|         inc_cmdb_ci|    inc_caller_id|inc_short_description|inc_assignment_group| inc_close_code|     inc_close_notes|Year|Month|Day|  resoultion hours|
+--------------------+----------------+----------+------------+-----------+-------------------+-------------------+--------------------+---------+--------------------+-----------------+---------------------+--------------------+---------------+--------------------+----+-----+---+------------------+
|GRP56.34 PRXX (Di...|Business Service| I68793614|3 - Moderate|    UNKNOWN|2018-10-03 00:00:36|2018-

In [19]:
df_cleaned.groupBy("inc_category", "inc_priority").agg(F.avg("resoultion hours").alias("AVG")).show(truncate= False)

+-----------------------+------------+-------------------+
|inc_category           |inc_priority|AVG                |
+-----------------------+------------+-------------------+
|Network                |4 - Low     |100.89672514619885 |
|Software               |2 - High    |38.44153686982693  |
|Software               |3 - Moderate|46.92884615384616  |
|Finance                |3 - Moderate|46.33175925925926  |
|Business Service       |3 - Moderate|12.19924603174603  |
|Workstation            |4 - Low     |19.83989625167336  |
|Software               |4 - Low     |143.12214906019508 |
|Workstation            |3 - Moderate|18.95298148148148  |
|Collaboration          |4 - Low     |1.280888888888889  |
|Finance                |2 - High    |3.5927777777777776 |
|Hardware-Infrastructure|2 - High    |214.65509999999998 |
|Finance                |4 - Low     |131.60738782051283 |
|Hardware-Infrastructure|3 - Moderate|144.0966111111111  |
|Hardware-Infrastructure|4 - Low     |44.53176767676768 

### Calculate ticket closure rate per Assigned Group.

In [20]:
df_cleaned.groupBy("inc_state").count().show()

+---------+-----+
|inc_state|count|
+---------+-----+
|   Closed| 3881|
+---------+-----+



In [21]:
# Group by Assigned Group
df_closure_rate = df_itsm.groupBy("inc_assignment_group").agg(
    F.count("*").alias("total_tickets"),
    F.count(F.when(F.col("inc_state") == "Closed", True)).alias("closed_tickets")
).withColumn(
    "closure_rate", F.round((F.col("closed_tickets") / F.col("total_tickets")) * 100, 2)
)
df_closure_rate.orderBy(F.col('closure_rate').asc()).show(5,truncate=False)


+---------------------------------+-------------+--------------+------------+
|inc_assignment_group             |total_tickets|closed_tickets|closure_rate|
+---------------------------------+-------------+--------------+------------+
|ContentExcellence-Streamline-Lev2|3            |0             |0.0         |
|WebHosting-Lev2                  |2            |0             |0.0         |
|Cloud Edge-Infra-SVC-Lev2        |1            |0             |0.0         |
|FIN-CC-Cadency-ABVOps-Global     |1            |0             |0.0         |
|NxGen-CloudDatabaseSQL-SVC-Lev2  |40           |6             |15.0        |
+---------------------------------+-------------+--------------+------------+
only showing top 5 rows



### Create a Monthly Ticket Summary table aggregating the number of tickets, average resolution time, and closure rate per month.

In [22]:
df_cleaned.show(3)

+--------------------+----------------+----------+------------+-----------+-------------------+-------------------+--------------------+---------+--------------------+-----------------+---------------------+--------------------+---------------+--------------------+----+-----+---+------------------+
|inc_business_service|    inc_category|inc_number|inc_priority|inc_sla_due| inc_sys_created_on|    inc_resolved_at|     inc_assigned_to|inc_state|         inc_cmdb_ci|    inc_caller_id|inc_short_description|inc_assignment_group| inc_close_code|     inc_close_notes|Year|Month|Day|  resoultion hours|
+--------------------+----------------+----------+------------+-----------+-------------------+-------------------+--------------------+---------+--------------------+-----------------+---------------------+--------------------+---------------+--------------------+----+-----+---+------------------+
|GRP56.34 PRXX (Di...|Business Service| I68793614|3 - Moderate|    UNKNOWN|2018-10-03 00:00:36|2018-

In [23]:
df_cleaned.groupBy('Month').agg(F.count("*").alias('total_tickets'),
                                F.avg('resoultion hours').alias('average resoultion hours'),
                                F.count(F.when(F.col("inc_state") == "Closed", True)).alias("closed_tickets")
                                ).withColumn(
                                    "closure_rate", F.round((F.col("closed_tickets") / F.col("total_tickets")) * 100, 2)
                                ).show()

+-----+-------------+------------------------+--------------+------------+
|Month|total_tickets|average resoultion hours|closed_tickets|closure_rate|
+-----+-------------+------------------------+--------------+------------+
|   10|         3881|       82.11412050158893|          3881|       100.0|
+-----+-------------+------------------------+--------------+------------+



In [24]:
df_cleaned.show(5)


+--------------------+----------------+----------+------------+-----------+-------------------+-------------------+--------------------+---------+--------------------+-----------------+---------------------+--------------------+---------------+--------------------+----+-----+---+------------------+
|inc_business_service|    inc_category|inc_number|inc_priority|inc_sla_due| inc_sys_created_on|    inc_resolved_at|     inc_assigned_to|inc_state|         inc_cmdb_ci|    inc_caller_id|inc_short_description|inc_assignment_group| inc_close_code|     inc_close_notes|Year|Month|Day|  resoultion hours|
+--------------------+----------------+----------+------------+-----------+-------------------+-------------------+--------------------+---------+--------------------+-----------------+---------------------+--------------------+---------------+--------------------+----+-----+---+------------------+
|GRP56.34 PRXX (Di...|Business Service| I68793614|3 - Moderate|    UNKNOWN|2018-10-03 00:00:36|2018-

In [25]:
df_cleaned.groupBy('Day').count().show()

+---+-----+
|Day|count|
+---+-----+
|  3| 2003|
|  4| 1878|
+---+-----+



In [26]:
df_cleaned.groupBy('inc_category').agg(F.count("resoultion hours")).show()

+--------------------+-----------------------+
|        inc_category|count(resoultion hours)|
+--------------------+-----------------------+
|    Business Service|                     14|
|             Finance|                    104|
|            Software|                   3456|
|             Network|                     39|
|         Workstation|                    181|
|       Collaboration|                     10|
|Hardware-Infrastr...|                     73|
|          Facilities|                      4|
+--------------------+-----------------------+



In [27]:
df_cleaned.write \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://postgres:5432/retail_db") \
    .option("dbtable", "itsm_tickets") \
    .option("user", "postgres") \
    .option("password", "password") \
    .option("driver", "org.postgresql.Driver") \
    .mode("overwrite") \
    .save()


In [28]:
df_itsm.write \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://postgres:5432/retail_db") \
    .option("dbtable", "itsm_tickets_raw") \
    .option("user", "postgres") \
    .option("password", "password") \
    .option("driver", "org.postgresql.Driver") \
    .mode("overwrite") \
    .save()