In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# Reuse the active Spark session if one exists, otherwise start one named 'IMDF'.
spark = (
    SparkSession.builder
    .appName('IMDF')
    .getOrCreate()
)

In [0]:
# Load the incident event log; the first row holds the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    'dbfs:/FileStore/tables/incident_event_log_reduced.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

In [0]:
from pyspark.sql.functions import datediff,date_format,to_date,to_timestamp

In [0]:
# Every "<event>_at" column is a string in 'dd/MM/yyyy HH:mm' form. For each
# event add a parsed timestamp column ("<event>_ts") and a date-only column
# ("<event>"), then derive "duration" as the number of days between the
# opened date and the resolved date.
ts_format = 'dd/MM/yyyy HH:mm'
event_names = ['resolved', 'opened', 'sys_created', 'sys_updated', 'closed']

# Timestamp columns first, then date columns, so the column order is unchanged.
for event in event_names:
    df = df.withColumn(event + '_ts', to_timestamp(df[event + '_at'], ts_format))

for event in event_names:
    df = df.withColumn(event, to_date(df[event + '_at'], ts_format))

df = df.withColumn(
    'duration',
    datediff(
        to_date(df['resolved_at'], ts_format),
        to_date(df['opened_at'], ts_format),
    ),
)

In [0]:
# One row per incident number: the 'Closed' event with the highest
# sys_mod_count.
#
# sort() followed by dropDuplicates() does not guarantee which of the
# duplicate rows survives (the ordering can be lost in the shuffle that
# dropDuplicates performs), so the row to keep is chosen explicitly: rank the
# rows of each incident by sys_mod_count descending and keep rank 1.
# If several rows of an incident share the highest sys_mod_count, one of them
# is kept.
df_unique_incidents = (
    df.filter("incident_state=='Closed'")
    .withColumn(
        "_mod_rank",
        F.row_number().over(
            Window.partitionBy("number").orderBy(F.col("sys_mod_count").desc())
        ),
    )
    .filter(F.col("_mod_rank") == 1)
    .drop("_mod_rank")  # helper column only; keep the original schema
)

#### 1. Top 5 people with most resolved incidents

In [0]:
# Count the de-duplicated closed incidents per resolver, largest count first.
A1 = (
    df_unique_incidents.groupBy("resolved_by")
    .count()
    .orderBy("count", ascending=False)
)

In [0]:
# Print the first five rows of A1.
A1.show(n=5)

#### 2. Based on least average duration, find the top 5 people with the maximum number of incidents resolved

In [0]:
from pyspark.sql import functions as F

In [0]:
# Per resolver: how many incidents have a computable duration and the mean of
# those durations (in days), ordered by shortest average first and, for equal
# averages, by most incidents first.
#
# The aggregates are named with alias() rather than by renaming Spark's
# auto-generated "count(duration)" / "avg(duration)" column names.
# Resolvers whose durations are all NULL have a NULL average; a plain ascending
# sort would list them first, so NULL averages are pushed to the end.
A2 = (
    df_unique_incidents.groupby("resolved_by")
    .agg(
        F.count("duration").alias("Incidents Resolved"),
        F.mean("duration").alias("Average Duration"),
    )
    .orderBy(
        F.col("Average Duration").asc_nulls_last(),
        F.col("Incidents Resolved").desc(),
    )
)

In [0]:
# Print the first five rows of A2.
A2.show(n=5)

#### 3. People with maximum number of high impact incidents resolved

In [0]:
# Incident count per (impact, resolver) pair, ordered by impact ascending and,
# within each impact value, by count descending.
A3 = (
    df_unique_incidents.select("resolved_by", "impact", "duration")
    .groupBy("impact", "resolved_by")
    .count()
    .orderBy(["impact", "count"], ascending=[True, False])
)

In [0]:
# Print the first ten rows of A3.
A3.show(n=10)

#### 4a. For each impact level, find the person with the most incidents resolved

In [0]:
# For each impact value, the resolver with the highest incident count.
#
# sort() followed by dropDuplicates() does not guarantee that the first row of
# each group is the one kept, so the top resolver is picked explicitly: rank
# resolvers inside each impact value by count descending (resolved_by breaks
# ties so the result is repeatable) and keep rank 1.
A4a = (
    df_unique_incidents.groupby("impact", "resolved_by")
    .count()
    .withColumn(
        "_rank",
        F.row_number().over(
            Window.partitionBy("impact").orderBy(
                F.col("count").desc(), F.col("resolved_by").asc()
            )
        ),
    )
    .filter(F.col("_rank") == 1)
    .drop("_rank")  # helper column only; output stays (impact, resolved_by, count)
    .sort("impact")
)

In [0]:
# Print A4a (show() prints at most 20 rows by default).
A4a.show()

#### 4b. For each urgency level, find the person with the most incidents resolved

In [0]:
# For each urgency value, the resolver with the highest incident count.
#
# sort() followed by dropDuplicates() does not guarantee that the first row of
# each group is the one kept, so the top resolver is picked explicitly: rank
# resolvers inside each urgency value by count descending (resolved_by breaks
# ties so the result is repeatable) and keep rank 1.
A4b = (
    df_unique_incidents.groupby("urgency", "resolved_by")
    .count()
    .withColumn(
        "_rank",
        F.row_number().over(
            Window.partitionBy("urgency").orderBy(
                F.col("count").desc(), F.col("resolved_by").asc()
            )
        ),
    )
    .filter(F.col("_rank") == 1)
    .drop("_rank")  # helper column only; output stays (urgency, resolved_by, count)
    .sort("urgency")
)

In [0]:
# Print A4b (show() prints at most 20 rows by default).
A4b.show()

#### 4c. For each priority level, find the person with the most incidents resolved

In [0]:
# For each priority value, the resolver with the highest incident count.
#
# sort() followed by dropDuplicates() does not guarantee that the first row of
# each group is the one kept, so the top resolver is picked explicitly: rank
# resolvers inside each priority value by count descending (resolved_by breaks
# ties so the result is repeatable) and keep rank 1.
A4c = (
    df_unique_incidents.groupby("priority", "resolved_by")
    .count()
    .withColumn(
        "_rank",
        F.row_number().over(
            Window.partitionBy("priority").orderBy(
                F.col("count").desc(), F.col("resolved_by").asc()
            )
        ),
    )
    .filter(F.col("_rank") == 1)
    .drop("_rank")  # helper column only; output stays (priority, resolved_by, count)
    .sort("priority")
)

In [0]:
# Print A4c (show() prints at most 20 rows by default).
A4c.show()

#### 5. Find each contact type as a percentage of total incidents

In [0]:
from pyspark.sql.window import Window

In [0]:
# Incident count per contact type plus that count as a percentage of all
# incidents, rounded to two decimals. The empty partitionBy() makes the window
# span every row, so F.sum("count") is the grand total.
A5 = (
    df_unique_incidents.select("contact_type")
    .groupBy("contact_type")
    .count()
    .withColumn(
        "percentage",
        F.round(
            F.col("count") * 100 / F.sum("count").over(Window.partitionBy()),
            2,
        ),
    )
)

In [0]:
# Print A5 (show() prints at most 20 rows by default).
A5.show()

#### 6. On each priority level, find the percentage of incidents which made SLA and which did not.

In [0]:
# For every (priority, made_sla) pair: the incident count ("Population") and
# that count as a percentage of all incidents with the same priority
# ("Made SLA %", rounded to two decimals). Rows are ordered by priority
# ascending, then made_sla descending.
A6 = (
    df_unique_incidents.select("priority", "made_sla")
    .groupBy("priority", "made_sla")
    .count()
    .withColumnRenamed("count", "Population")
    .withColumn(
        "Made SLA %",
        F.round(
            F.col("Population") * 100
            / F.sum("Population").over(Window.partitionBy("priority")),
            2,
        ),
    )
    .orderBy(["priority", "made_sla"], ascending=[True, False])
)

In [0]:
# Raw incident counts per (priority, made_sla) pair, printed for inspection.
(
    df_unique_incidents.select("priority", "made_sla")
    .groupBy("priority", "made_sla")
    .count()
    .show()
)

In [0]:
# Print A6 (show() prints at most 20 rows by default).
A6.show()

#### 7. Top 5 locations with the maximum number of incidents reported

In [0]:
# Number of non-null incident numbers per location, largest count first.
A7 = (
    df_unique_incidents.groupBy("location")
    .agg(F.count("number").alias("Incidents Reported"))
    .orderBy(F.col("Incidents Reported").desc())
)

In [0]:
# Row count per location, largest first; print the first five rows.
(
    df_unique_incidents.groupBy("location")
    .count()
    .withColumnRenamed("count", "Incident Reported")
    .orderBy("Incident Reported", ascending=False)
    .show(5)
)

In [0]:
# Print the first five rows of A7.
A7.show(5)

#### 8. Which category of issues missed meeting the SLA the most?

In [0]:
# Keep only incidents where made_sla is false, count them per category, and
# order the categories by that count, largest first.
A8 = (
    df_unique_incidents.filter("made_sla==false")
    .groupBy("category")
    .agg(F.count("made_sla").alias("Incidents failed to make SLA"))
    .orderBy(F.col("Incidents failed to make SLA").desc())
)

In [0]:
# Print the first five rows of A8.
A8.show(5)

In [0]:
# Row count per category among incidents where made_sla is false, largest
# first; print the first five rows.
(
    df_unique_incidents.filter("made_sla==false")
    .groupBy("category")
    .count()
    .orderBy("count", ascending=False)
    .show(5)
)