In [38]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port', '0'). \
config("spark.sql.warehouse.dir", f"/user/itv015970/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [39]:
logschema = "loglevel string, logtime timestamp"

In [40]:
log_df = spark.read \
.format("csv") \
.schema(logschema) \
.load("/public/trendytech/datasets/logdata1m.csv")

In [41]:
log_df.show(5)

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|    INFO|2015-08-08 20:49:22|
|    WARN|2015-01-14 20:05:00|
|    INFO|2017-06-14 00:08:35|
|    INFO|2016-01-18 11:50:14|
|   DEBUG|2017-07-01 12:55:02|
+--------+-------------------+
only showing top 5 rows



In [42]:
log_df.count()

1000000

In [43]:
log_df.createOrReplaceTempView("serverlogs")

In [44]:
spark.sql("select * from serverlogs").show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|    INFO|2015-08-08 20:49:22|
|    WARN|2015-01-14 20:05:00|
|    INFO|2017-06-14 00:08:35|
|    INFO|2016-01-18 11:50:14|
|   DEBUG|2017-07-01 12:55:02|
|    INFO|2014-02-26 12:34:21|
|    INFO|2015-07-12 11:13:47|
|    INFO|2017-04-15 01:20:18|
|   DEBUG|2016-11-02 20:19:23|
|    INFO|2012-08-20 10:09:44|
|   DEBUG|2014-04-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
|    INFO|2014-12-09 09:53:54|
|   DEBUG|2015-11-08 19:20:08|
|    INFO|2017-07-21 18:34:18|
|   DEBUG|2014-12-26 06:38:42|
+--------+-------------------+
only showing top 20 rows



In [45]:
spark.sql("select loglevel, date_format(logtime, 'MMMM') as month from serverlogs").show()

+--------+--------+
|loglevel|   month|
+--------+--------+
|    INFO|  August|
|    WARN| January|
|    INFO|    June|
|    INFO| January|
|   DEBUG|    July|
|    INFO|February|
|    INFO|    July|
|    INFO|   April|
|   DEBUG|November|
|    INFO|  August|
|   DEBUG|   April|
|    WARN|December|
|   DEBUG| January|
|   DEBUG|    June|
|   ERROR|    June|
|   DEBUG|    June|
|    INFO|December|
|   DEBUG|November|
|    INFO|    July|
|   DEBUG|December|
+--------+--------+
only showing top 20 rows



In [46]:
spark.sql("select loglevel, date_format(logtime, 'MMM') as month, count(*) as total_occurance from serverlogs group by loglevel, month order by month").show()

+--------+-----+---------------+
|loglevel|month|total_occurance|
+--------+-----+---------------+
|    WARN|  Apr|           8277|
|   ERROR|  Apr|           4107|
|   FATAL|  Apr|             83|
|    INFO|  Apr|          29302|
|   DEBUG|  Apr|          41869|
|   ERROR|  Aug|           3987|
|   DEBUG|  Aug|          42147|
|    WARN|  Aug|           8381|
|   FATAL|  Aug|             80|
|    INFO|  Aug|          28993|
|   DEBUG|  Dec|          41749|
|    INFO|  Dec|          28874|
|    WARN|  Dec|           8328|
|   ERROR|  Dec|           4106|
|   FATAL|  Dec|             94|
|    INFO|  Feb|          28983|
|   ERROR|  Feb|           4013|
|    WARN|  Feb|           8266|
|   DEBUG|  Feb|          41734|
|   FATAL|  Feb|             72|
+--------+-----+---------------+
only showing top 20 rows



In [47]:
result_df = spark.sql("""select loglevel, date_format(logtime, 'MMM') as month, int(date_format(logtime, 'M')) as month_num, count(*) as total_occurance 
from serverlogs 
group by loglevel, month, month_num
order by month_num""")

In [48]:
final_df = result_df.drop("month_num")

In [49]:
final_df.show()

+--------+-----+---------------+
|loglevel|month|total_occurance|
+--------+-----+---------------+
|   ERROR|  Jan|           4054|
|   FATAL|  Jan|             94|
|    WARN|  Jan|           8217|
|    INFO|  Jan|          29119|
|   DEBUG|  Jan|          41961|
|    WARN|  Feb|           8266|
|   DEBUG|  Feb|          41734|
|   FATAL|  Feb|             72|
|    INFO|  Feb|          28983|
|   ERROR|  Feb|           4013|
|   FATAL|  Mar|             70|
|    INFO|  Mar|          29095|
|   ERROR|  Mar|           4122|
|   DEBUG|  Mar|          41652|
|    WARN|  Mar|           8165|
|   FATAL|  Apr|             83|
|    INFO|  Apr|          29302|
|   ERROR|  Apr|           4107|
|   DEBUG|  Apr|          41869|
|    WARN|  Apr|           8277|
+--------+-----+---------------+
only showing top 20 rows



In [53]:
spark.sql("select loglevel, date_format(logtime, 'MM') as month from serverlogs").groupBy('loglevel').pivot('month').count().show()

+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|loglevel|   01|   02|   03|   04|   05|   06|   07|   08|   09|   10|   11|   12|
+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|    INFO|29119|28983|29095|29302|28900|29143|29300|28993|29038|29018|23301|28874|
|   ERROR| 4054| 4013| 4122| 4107| 4086| 4059| 3976| 3987| 4161| 4040| 3389| 4106|
|    WARN| 8217| 8266| 8165| 8277| 8403| 8191| 8222| 8381| 8352| 8226| 6616| 8328|
|   DEBUG|41961|41734|41652|41869|41785|41774|42085|42147|41433|41936|33366|41749|
|   FATAL|   94|   72|   70|   83|   60|   78|   98|   80|   81|   92|16797|   94|
+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+



In [54]:
spark.sql("select loglevel, date_format(logtime, 'MMMM') as month from serverlogs").groupBy('loglevel').pivot('month').count().show()

+--------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|loglevel|April|August|December|February|January| July| June|March|  May|November|October|September|
+--------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|    INFO|29302| 28993|   28874|   28983|  29119|29300|29143|29095|28900|   23301|  29018|    29038|
|   ERROR| 4107|  3987|    4106|    4013|   4054| 3976| 4059| 4122| 4086|    3389|   4040|     4161|
|    WARN| 8277|  8381|    8328|    8266|   8217| 8222| 8191| 8165| 8403|    6616|   8226|     8352|
|   FATAL|   83|    80|      94|      72|     94|   98|   78|   70|   60|   16797|     92|       81|
|   DEBUG|41869| 42147|   41749|   41734|  41961|42085|41774|41652|41785|   33366|  41936|    41433|
+--------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+



In [55]:
month_list = ['January', 'Febraury', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

In [56]:
print(month_list)

['January', 'Febraury', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']


In [57]:
spark.sql("select loglevel, date_format(logtime, 'MMMM') as month from serverlogs").groupBy('loglevel').pivot('month',month_list).count().show()

+--------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+
|loglevel|January|Febraury|March|April|  May| June| July|August|September|October|November|December|
+--------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+
|    INFO|  29119|    null|29095|29302|28900|29143|29300| 28993|    29038|  29018|   23301|   28874|
|   ERROR|   4054|    null| 4122| 4107| 4086| 4059| 3976|  3987|     4161|   4040|    3389|    4106|
|    WARN|   8217|    null| 8165| 8277| 8403| 8191| 8222|  8381|     8352|   8226|    6616|    8328|
|   FATAL|     94|    null|   70|   83|   60|   78|   98|    80|       81|     92|   16797|      94|
|   DEBUG|  41961|    null|41652|41869|41785|41774|42085| 42147|    41433|  41936|   33366|   41749|
+--------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+

