In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Create (or reuse) the notebook's Spark session with Hive support enabled.
spark = (
    SparkSession.builder
    .appName('Web Log Report Analysis')
    .enableHiveSupport()
    .getOrCreate()
)

In [0]:
# Column names, in positional order, that are applied to the loaded log data
# through toDF(*columns).
columns = [
    'Timestamp',
    'ReportType',
    'Target',
    'Referrer',
    'Link',
    'SessionId',
    'SessionCount',
    'PageTitle',
    'LoadTime',
    'ViewTime',
    'Embedded',
    'Cookie',
    'HSResponseTime',
    'PrefetchElement',
    'ElementsinHints',
    'HintAlreadySeen',
    'Viewedfor1sttimePrefetched',
    'Viewed1sttimenotPrefetched',
    'ConxSpeed',
    'ConxType',
    'PrevConxType',
    'VisitstoOrder',
    'DaystoOrder',
    'VisitFreq',
    'PurchaseFreq',
    'VisChip',
    'TimeinSession',
    'PreprocRules',
    'Secondssincelastpage',
    'ScreenResolution',
    'ColorDepth',
    'CookiesEnabled',
    'ReferringURL',
    'Product1stVisit',
    'FlashVersion',
    'UserAgent',
    'RemoteIP',
    'Serial',
    'TargetMatches',
    'NormalizedTarget',
    'ThirdPartyCookieEnabled',
    'Dummy',
]

In [0]:
# Load the '}'-delimited log file with schema inference, name the columns,
# cast SessionCount to bigint, and drop the 'Dummy' column.
data = (
    spark.read
    .option("delimiter", "}")
    .csv("/FileStore/Data.txt", inferSchema='true')
    .toDF(*columns)
    .withColumn('SessionCount', col('SessionCount').cast('bigint'))
    .drop('Dummy')
)

In [0]:
# Register the DataFrame as the temp view 'weblog' so the SQL cells can query it.
data.createOrReplaceTempView(name='weblog')

In [0]:
# Count every row exposed through the 'weblog' view.
row_count_query = 'select count(*) as count from weblog'
df = spark.sql(row_count_query)
display(df)

count
219


Session Report
* The number of sessions within the selected time frame.

Session Count 
* Session count is a record of the number of times a user visits your website within a given timeframe, such as day, week, or month. A cookie on the visitor’s browser is used to track the user’s activity between the time they enter your domain and the time they leave

In [0]:
# Session totals per calendar hour, plus each hour's share of all sessions.
#
# The aggregation groups on the derived date and hour rather than on the raw
# Timestamp column: grouping on Timestamp produces one row per distinct
# timestamp value, so an hour containing several timestamps would be split
# across multiple rows instead of being summed into one "by hour" figure.
# Ordering by Date first keeps hours from different days from interleaving.
spark.sql("""
    select from_unixtime(Timestamp, "yyyy-MM-dd") as Date
          ,from_unixtime(Timestamp, "HH") as Time
          ,sum(SessionCount) as SessionCountByHour
          ,(sum(SessionCount)/(select sum(SessionCount) from weblog) * 100) as Percentage
    from weblog
    group by from_unixtime(Timestamp, "yyyy-MM-dd"), from_unixtime(Timestamp, "HH")
    order by Date, Time
""").show()

Page Views Report
* The number of page views within the selected time frame.

Page Views
* A pageview is each time a visitor views a page on your website, regardless of how many hits are generated

In [0]:
# Page views per calendar hour, plus each hour's share of all rows in the view.
#
# Rows are grouped on the derived date and hour instead of the raw Timestamp:
# grouping on Timestamp yields one row per distinct timestamp value, which
# splits an hour into several rows whenever it contains more than one
# timestamp. Ordering by Date, then Time keeps multi-day data chronological.
df = spark.sql("""
select from_unixtime(Timestamp, "yyyy-MM-dd") as Date,
       from_unixtime(Timestamp, "HH") as Time,
       count(PageTitle) as PagesViews,
       (count(PageTitle)/(select count(*) from weblog) * 100) as Percentage
from weblog
group by from_unixtime(Timestamp, "yyyy-MM-dd"), from_unixtime(Timestamp, "HH")
order by Date, Time""")
display(df)

Date,Time,PagesViews,Percentage
2019-05-18,1,11,5.0228310502283104
2019-05-18,2,15,6.849315068493151
2019-05-18,3,30,13.6986301369863
2019-05-18,4,28,12.78538812785388
2019-05-18,5,32,14.61187214611872
2019-05-18,6,47,21.461187214611872
2019-05-18,7,22,10.04566210045662
2019-05-18,8,34,15.52511415525114


New Visitor Report
* New visitors are the number of distinct new users who visited the website during a given time period.

Using Dataframe Operations

In [0]:
# New visitors per date and hour: count the rows whose
# Viewedfor1sttimePrefetched value equals 'YES'.
visit_date = from_unixtime('Timestamp', "yyyy-MM-dd").alias('Date')
visit_hour = from_unixtime('Timestamp', "HH").alias('Time')
is_first_view = col('Viewedfor1sttimePrefetched') == 'YES'

df = (
    data
    .select(visit_date, visit_hour, 'Viewedfor1sttimePrefetched')
    .filter(is_first_view)
    .groupBy('Date', 'Time')
    .agg(count('*').alias('New_Visitor'))
    .orderBy('Time')
)
display(df)

Date,Time,New_Visitor
2019-05-18,1,6
2019-05-18,2,8
2019-05-18,3,14
2019-05-18,4,14
2019-05-18,5,16
2019-05-18,6,30
2019-05-18,7,8
2019-05-18,8,19


Using SQL Operations

In [0]:
# New visitors per referrer, date and hour, busiest combinations first.
#
# Only rows with Viewedfor1sttimePrefetched = "YES" are counted. The grouping
# uses the derived date and hour rather than the raw Timestamp so that every
# row of a referrer within the same hour lands in one group; grouping on
# Timestamp would emit a separate row for each distinct timestamp value.
df = spark.sql("""
    select Referrer,
           from_unixtime(Timestamp, "yyyy-MM-dd") as Date,
           from_unixtime(Timestamp, "HH") as Time,
           count(Viewedfor1sttimePrefetched) as NewVisitor
    from weblog
    where Viewedfor1sttimePrefetched = "YES"
    group by Referrer,
             from_unixtime(Timestamp, "yyyy-MM-dd"),
             from_unixtime(Timestamp, "HH")
    order by count(Viewedfor1sttimePrefetched) desc""")
display(df)

Referrer,Date,Time,NewVisitor
www.wechat.com,2019-05-18,6,3
www.tieba.baidu.com,2019-05-18,6,3
www.snapchat.com,2019-05-18,6,3
www.skype.com,2019-05-18,4,3
www.viber.com,2019-05-18,6,2
www.weibo.com,2019-05-18,4,2
www.vk.com,2019-05-18,3,2
www.renren.com,2019-05-18,8,2
www.taringa.net,2019-05-18,8,2
www.tumblr.com,2019-05-18,5,2


Referring Domains Report
* Referring domains are websites that end users visited before coming to your website. They can indicate popular links to your website.

In [0]:
# Per-referrer totals: summed SessionCount, number of rows with a Referrer
# (reported as Orders), and the sum of the second '/'-separated piece of
# NormalizedTarget (reported as revenue; the string piece is summed via
# Spark's implicit numeric cast).
#
# The statement is passed without a trailing ';': a terminator inside
# spark.sql() is not accepted by every Spark parser version, and none of the
# other spark.sql() calls in this notebook use one.
df = spark.sql("""
    SELECT Referrer,
           sum(SessionCount) as Session,
           count(Referrer) as Orders,
           sum(split(NormalizedTarget, '/')[1]) as `Revenue(in $)`
    from weblog
    group by Referrer""")
display(df)

Referrer,Session,Orders,Revenue(in $)
www.taringa.net,105,10,47546.0
www.snapchat.com,96,12,62749.0
www.qq.com,130,12,55367.0
www.foursquare.com,83,8,48707.0
www.tieba.baidu.com,57,6,33008.0
www.renren.com,89,13,58873.0
www.tumblr.com,80,9,33580.0
www.reddit.com,118,13,74768.0
www.wechat.com,93,8,22479.0
www.google.com,105,10,44768.0


Top IP Addresses Report
* This report ranks the IP addresses of visitors accessing your website in terms of number of sessions

In [0]:
# Per-RemoteIP totals, ordered by summed SessionCount from highest to lowest.
top_ip_query = """SELECT RemoteIP, sum(SessionCount) as Session, count(RemoteIP) as Orders, 
sum(split(NormalizedTarget, '/')[1]) as Revenue 
from weblog group by RemoteIP order by sum(SessionCount) desc"""
df = spark.sql(top_ip_query)
display(df)

RemoteIP,Session,Orders,Revenue
10.0.1.1,95,10,53226.0
192.168.1.10,86,9,52344.0
192.168.1.1,86,10,58979.0
192.168.0.10,78,8,45170.0
192.168.1.99,77,6,24334.0
192.168.55.1,74,7,26139.0
192.168.10.50,70,6,33529.0
192.168.11.1,68,7,30694.0
10.0.0.2,64,5,20707.0
192.168.1.100,62,6,22358.0


Search Query Report
* Search queries are the key words entered into Internet search engines that provided results directing end users to your Web site.
  This report depicts the top search queries that led users to your site and allows you to compare the number of page hits received by each search query

In [0]:
# Requests per search query, where the query is the second '='-separated
# piece of PreprocRules; most requested first.
search_term = split('PreprocRules', '=').getItem(1).alias('SearchQuery')

df = (
    data
    .select(search_term)
    .groupBy('SearchQuery')
    .agg(count('*').alias('Requests'))
    .orderBy(col('Requests').desc())
)
display(df)

SearchQuery,Requests
Mobile,15
Health Care Appliances,14
Television,14
Desktop PCs,13
Watches,13
Beauty and Grooming,12
Kitchen Appliances,12
Home Entertainment,11
Laptops,11
Mens Footware,10


Payment Type

In [0]:
# Number of rows per VisChip value.
# The view is registered as 'weblog'; the query uses that exact casing so the
# lookup also resolves when case-sensitive analysis (spark.sql.caseSensitive)
# is enabled, and so it matches every other query in this notebook.
df = spark.sql("""select VisChip, count(VisChip) from weblog group by VisChip""")
display(df)

VisChip,count(VisChip)
Discover,49
VISA,62
American Express,47
MasterCard,61


In [0]:
# Number of rows per ConxType value.
conx_type_query = """select ConxType, count(ConxType) from weblog group by ConxType"""
df = spark.sql(conx_type_query)
display(df)

ConxType,count(ConxType)
LTE,62
CDMA,52
GSM,46
WiMax,59


Browser Used for Shopping

In [0]:
%sql

-- Number of rows per UserAgent value; UserAgent is displayed as Browser.
SELECT UserAgent AS Browser,
       count(UserAgent)
FROM weblog
GROUP BY UserAgent;

Browser,count(UserAgent)
Opera Neon,8
Internet Explorer,10
Microsoft Edge,10
Firefox,11
Vivaldi,11
Mozilla,8
SeaMonkey,11
Safari,17
Apple Safari,9
Google Chrome,12


Device Type

In [0]:
%sql

-- Number of rows per Serial value (displayed as DeviceType), most frequent first.
SELECT Serial AS DeviceType,
       count(Serial) AS countofSerial
FROM weblog
GROUP BY Serial
ORDER BY countofSerial DESC;

DeviceType,countofSerial
iOS 7,22
Android 5.0 to 5.1.1: Android Lollipop,21
Android 8.0 to Android 8.1: Android Oreo,19
Android 4.1 to 4.3.1: Android Jelly Bean,17
iOS 10,17
iOS 11,15
iOS 12,15
Android 9.0: Android Pie,15
Android 4.4 to 4.4.4: Android KitKat,14
iOS 8,14
