In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Create the notebook's Spark session (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName("Content Watchtime")
    .getOrCreate()
)

# Only emit log messages at WARN level or above.
spark.sparkContext.setLogLevel('WARN')

# Reading CSV file.

In [22]:
# Load the tab-delimited, header-less input file using Spark's built-in CSV
# source (the old 'com.databricks.spark.csv' name is only a legacy alias for
# it). With no header row the columns get positional names (_c0, _c1, ...),
# and their types are inferred from the data.
data = (
    spark.read.format('csv')
    .options(header=False, inferSchema=True, delimiter="\t")
    .load('sparkBigData.csv')
)

# UDF to remove brackets.

In [23]:
# UDF to remove [ ] from the string.
def timeStamp(timeinList):
    """Strip every '[' and ']' character from a string.

    Parameters
    ----------
    timeinList : str or None
        Text that may contain square brackets.

    Returns
    -------
    str or None
        The input with all '[' and ']' characters removed. ``None`` is
        passed through unchanged instead of raising ``AttributeError``.
    """
    if timeinList is None:
        return None
    return timeinList.replace("[", '').replace(']', '')

# Spark UDF wrapper around timeStamp; the resulting column is a string.
timer = F.udf(
    lambda raw_value: timeStamp(raw_value),
    returnType=StringType(),
)

# Pre-Processing and Calculating.

In [24]:
# Pattern of the input timestamp text after the brackets are stripped.
# Named TIMESTAMP_FORMAT (not `format`) so the builtin format() is not
# shadowed for the rest of the notebook.
TIMESTAMP_FORMAT = "dd/MMM/yyyy:HH:mm:ssZ"

# Give the positional CSV columns readable names and derive the analysis
# columns:
#   - timestamp : _c2 and _c3 concatenated, brackets removed by the `timer`
#                 UDF, then parsed with TIMESTAMP_FORMAT.
#   - Throughput: Bytes divided by Send/Time.
#   - hour      : hour of the parsed timestamp, used for grouping later.
data = (
    data.select(
        F.col('_c0').alias('Country'),
        F.col('_c1').alias('ASN'),
        F.unix_timestamp(
            timer(F.concat_ws("", F.col('_c2'), F.col('_c3'))),
            format=TIMESTAMP_FORMAT,
        ).cast('timestamp').alias('timestamp'),
        # NOTE(review): the trailing space in 'Metric A ' is kept as-is so the
        # column name does not change; confirm before cleaning it up.
        F.col('_c4').alias('Metric A '),
        F.col('_c5').alias('Co Server'),
        F.col('_c6').alias('Bytes'),
        F.col('_c7').alias('Send/Time'),
    )
    .withColumn('Throughput', F.col('Bytes') / F.col('Send/Time'))
    .withColumn('hour', F.hour('timestamp'))
)

# Replace null Throughput values (presumably rows whose Send/Time is zero --
# confirm) with 0.0; the grouping cells filter on Throughput != 0.0.
data = data.fillna({'Throughput': 0.0})
data.show(truncate=False)

+-------+-----+-------------------+---------+---------+------+---------+------------------+----+
|Country|ASN  |timestamp          |Metric A |Co Server|Bytes |Send/Time|Throughput        |hour|
+-------+-----+-------------------+---------+---------+------+---------+------------------+----+
|de     |31334|2015-08-08 02:00:01|76410    |302      |435   |0.326    |1334.355828220859 |2   |
|pl     |50231|2015-08-08 02:00:01|126746   |200      |7400  |0.0      |0.0               |2   |
|gr     |3329 |2015-08-08 02:00:01|126474   |206      |17711 |0.0      |0.0               |2   |
|tr     |9121 |2015-08-08 02:00:02|76406    |200      |19589 |0.0      |0.0               |2   |
|se     |3301 |2015-08-08 21:59:00|76406    |200      |17960 |0.0      |0.0               |21  |
|kz     |9198 |2015-08-08 22:00:02|76406    |200      |2403  |0.0      |0.0               |22  |
|ch     |6830 |2015-08-08 22:02:22|76406    |200      |16099 |0.0      |0.0               |22  |
|rs     |21246|2015-08-09 22:4

# Grouping By Co Server, ASN and Both.

In [18]:
# Average throughput over the rows with non-zero Throughput, highest first,
# grouped by Co Server, by ASN, and by (ASN, Co Server).
nonzero_throughput = data.where(F.col('Throughput') != 0.0)
for group_cols in (['Co Server'], ['ASN'], ['ASN', 'Co Server']):
    (nonzero_throughput
        .groupBy(*group_cols)
        .agg(F.avg('Throughput'))
        .orderBy('avg(Throughput)', ascending=False)
        .show())

+---------+------------------+
|Co Server|   avg(Throughput)|
+---------+------------------+
|      200|2138478.5550050954|
|      206|1077081.4964914257|
|      302| 1334.355828220859|
+---------+------------------+

+-----+------------------+
|  ASN|   avg(Throughput)|
+-----+------------------+
| 8447| 3369235.632183908|
| 3320|1523100.0164156894|
| 1901| 1505408.510638298|
|15685|1124780.4232804233|
| 9050| 601055.5555555555|
|31334| 1334.355828220859|
+-----+------------------+

+-----+---------+------------------+
|  ASN|Co Server|   avg(Throughput)|
+-----+---------+------------------+
| 8447|      200| 3369235.632183908|
| 3320|      200|1523100.0164156894|
| 1901|      206| 1505408.510638298|
|15685|      206|1124780.4232804233|
| 9050|      206| 601055.5555555555|
|31334|      302| 1334.355828220859|
+-----+---------+------------------+



# Grouping By Country-ASN, Country-Co Server.

In [19]:
# Average throughput over the rows with non-zero Throughput, highest first,
# grouped by (Country, ASN) and then by (Country, Co Server).
nonzero_throughput = data.where(F.col('Throughput') != 0.0)
for group_cols in (['Country', 'ASN'], ['Country', 'Co Server']):
    (nonzero_throughput
        .groupBy(*group_cols)
        .agg(F.avg('Throughput'))
        .orderBy('avg(Throughput)', ascending=False)
        .show())

+-------+-----+------------------+
|Country|  ASN|   avg(Throughput)|
+-------+-----+------------------+
|     at| 8447| 3369235.632183908|
|     de| 3320|1523100.0164156894|
|     at| 1901| 1505408.510638298|
|     cz|15685|1124780.4232804233|
|     ro| 9050| 601055.5555555555|
|     de|31334| 1334.355828220859|
+-------+-----+------------------+

+-------+---------+------------------+
|Country|Co Server|   avg(Throughput)|
+-------+---------+------------------+
|     at|      200| 3369235.632183908|
|     de|      200|1523100.0164156894|
|     at|      206| 1505408.510638298|
|     cz|      206|1124780.4232804233|
|     ro|      206| 601055.5555555555|
|     de|      302| 1334.355828220859|
+-------+---------+------------------+



# Grouping By Hour-Co Server, Hour-ASN.

In [20]:
# Average throughput over the rows with non-zero Throughput, highest first,
# grouped by (hour, Co Server) and then by (hour, ASN).
nonzero_throughput = data.where(F.col('Throughput') != 0.0)
for group_cols in (['hour', 'Co Server'], ['hour', 'ASN']):
    (nonzero_throughput
        .groupBy(*group_cols)
        .agg(F.avg('Throughput'))
        .orderBy('avg(Throughput)', ascending=False)
        .show())

+----+---------+------------------+
|hour|Co Server|   avg(Throughput)|
+----+---------+------------------+
|   2|      200| 2895510.027662517|
|  14|      200|2104232.1521267015|
|  15|      206| 1505408.510638298|
|  17|      206|1124780.4232804233|
|  16|      200| 727154.8183254345|
|   2|      206| 601055.5555555555|
|   2|      302| 1334.355828220859|
+----+---------+------------------+

+----+-----+------------------+
|hour|  ASN|   avg(Throughput)|
+----+-----+------------------+
|   2| 8447| 5582333.333333333|
|  14| 3320| 2578279.262672811|
|  15| 1901| 1505408.510638298|
|  14| 8447|1156137.9310344828|
|  17|15685|1124780.4232804233|
|  16| 3320| 727154.8183254345|
|   2| 9050| 601055.5555555555|
|   2| 3320|208686.72199170126|
|   2|31334| 1334.355828220859|
+----+-----+------------------+



# Grouping By Country-Hour-Co Server, Country-Hour-ASN.

In [21]:
# Average throughput over the rows with non-zero Throughput, highest first,
# grouped by (Country, hour, Co Server) and then by (Country, hour, ASN).
nonzero_throughput = data.where(F.col('Throughput') != 0.0)
for group_cols in (['Country', 'hour', 'Co Server'], ['Country', 'hour', 'ASN']):
    (nonzero_throughput
        .groupBy(*group_cols)
        .agg(F.avg('Throughput'))
        .orderBy('avg(Throughput)', ascending=False)
        .show())

+-------+----+---------+------------------+
|Country|hour|Co Server|   avg(Throughput)|
+-------+----+---------+------------------+
|     at|   2|      200| 5582333.333333333|
|     de|  14|      200| 2578279.262672811|
|     at|  15|      206| 1505408.510638298|
|     at|  14|      200|1156137.9310344828|
|     cz|  17|      206|1124780.4232804233|
|     de|  16|      200| 727154.8183254345|
|     ro|   2|      206| 601055.5555555555|
|     de|   2|      200|208686.72199170126|
|     de|   2|      302| 1334.355828220859|
+-------+----+---------+------------------+

+-------+----+-----+------------------+
|Country|hour|  ASN|   avg(Throughput)|
+-------+----+-----+------------------+
|     at|   2| 8447| 5582333.333333333|
|     de|  14| 3320| 2578279.262672811|
|     at|  15| 1901| 1505408.510638298|
|     at|  14| 8447|1156137.9310344828|
|     cz|  17|15685|1124780.4232804233|
|     de|  16| 3320| 727154.8183254345|
|     ro|   2| 9050| 601055.5555555555|
|     de|   2| 3320|208686.