In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.\
        builder.\
        appName("จาก raw data สู่ Disk").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "3000m").\
        config("spark.executor.cores", "2").\
        config("spark.cores.max", "6").\
        getOrCreate()

23/01/29 06:04:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Read the access log file
raw_df = spark.read.text("access.log").withColumnRenamed('value','log_text').repartition(60)

In [3]:
! wc -l access.log

10365152 access.log


In [4]:
raw_df.count()

                                                                                

10365152

In [5]:
raw_df.rdd.getNumPartitions()

60

In [6]:
### หลังจากบรรทัดนี้เป็นฝีมือ ChatGPT-3

In [7]:
from pyspark.sql.functions import regexp_extract,col,monotonically_increasing_id, when

In [8]:
# Extract feature columns using regular expressions
log_df = raw_df.withColumn("ip", regexp_extract(col("log_text"), "^([\\d.]+)", 1)) \
    .withColumn("request_type", regexp_extract("log_text", r"\"(.*?)\"", 1)) \
.withColumn("status", regexp_extract("log_text", r"\"\s+(\d+)", 1))\
.withColumn("size", regexp_extract("log_text", r"\"\s+\d+\s+(\d+)", 1))\
    .withColumn("timestamp", regexp_extract(col("log_text"), "\\[(.+?)\\]", 1)) \
    .withColumn("timezone", regexp_extract(col("log_text"), "\\[.+?\\s(.+?)\\]", 1))\
.withColumn("OS", regexp_extract(col("log_text"), r"\"Mozilla\/(.*?)\"", 1))


In [9]:
log_df.printSchema()

root
 |-- log_text: string (nullable = true)
 |-- ip: string (nullable = true)
 |-- request_type: string (nullable = true)
 |-- status: string (nullable = true)
 |-- size: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- timezone: string (nullable = true)
 |-- OS: string (nullable = true)



In [10]:
from pyspark.sql.types import *

In [11]:
# split dataframe into 2 dataframes
fact_df = log_df.select("ip", "size", "status","OS")\
                .withColumn('size',col('size').cast(FloatType()))\
                .withColumn('status',col('status').cast(IntegerType())).dropna()
status_dim_df = log_df.select("status").withColumn('status',col('status').cast(IntegerType())).dropna()

In [12]:
fact_df.printSchema()

root
 |-- ip: string (nullable = true)
 |-- size: float (nullable = true)
 |-- status: integer (nullable = true)
 |-- OS: string (nullable = true)



In [13]:
status_dim_df.printSchema()

root
 |-- status: integer (nullable = true)



In [14]:
status_dim_df.count()

                                                                                

10365152

In [15]:
fact_df.count()

                                                                                

10365152

In [16]:
status_dim_df.describe().show()



+-------+------------------+
|summary|            status|
+-------+------------------+
|  count|          10365152|
|   mean|210.14194890726156|
| stddev| 39.21424239699265|
|    min|               200|
|    max|               504|
+-------+------------------+



                                                                                

In [17]:
fact_df.describe().show()



+-------+-------------+------------------+------------------+-------------------+
|summary|           ip|              size|            status|                 OS|
+-------+-------------+------------------+------------------+-------------------+
|  count|     10365152|          10365152|          10365152|           10365152|
|   mean|         null|12433.109789996326|210.14194890726156|  5.008038585209003|
| stddev|         null|28126.544539532006| 39.21424239699267|0.08933298077682743|
|    min|1.132.107.223|               0.0|               200|                   |
|    max|99.99.188.195|         1249490.0|               504|                6.0|
+-------+-------------+------------------+------------------+-------------------+



                                                                                

In [18]:
# add primary key and foreign key to the dataframes
new_status_dim_df = status_dim_df.distinct().withColumn("status_id", monotonically_increasing_id())

# join dataframes on foreign key and primary key
fact_status_df = fact_df.join(new_status_dim_df, fact_df.status == new_status_dim_df.status, "inner")\
.select(fact_df["*"], new_status_dim_df["status_id"])

In [19]:
fact_status_df.count()

                                                                                

10365152

In [20]:
fact_status_df.groupBy('status').count().show()



+------+-------+
|status|  count|
+------+-------+
|   206|      3|
|   500|  14266|
|   504|    103|
|   502|    798|
|   301|  67553|
|   400|    586|
|   403|   5634|
|   404| 105011|
|   408|    112|
|   414|     17|
|   200|9579825|
|   304| 340228|
|   499|  50852|
|   302| 199835|
|   405|      6|
|   401|    323|
+------+-------+



                                                                                

In [21]:
new_status_dim_df.show()

                                                                                

+------+-------------+
|status|    status_id|
+------+-------------+
|   206| 369367187456|
|   500| 463856467968|
|   504| 635655159808|
|   502| 730144440320|
|   301| 867583393792|
|   400| 996432412672|
|   403|1030792151040|
|   404|1073741824000|
|   408|1168231104512|
|   414|1314259992576|
|   200|1331439861760|
|   304|1374389534720|
|   499|1477468749824|
|   302|1563368095744|
|   405|1571958030336|
|   401|1709396983808|
+------+-------------+



In [22]:
fact_status_df.show()

                                                                                

+---------------+--------+------+--------------------+------------+
|             ip|    size|status|                  OS|   status_id|
+---------------+--------+------+--------------------+------------+
| 185.118.137.99|202772.0|   206|5.0 (Windows NT 6...|369367187456|
| 87.107.218.136|    64.0|   206|4.0 (compatible; ...|369367187456|
| 87.107.218.136|    64.0|   206|4.0 (compatible; ...|369367187456|
|151.239.241.163|   857.0|   500|5.0 (Windows NT 6...|463856467968|
|    91.99.30.32|   775.0|   500|5.0 (Windows NT 1...|463856467968|
|  134.19.177.20| 34560.0|   500|5.0 (Macintosh; I...|463856467968|
|   2.190.144.28| 34197.0|   500|5.0 (Windows NT 1...|463856467968|
|151.239.241.163| 35111.0|   500|5.0 (Windows NT 6...|463856467968|
|151.239.241.163| 35111.0|   500|5.0 (Windows NT 6...|463856467968|
|  85.133.238.88|    31.0|   500|5.0 (Windows NT 1...|463856467968|
|    91.99.47.57| 35203.0|   500|5.0 (Windows NT 1...|463856467968|
|    91.99.47.57| 34011.0|   500|5.0 (Windows NT

In [23]:
fact_status_df.count()

                                                                                

10365152

In [24]:
##final_fact_status_df = fact_status_df.drop('status')
final_fact_status_df = fact_status_df

In [25]:
result_join_df = final_fact_status_df.join(new_status_dim_df,on=[final_fact_status_df.status_id == new_status_dim_df.status_id])

In [26]:
result_join_df.show()

                                                                                

+---------------+----+------+--------------------+-------------+------+-------------+
|             ip|size|status|                  OS|    status_id|status|    status_id|
+---------------+----+------+--------------------+-------------+------+-------------+
| 94.183.129.254| 0.0|   302|5.0 (Windows NT 6...|1563368095744|   302|1563368095744|
|  66.249.66.194| 0.0|   302|5.0 (compatible; ...|1563368095744|   302|1563368095744|
| 217.218.130.15| 0.0|   302|5.0 (Windows NT 6...|1563368095744|   302|1563368095744|
|  66.249.66.194| 0.0|   302|5.0 (compatible; ...|1563368095744|   302|1563368095744|
| 188.210.123.38| 0.0|   302|5.0 (Linux; Andro...|1563368095744|   302|1563368095744|
|  66.249.66.194| 0.0|   302|5.0 (compatible; ...|1563368095744|   302|1563368095744|
|     5.120.94.5| 0.0|   302|5.0 (Android 8.0....|1563368095744|   302|1563368095744|
|  66.249.66.194| 0.0|   302|5.0 (compatible; ...|1563368095744|   302|1563368095744|
|   54.36.149.82| 0.0|   302|5.0 (compatible; ...|1563

In [27]:
# split dataframe into OS Dim
OS_dim_df = log_df.select("OS").withColumn('OS',col('OS').cast(StringType())).dropna()

In [28]:
OS_dim_df.printSchema()

root
 |-- OS: string (nullable = true)



In [29]:
OS_dim_df.count()

                                                                                

10365152

In [30]:
OS_dim_df.describe().show()



+-------+-------------------+
|summary|                 OS|
+-------+-------------------+
|  count|           10365152|
|   mean|  5.008038585209003|
| stddev|0.08933298077682741|
|    min|                   |
|    max|                6.0|
+-------+-------------------+



                                                                                

In [31]:
# add primary key and foreign key to the dataframes
new_OS_dim_df = OS_dim_df.distinct().withColumn("OS_id", monotonically_increasing_id())

# join dataframes on foreign key and primary key
fact_status_OS_df = final_fact_status_df.join(new_OS_dim_df, final_fact_status_df.OS == new_OS_dim_df.OS, "inner")\
.select(final_fact_status_df["*"], new_OS_dim_df["OS_id"])

In [32]:
fact_status_OS_df.count()

                                                                                

10365152

In [33]:
fact_status_OS_df.groupBy('OS').count().show()



+--------------------+-----+
|                  OS|count|
+--------------------+-----+
|5.0 (Android 4.2....|  122|
|5.0 (Android 4.4....|   89|
|5.0 (Android 4.4....|   12|
|5.0 (Android 4.4....| 1287|
|5.0 (Android 7.0;...|   27|
|5.0 (BB10; Touch)...|    1|
|5.0 (Linux; Andro...|  249|
|5.0 (Linux; Andro...|    2|
|5.0 (Linux; Andro...|   19|
|5.0 (Linux; Andro...|   95|
|5.0 (Linux; Andro...|    1|
|5.0 (Linux; Andro...|   31|
|5.0 (Linux; Andro...|   34|
|5.0 (Linux; Andro...|    2|
|5.0 (Linux; Andro...|    1|
|5.0 (Linux; Andro...|   77|
|5.0 (Linux; Andro...|    1|
|5.0 (Linux; Andro...|   41|
|5.0 (Linux; Andro...|   37|
|5.0 (Linux; Andro...|   37|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [34]:
new_OS_dim_df.show()



+--------------------+-----+
|                  OS|OS_id|
+--------------------+-----+
|5.0 (Linux; Andro...|    0|
|5.0 (Linux; Andro...|    1|
|5.0 (Windows NT 6...|    2|
|5.0 (Android 4.4....|    3|
|5.0 (Windows NT 6...|    4|
|5.0 (Linux; Andro...|    5|
|5.0 (Linux; Andro...|    6|
|5.0 (Linux; Andro...|    7|
|5.0 (Windows NT 6...|    8|
|5.0 (Linux; Andro...|    9|
|5.0 (Android 4.4....|   10|
|5.0 (Linux; Andro...|   11|
|5.0 (Linux; Andro...|   12|
|5.0 (iPhone; CPU ...|   13|
|5.0 (Linux; Andro...|   14|
|5.0 (Linux; Andro...|   15|
|5.0 (Linux; Andro...|   16|
|5.0 (Android 4.2....|   17|
|5.0 (iPhone; CPU ...|   18|
|5.0 (Linux; Andro...|   19|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [35]:
fact_status_OS_df.show()



+-------------+-------+------+--------------------+-------------+-----+
|           ip|   size|status|                  OS|    status_id|OS_id|
+-------------+-------+------+--------------------+-------------+-----+
|37.137.254.94|    0.0|   302|5.0 (Android 4.2....|1563368095744|   21|
|37.137.120.56|28536.0|   200|5.0 (Android 4.2....|1331439861760|   21|
|37.137.120.56| 5299.0|   200|5.0 (Android 4.2....|1331439861760|   21|
|37.137.254.94| 7146.0|   200|5.0 (Android 4.2....|1331439861760|   21|
|37.137.120.56|65152.0|   200|5.0 (Android 4.2....|1331439861760|   21|
|37.137.120.56|15848.0|   200|5.0 (Android 4.2....|1331439861760|   21|
|37.137.120.56|28536.0|   200|5.0 (Android 4.2....|1331439861760|   21|
| 93.114.26.83|16801.0|   200|5.0 (Android 4.2....|1331439861760|   21|
|37.137.120.56|16228.0|   200|5.0 (Android 4.2....|1331439861760|   21|
|37.137.120.56| 4401.0|   200|5.0 (Android 4.2....|1331439861760|   21|
|37.137.120.56|19182.0|   200|5.0 (Android 4.2....|1331439861760

                                                                                

In [36]:
fact_status_OS_df.count()

                                                                                

10365152

In [37]:
##final_fact_status_OS_df = fact_status_OS_df.drop('OS')
final_fact_status_OS_df = fact_status_OS_df

In [38]:
final_fact_status_OS_df

DataFrame[ip: string, size: float, status: int, OS: string, status_id: bigint, OS_id: bigint]

In [43]:
result_join_df = final_fact_status_OS_df.join(new_OS_dim_df,on=[final_fact_status_OS_df.OS_id == new_OS_dim_df.OS_id])\
.join(new_status_dim_df,on=[final_fact_status_OS_df.status_id == new_status_dim_df.status_id])

In [44]:
result_join_df.show()



+--------------+----+------+--------------------+-------------+------------+--------------------+------------+------+-------------+
|            ip|size|status|                  OS|    status_id|       OS_id|                  OS|       OS_id|status|    status_id|
+--------------+----+------+--------------------+-------------+------------+--------------------+------------+------+-------------+
|   5.115.55.98| 0.0|   302|5.0 (iPad; CPU OS...|1563368095744| 34359738398|5.0 (Linux; Andro...| 34359738398|   302|1563368095744|
|   5.115.55.98| 0.0|   302|5.0 (iPad; CPU OS...|1563368095744| 34359738398|5.0 (Linux; Andro...| 34359738398|   302|1563368095744|
|87.107.180.197| 0.0|   302|5.0 (Linux; Andro...|1563368095744| 68719476798|5.0 (Linux; U; An...| 68719476798|   302|1563368095744|
|   5.117.36.58| 0.0|   302|5.0 (Windows NT 5...|1563368095744|137438953476|5.0 (Windows NT 5...|137438953476|   302|1563368095744|
| 91.108.156.93| 0.0|   302|5.0 (Windows NT 5...|1563368095744|137438953476|

                                                                                

In [45]:
result_join_df.describe().toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
ip,10365152,,,1.132.107.223,99.99.188.195
size,10365152,12433.109789996326,28126.54453953264,0.0,1249490.0
status,10365152,210.14194890726156,39.21424239699265,200,504
OS,10365152,5.008038585209003,0.08933298077682761,,6.0
status_id,10365152,1.3309838485853755E12,6.637947915909843E10,369367187456,1709396983808
OS_id,10365152,9.590687921232532E11,4.681975036953393E11,0,1709396983933
OS,10365152,5.042194092827004,0.20145718172021673,,6.0
OS_id,10365152,9.590687921232532E11,4.681975036953393E11,0,1709396983933
status,10365152,210.14194890726156,39.21424239699265,200,504
