In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.\
        builder.\
        appName("จาก raw data สู่ Disk").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "3000m").\
        config("spark.executor.cores", "2").\
        config("spark.cores.max", "6").\
        getOrCreate()

23/01/29 02:58:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Read the access log file
raw_df = spark.read.text("access.log").withColumnRenamed('value','log_text').repartition(60)

In [3]:
! wc -l access.log

10365152 access.log


In [4]:
raw_df.count()

                                                                                

10365152

In [5]:
raw_df.rdd.getNumPartitions()

60

In [6]:
### หลังจากบรรทัดนี้เป็นฝีมือ ChatGPT-3

In [7]:
from pyspark.sql.functions import regexp_extract,col,monotonically_increasing_id, when

In [8]:
# Extract feature columns using regular expressions
log_df = raw_df.withColumn("ip", regexp_extract(col("log_text"), "^([\\d.]+)", 1)) \
    .withColumn("request_type", regexp_extract("log_text", r"\"(.*?)\"", 1)) \
.withColumn("status", regexp_extract("log_text", r"\"\s+(\d+)", 1))\
.withColumn("size", regexp_extract("log_text", r"\"\s+\d+\s+(\d+)", 1))\
    .withColumn("timestamp", regexp_extract(col("log_text"), "\\[(.+?)\\]", 1)) \
    .withColumn("timezone", regexp_extract(col("log_text"), "\\[.+?\\s(.+?)\\]", 1))\
.withColumn("OS", regexp_extract(col("log_text"), r"\"Mozilla\/(.*?)\"", 1))


In [9]:
log_df.printSchema()

root
 |-- log_text: string (nullable = true)
 |-- ip: string (nullable = true)
 |-- request_type: string (nullable = true)
 |-- status: string (nullable = true)
 |-- size: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- timezone: string (nullable = true)
 |-- OS: string (nullable = true)



In [10]:
from pyspark.sql.types import *

In [11]:
# split dataframe into 2 dataframes
fact_df = log_df.select("ip", "size", "status")\
.withColumn('size',col('size').cast(FloatType()))\
.withColumn('status',col('status').cast(IntegerType())).dropna()
dim_df = log_df.select("status").withColumn('status',col('status').cast(IntegerType())).dropna()

In [12]:
fact_df.printSchema()

root
 |-- ip: string (nullable = true)
 |-- size: float (nullable = true)
 |-- status: integer (nullable = true)



In [13]:
dim_df.printSchema()

root
 |-- status: integer (nullable = true)



In [14]:
dim_df.count()

                                                                                

10365152

In [15]:
fact_df.count()

                                                                                

10365152

In [16]:
dim_df.describe().show()



+-------+------------------+
|summary|            status|
+-------+------------------+
|  count|          10365152|
|   mean|210.14194890726156|
| stddev|39.214242396992674|
|    min|               200|
|    max|               504|
+-------+------------------+



                                                                                

In [17]:
fact_df.describe().show()



+-------+-------------+------------------+------------------+
|summary|           ip|              size|            status|
+-------+-------------+------------------+------------------+
|  count|     10365152|          10365152|          10365152|
|   mean|         null|12433.109789996326|210.14194890726156|
| stddev|         null| 28126.54453953196|39.214242396992674|
|    min|1.132.107.223|               0.0|               200|
|    max|99.99.188.195|         1249490.0|               504|
+-------+-------------+------------------+------------------+



                                                                                

In [18]:
# add primary key and foreign key to the dataframes
new_dim_df = dim_df.distinct().withColumn("status_id", monotonically_increasing_id())

# join dataframes on foreign key and primary key
new_fact_df = fact_df.join(new_dim_df, fact_df.status == new_dim_df.status, "inner").select(fact_df["*"], new_dim_df["status_id"])

In [19]:
new_fact_df.count()

                                                                                

10365152

In [20]:
new_fact_df.groupBy('status').count().show()



+------+-------+
|status|  count|
+------+-------+
|   206|      3|
|   500|  14266|
|   504|    103|
|   502|    798|
|   301|  67553|
|   400|    586|
|   403|   5634|
|   404| 105011|
|   408|    112|
|   414|     17|
|   200|9579825|
|   304| 340228|
|   499|  50852|
|   302| 199835|
|   405|      6|
|   401|    323|
+------+-------+



                                                                                

In [21]:
new_dim_df.show()

                                                                                

+------+-------------+
|status|    status_id|
+------+-------------+
|   206| 369367187456|
|   500| 463856467968|
|   504| 635655159808|
|   502| 730144440320|
|   301| 867583393792|
|   400| 996432412672|
|   403|1030792151040|
|   404|1073741824000|
|   408|1168231104512|
|   414|1314259992576|
|   200|1331439861760|
|   304|1374389534720|
|   499|1477468749824|
|   302|1563368095744|
|   405|1571958030336|
|   401|1709396983808|
+------+-------------+



In [22]:
new_fact_df.show()

                                                                                

+---------------+--------+------+------------+
|             ip|    size|status|   status_id|
+---------------+--------+------+------------+
| 87.107.218.136|    64.0|   206|369367187456|
| 87.107.218.136|    64.0|   206|369367187456|
| 185.118.137.99|202772.0|   206|369367187456|
|151.239.241.163| 34295.0|   500|463856467968|
|   84.241.11.57|     0.0|   500|463856467968|
|151.239.241.163| 33402.0|   500|463856467968|
| 194.225.55.136|    31.0|   500|463856467968|
|195.181.168.181| 33303.0|   500|463856467968|
|   81.29.241.79|     0.0|   500|463856467968|
|  5.117.116.238| 32994.0|   500|463856467968|
|151.239.241.163|  1086.0|   500|463856467968|
|   5.78.190.233| 33056.0|   500|463856467968|
|  134.19.177.22| 33615.0|   500|463856467968|
|195.181.168.164| 33267.0|   500|463856467968|
|109.125.158.193|    31.0|   500|463856467968|
| 46.224.115.153|    31.0|   500|463856467968|
|  5.117.116.238| 34990.0|   500|463856467968|
|   65.49.68.183| 34985.0|   500|463856467968|
|  162.223.91

In [23]:
new_fact_df.count()

                                                                                

10365152

In [24]:
final_fact_df = new_fact_df.drop('status')

In [25]:
result_join_df = final_fact_df.join(new_dim_df,on=[final_fact_df.status_id == new_dim_df.status_id])

In [26]:
result_join_df.show()

                                                                                

+--------------+----+-------------+------+-------------+
|            ip|size|    status_id|status|    status_id|
+--------------+----+-------------+------+-------------+
| 78.154.37.242| 0.0|1563368095744|   302|1563368095744|
|   5.73.203.90| 0.0|1563368095744|   302|1563368095744|
| 5.116.200.165| 0.0|1563368095744|   302|1563368095744|
|204.18.128.183| 0.0|1563368095744|   302|1563368095744|
|   91.99.30.32| 0.0|1563368095744|   302|1563368095744|
|    77.42.3.85| 0.0|1563368095744|   302|1563368095744|
|  54.36.148.24| 0.0|1563368095744|   302|1563368095744|
|    89.37.5.70| 0.0|1563368095744|   302|1563368095744|
|   5.210.207.0| 0.0|1563368095744|   302|1563368095744|
| 66.249.66.194| 0.0|1563368095744|   302|1563368095744|
| 37.129.190.88| 0.0|1563368095744|   302|1563368095744|
|91.251.151.139| 0.0|1563368095744|   302|1563368095744|
|  31.58.71.174| 0.0|1563368095744|   302|1563368095744|
| 66.249.66.194| 0.0|1563368095744|   302|1563368095744|
| 2.179.116.156| 0.0|1563368095

In [27]:
result_join_df.describe().toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
ip,10365152,,,1.132.107.223,99.99.188.195
size,10365152,12433.109789996326,28126.54453953366,0.0,1249490.0
status_id,10365152,1.3309838485853755E12,6.637947915909843E10,369367187456,1709396983808
status,10365152,210.14194890726156,39.21424239699267,200,504
status_id,10365152,1.3309838485853755E12,6.637947915909843E10,369367187456,1709396983808
