In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.\
        builder.\
        appName("จาก raw data สู่ Disk").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "3000m").\
        config("spark.executor.cores", "2").\
        config("spark.cores.max", "6").\
        getOrCreate()

23/01/29 17:45:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Read the access log file
raw_df = spark.read.text("access.log").withColumnRenamed('value','log_text').repartition(60)

In [3]:
! wc -l access.log

10365152 access.log


In [4]:
raw_df.count()

                                                                                

10365152

In [5]:
raw_df.rdd.getNumPartitions()

60

In [6]:
### หลังจากบรรทัดนี้เป็นฝีมือ ChatGPT-3

In [7]:
from pyspark.sql.functions import regexp_extract,col,monotonically_increasing_id, when

In [8]:
# Extract feature columns using regular expressions
log_df = raw_df.withColumn("ip", regexp_extract(col("log_text"), "^([\\d.]+)", 1)) \
    .withColumn("request_type", regexp_extract("log_text", r"\"(.*?)\"", 1)) \
.withColumn("status", regexp_extract("log_text", r"\"\s+(\d+)", 1))\
.withColumn("size", regexp_extract("log_text", r"\"\s+\d+\s+(\d+)", 1))\
    .withColumn("timestamp", regexp_extract(col("log_text"), "\\[(.+?)\\]", 1)) \
    .withColumn("timezone", regexp_extract(col("log_text"), "\\[.+?\\s(.+?)\\]", 1))\
.withColumn("agent", regexp_extract(col("log_text"), r"\"Mozilla\/(.*?)\"", 1))\
.withColumn("OS", regexp_extract(col("log_text"), "(Windows|Linux|MacOS|iOS|Android)", 1))


In [9]:
log_df.printSchema()

root
 |-- log_text: string (nullable = true)
 |-- ip: string (nullable = true)
 |-- request_type: string (nullable = true)
 |-- status: string (nullable = true)
 |-- size: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- timezone: string (nullable = true)
 |-- agent: string (nullable = true)
 |-- OS: string (nullable = true)



In [10]:
from pyspark.sql.types import *

### สร้าง FACT

In [11]:
# split dataframe into 2 dataframes
fact_df = log_df.select("ip", "size", "status","agent","OS")\
                .withColumn('size',col('size').cast(FloatType()))\
                .withColumn('status',col('status').cast(IntegerType())).dropna()

### สร้าง DIM_STATUS

In [12]:
status_dim_df = log_df.select("status").withColumn('status',col('status').cast(IntegerType())).dropna()

In [13]:
fact_df.printSchema()

root
 |-- ip: string (nullable = true)
 |-- size: float (nullable = true)
 |-- status: integer (nullable = true)
 |-- agent: string (nullable = true)
 |-- OS: string (nullable = true)



In [14]:
status_dim_df.printSchema()

root
 |-- status: integer (nullable = true)



In [15]:
status_dim_df.count()

                                                                                

10365152

In [16]:
fact_df.count()

                                                                                

10365152

In [17]:
status_dim_df.describe().show()



+-------+------------------+
|summary|            status|
+-------+------------------+
|  count|          10365152|
|   mean|210.14194890726156|
| stddev| 39.21424239699265|
|    min|               200|
|    max|               504|
+-------+------------------+



                                                                                

In [18]:
fact_df.describe().show()



+-------+-------------+------------------+------------------+-------------------+--------+
|summary|           ip|              size|            status|              agent|      OS|
+-------+-------------+------------------+------------------+-------------------+--------+
|  count|     10365152|          10365152|          10365152|           10365152|10365152|
|   mean|         null|12433.109789996326|210.14194890726156|  5.008038585209003|    null|
| stddev|         null|28126.544539532006| 39.21424239699265|0.08933298077682743|    null|
|    min|1.132.107.223|               0.0|               200|                   |        |
|    max|99.99.188.195|         1249490.0|               504|                6.0|     iOS|
+-------+-------------+------------------+------------------+-------------------+--------+



                                                                                

In [19]:
# add primary key and foreign key to the dataframes
new_status_dim_df = status_dim_df.distinct().withColumn("status_id", monotonically_increasing_id())

# join dataframes on foreign key and primary key
fact_status_df = fact_df.join(new_status_dim_df, fact_df.status == new_status_dim_df.status, "inner")\
.select(fact_df["*"], new_status_dim_df["status_id"])

In [20]:
fact_status_df.count()

                                                                                

10365152

In [21]:
fact_status_df.groupBy('status').count().show()



+------+-------+
|status|  count|
+------+-------+
|   206|      3|
|   500|  14266|
|   504|    103|
|   502|    798|
|   301|  67553|
|   400|    586|
|   403|   5634|
|   404| 105011|
|   408|    112|
|   414|     17|
|   200|9579825|
|   304| 340228|
|   499|  50852|
|   302| 199835|
|   405|      6|
|   401|    323|
+------+-------+



                                                                                

In [22]:
new_status_dim_df.show()

                                                                                

+------+-------------+
|status|    status_id|
+------+-------------+
|   206| 369367187456|
|   500| 463856467968|
|   504| 635655159808|
|   502| 730144440320|
|   301| 867583393792|
|   400| 996432412672|
|   403|1030792151040|
|   404|1073741824000|
|   408|1168231104512|
|   414|1314259992576|
|   200|1331439861760|
|   304|1374389534720|
|   499|1477468749824|
|   302|1563368095744|
|   405|1571958030336|
|   401|1709396983808|
+------+-------------+



In [23]:
fact_status_df.show()

                                                                                

+---------------+--------+------+--------------------+-------+------------+
|             ip|    size|status|               agent|     OS|   status_id|
+---------------+--------+------+--------------------+-------+------------+
| 87.107.218.136|    64.0|   206|4.0 (compatible; ...|Windows|369367187456|
| 87.107.218.136|    64.0|   206|4.0 (compatible; ...|Windows|369367187456|
| 185.118.137.99|202772.0|   206|5.0 (Windows NT 6...|Windows|369367187456|
|  54.38.212.174| 33077.0|   500|5.0 (Windows NT 1...|Windows|463856467968|
|  134.19.177.20|    31.0|   500|5.0 (Macintosh; I...|       |463856467968|
|    5.217.85.37|    31.0|   500|5.0 (Windows NT 1...|Windows|463856467968|
|151.239.241.163|   857.0|   500|5.0 (Windows NT 6...|Windows|463856467968|
|    91.99.30.32| 34189.0|   500|5.0 (Windows NT 1...|Windows|463856467968|
| 178.252.143.10|    31.0|   500|5.0 (Windows NT 1...|Windows|463856467968|
|  104.222.32.94| 34093.0|   500|5.0 (Windows NT 1...|Windows|463856467968|
|    91.99.3

In [24]:
fact_status_df.count()

                                                                                

10365152

In [25]:
##final_fact_status_df = fact_status_df.drop('status')
final_fact_status_df = fact_status_df

In [26]:
result_join_df = final_fact_status_df.join(new_status_dim_df,on=[final_fact_status_df.status_id == new_status_dim_df.status_id])

### สร้าง DIM_AGENT

In [27]:
# split dataframe into agent Dim
agent_dim_df = log_df.select("agent","OS")\
.withColumn('agent',col('agent').cast(StringType())).dropna()\
.withColumn('OS',col('OS').cast(StringType())).dropna()

In [28]:
agent_dim_df.printSchema()

root
 |-- agent: string (nullable = true)
 |-- OS: string (nullable = true)



In [29]:
agent_dim_df.count()

                                                                                

10365152

In [30]:
agent_dim_df.describe().show()



+-------+------------------+--------+
|summary|             agent|      OS|
+-------+------------------+--------+
|  count|          10365152|10365152|
|   mean| 5.008038585209003|    null|
| stddev|0.0893329807768274|    null|
|    min|                  |        |
|    max|               6.0|     iOS|
+-------+------------------+--------+



                                                                                

In [31]:
# add primary key and foreign key to the dataframes
new_agent_dim_df = agent_dim_df.distinct().withColumn("agent_id", monotonically_increasing_id())

# join dataframes on foreign key and primary key
fact_status_agent_df = final_fact_status_df.join(new_agent_dim_df, final_fact_status_df.agent == new_agent_dim_df.agent, "inner")\
.select(final_fact_status_df["*"], new_agent_dim_df["agent_id"])

In [32]:
fact_status_agent_df.count()

                                                                                

15397947

In [33]:
fact_status_agent_df.groupBy('agent').count().show()



+--------------------+-----+
|               agent|count|
+--------------------+-----+
|5.0 (Android 4.2....|  122|
|5.0 (Android 4.4....|   89|
|5.0 (Android 4.4....|   12|
|5.0 (Android 4.4....| 1287|
|5.0 (Android 7.0;...|   27|
|5.0 (BB10; Touch)...|    1|
|5.0 (Linux; Andro...|  249|
|5.0 (Linux; Andro...|    2|
|5.0 (Linux; Andro...|   19|
|5.0 (Linux; Andro...|   95|
|5.0 (Linux; Andro...|    1|
|5.0 (Linux; Andro...|   31|
|5.0 (Linux; Andro...|   34|
|5.0 (Linux; Andro...|    2|
|5.0 (Linux; Andro...|    1|
|5.0 (Linux; Andro...|   77|
|5.0 (Linux; Andro...|    1|
|5.0 (Linux; Andro...|   41|
|5.0 (Linux; Andro...|   37|
|5.0 (Linux; Andro...|   37|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [34]:
new_agent_dim_df.show()



+--------------------+-------+--------+
|               agent|     OS|agent_id|
+--------------------+-------+--------+
|5.0 (Linux; Andro...|  Linux|       0|
|5.0 (Linux; Andro...|  Linux|       1|
|5.0 (Windows NT 6...|Windows|       2|
|5.0 (Linux; Andro...|  Linux|       3|
|5.0 (Linux; Andro...|  Linux|       4|
|5.0 (Linux; Andro...|  Linux|       5|
|5.0 (Linux; U; An...|  Linux|       6|
|5.0 (Linux; Andro...|  Linux|       7|
|5.0 (Linux; Andro...|  Linux|       8|
|5.0 (Linux; Andro...|  Linux|       9|
|5.0 (Linux; Andro...|  Linux|      10|
|5.0 (Windows NT 6...|Windows|      11|
|5.0 (Linux; U; An...|  Linux|      12|
|5.0 (Linux; Andro...|  Linux|      13|
|5.0 (Linux; Andro...|  Linux|      14|
|5.0 (Linux; Andro...|  Linux|      15|
|5.0 (Linux; Andro...|  Linux|      16|
|5.0 (Linux; Andro...|  Linux|      17|
|5.0 (Linux; Andro...|  Linux|      18|
|5.0 (Linux; Andro...|  Linux|      19|
+--------------------+-------+--------+
only showing top 20 rows



                                                                                

In [35]:
fact_status_agent_df.show()



+-------------+-------+------+--------------------+-------+-------------+------------+
|           ip|   size|status|               agent|     OS|    status_id|    agent_id|
+-------------+-------+------+--------------------+-------+-------------+------------+
|37.137.254.94|    0.0|   302|5.0 (Android 4.2....|Android|1563368095744|824633720870|
|37.137.120.56|28536.0|   200|5.0 (Android 4.2....|Android|1331439861760|824633720870|
|37.137.120.56|15753.0|   200|5.0 (Android 4.2....|Android|1331439861760|824633720870|
|37.137.254.94|15705.0|   200|5.0 (Android 4.2....|Android|1331439861760|824633720870|
|37.137.120.56| 4120.0|   200|5.0 (Android 4.2....|Android|1331439861760|824633720870|
|37.137.120.56|13720.0|   200|5.0 (Android 4.2....|Android|1331439861760|824633720870|
|37.137.254.94|  133.0|   200|5.0 (Android 4.2....|Android|1331439861760|824633720870|
|37.137.254.94| 7713.0|   200|5.0 (Android 4.2....|Android|1331439861760|824633720870|
|37.137.120.56| 3924.0|   200|5.0 (Android 

                                                                                

In [36]:
fact_status_agent_df.count()

                                                                                

15397947

In [37]:
##final_fact_status_agent_df = fact_status_agent_df.drop('agent')
final_fact_status_agent_df = fact_status_agent_df

In [38]:
final_fact_status_agent_df

DataFrame[ip: string, size: float, status: int, agent: string, OS: string, status_id: bigint, agent_id: bigint]

In [39]:
new_agent_dim_df

DataFrame[agent: string, OS: string, agent_id: bigint]

In [40]:
result_join_df = final_fact_status_agent_df.join(new_agent_dim_df,on=[final_fact_status_agent_df.agent_id == new_agent_dim_df.agent_id])\
.join(new_status_dim_df,on=[final_fact_status_agent_df.status_id == new_status_dim_df.status_id])

In [41]:
result_join_df.show()



+---------------+----+------+--------------------+-------+-------------+------------+--------------------+-------+------------+------+-------------+
|             ip|size|status|               agent|     OS|    status_id|    agent_id|               agent|     OS|    agent_id|status|    status_id|
+---------------+----+------+--------------------+-------+-------------+------------+--------------------+-------+------------+------+-------------+
|  5.120.224.223| 0.0|   302|5.0 (Linux; Andro...|  Linux|1563368095744|  8589934658|5.0 (Linux; Andro...|  Linux|  8589934658|   302|1563368095744|
|  151.239.48.53| 0.0|   302|5.0 (iPad; CPU OS...|    iOS|1563368095744| 68719476787|5.0 (Linux; Andro...|  Linux| 68719476787|   302|1563368095744|
| 151.239.46.207| 0.0|   302|5.0 (iPad; CPU OS...|    iOS|1563368095744| 68719476787|5.0 (Linux; Andro...|  Linux| 68719476787|   302|1563368095744|
|   5.116.82.224| 0.0|   302|5.0 (Linux; Andro...|  Linux|1563368095744| 68719476798|5.0 (Linux; Andro...|

                                                                                

In [42]:
result_join_df.describe().toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
ip,15397947,,,1.132.107.223,99.99.188.195
size,15397947,12566.818527820624,27636.460037583496,0.0,1249490.0
status,15397947,216.61880587067873,47.92632606752844,200,504
agent,15397947,5.008038585209003,0.08933298077682761,,6.0
OS,15397947,,,,iOS
status_id,15397947,1.327651978770148E12,7.905064580893845E10,369367187456,1709396983808
agent_id,15397947,7.020609533020508E11,5.2277036211666364E11,0,1709396983956
agent,15397947,5.002900232018561,0.05379125427530267,,6.0
OS,15397947,,,,iOS
