In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.\
        builder.\
        appName("จาก raw data สู่ Disk").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "2000m").\
        config("spark.executor.cores", "2").\
        config("spark.cores.max", "9").\
        master("spark://spark-master:7077").config('spark.jars.packages', 'com.microsoft.azure:spark-mssql-connector:1.0.2').\
        getOrCreate()

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.microsoft.azure#spark-mssql-connector added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6e749c58-f3c6-4272-90ad-55ad8b5cda6e;1.0
	confs: [default]
	found com.microsoft.azure#spark-mssql-connector;1.0.2 in central
	found com.microsoft.sqlserver#mssql-jdbc;8.4.1.jre8 in central
:: resolution report :: resolve 107ms :: artifacts dl 2ms
	:: modules in use:
	com.microsoft.azure#spark-mssql-connector;1.0.2 from central in [default]
	com.microsoft.sqlserver#mssql-jdbc;8.4.1.jre8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwn

In [2]:
# Read the access log file
raw_df = spark.read.text("access.log").withColumnRenamed('value','log_text').sample(0.25).repartition(60)

In [3]:
! wc -l access.log

10365152 access.log


In [4]:
raw_df.count()

                                                                                

2588856

In [5]:
raw_df.rdd.getNumPartitions()

60

In [6]:
from pyspark.sql.functions import regexp_extract,col,monotonically_increasing_id, when, udf, regexp_replace

In [7]:
# Extract feature columns using regular expressions
df = raw_df.withColumn("ip", regexp_extract(col("log_text"), "^([\\d.]+)", 1)) \
    .withColumn("request_type", regexp_extract("log_text", r"\"(.*?)\"", 1)) \
.withColumn("status", regexp_extract("log_text", r"\"\s+(\d+)", 1))\
.withColumn("size", regexp_extract("log_text", r"\"\s+\d+\s+(\d+)", 1))\
    .withColumn("timestamp", regexp_extract(col("log_text"), r'\[(.*?)\+', 1)) \
    .withColumn("timezone", regexp_extract(col("log_text"), "\\[.+?\\s(.+?)\\]", 1))\
.withColumn("agent", regexp_extract(col("log_text"), r"\"Mozilla\/(.*?)\"", 1))\
.withColumn("OS", regexp_extract(col("log_text"), "(Windows|Linux|MacOS|iOS|Android)", 1))


In [8]:
df.printSchema()

root
 |-- log_text: string (nullable = true)
 |-- ip: string (nullable = true)
 |-- request_type: string (nullable = true)
 |-- status: string (nullable = true)
 |-- size: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- timezone: string (nullable = true)
 |-- agent: string (nullable = true)
 |-- OS: string (nullable = true)



In [9]:
raw_df.rdd.getNumPartitions()

60

In [10]:
# Import necessary libraries
from pyspark.sql.functions import monotonically_increasing_id, when, lit
from pyspark.sql.types import StringType

# Define the predefined columns for the dimension tables
dim_cols = ["ip", "status", "timestamp", "OS"]

# Replace empty values in the "OS" column with "N/A"
df = df.withColumn("OS", when(df["OS"] == "", lit("N/A")).otherwise(df["OS"]))

# Create the dimension tables and the primary keys
dim_tables = []
for dim_col in dim_cols:
  dim_table = df.select(dim_col).dropDuplicates().withColumn(f"{dim_col}_pk", monotonically_increasing_id())
  dim_tables.append(dim_table)

# Create the fact table by joining the dimension tables and adding the foreign keys
fact_table = df
for dim_table in dim_tables:
  dim_col = dim_table.columns[0]
  fact_table = fact_table.join(dim_table, dim_col, "left")
  fact_table = fact_table.withColumn(f"{dim_col}_fk", fact_table[f"{dim_col}_pk"])

In [11]:
# Drop the columns with a suffix of "_pk" and the columns that are the same in both the fact table and the dimension tables from the fact table
cols_to_drop = [col for col in fact_table.columns if "_pk" in col or col in dim_cols]
fact_table = fact_table.drop(*cols_to_drop)

In [12]:
fact_table.printSchema()

root
 |-- log_text: string (nullable = true)
 |-- request_type: string (nullable = true)
 |-- size: string (nullable = true)
 |-- timezone: string (nullable = true)
 |-- agent: string (nullable = true)
 |-- ip_fk: long (nullable = true)
 |-- status_fk: long (nullable = true)
 |-- timestamp_fk: long (nullable = true)
 |-- OS_fk: long (nullable = true)



In [13]:
fact_table.show()

                                                                                

+--------------------+--------------------+-----+--------+--------------------+-------------+------------+------------+------------+
|            log_text|        request_type| size|timezone|               agent|        ip_fk|   status_fk|timestamp_fk|       OS_fk|
+--------------------+--------------------+-----+--------+--------------------+-------------+------------+------------+------------+
|89.196.66.66 - - ...|GET /static/image...| 5807|   +0330|5.0 (iPhone; CPU ...| 489626271796| 85899345920|        1158|274877906944|
|89.196.66.66 - - ...|GET /settings/log...| 4120|   +0330|5.0 (iPhone; CPU ...| 489626271796| 85899345920|        1158|274877906944|
|89.196.66.66 - - ...|GET /static/image...| 6454|   +0330|5.0 (iPhone; CPU ...| 489626271796| 85899345920|        1158|274877906944|
|89.196.66.66 - - ...|GET /static/image...| 7713|   +0330|5.0 (iPhone; CPU ...| 489626271796| 85899345920|        1158|274877906944|
|89.196.66.66 - - ...|GET /static/image...| 3863|   +0330|5.0 (iPhone

                                                                                

In [14]:
fact_table.describe().toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
log_text,2588856,,,1.132.107.223 - - [26/Jan/2019:02:26:25 +0330]...,99.99.188.195 - - [25/Jan/2019:05:33:18 +0330]...
request_type,2588856,,,,t3 9.2.0.0
size,2588856,12432.542023581072,28224.91818821393,0,9999
timezone,2588856,330.0,0.0,+0330,+0330
agent,2588856,5.009933774834437,0.09933664825247294,,6.0
ip_fk,2588856,8.557616758753436E11,4.889046976572585E11,0,1709396984589
status_fk,2588856,1.5415186124336426E11,2.6113197495897772E11,85899345920,1657857376256
timestamp_fk,2588856,8.548644975710552E11,4.954765138927949E11,0,1709396985564
OS_fk,2588856,1.204541735819733E12,5.144568591722636E11,274877906944,1683627180032


In [15]:
dim_tables[0].describe().toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
ip,163445,,,1.132.107.223,99.99.188.195
ip_pk,163445,8.546866936016863E11,4.969138347881612E11,0,1709396984589


In [16]:
dim_tables[1].describe().toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
status,16,384.5625,97.90333242540827,200,504
status_pk,16,9.164386467840625E11,5.075163148494362E11,85899345920,1657857376256


In [17]:
dim_tables[2].describe().toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
timestamp,362388,,,22/Jan/2019:03:56:18,26/Jan/2019:20:29:13
timestamp_pk,362388,8.545869195180164E11,4.958780198948574E11,0,1709396985564


In [18]:
dim_tables[3].describe().toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
OS,5,,,Android,iOS
OS_pk,5,1.0084583211008E12,5.808091753964819E11,274877906944,1683627180032


In [19]:
#fact_table.describe().toPandas().transpose()

In [20]:
for dim_table in dim_tables:
    dim_col = dim_table.columns[0]
    print(dim_col)

ip
status
timestamp
OS


In [24]:


server_name = "jdbc:sqlserver://35.209.118.36"
database_name = "webaccesslogdb"
url = server_name + ";" + "databaseName=" + database_name + ";"

table_name = "Inventory"
username = "SA"
password = "Passw0rd123456" # Please specify password here



In [25]:
# Write the dimension tables and the fact table into MSSQL
for dim_table in dim_tables:
    dim_col = dim_table.columns[0]
    print(dim_col)
    print(f"<table_name_{dim_col}>")

ip
<table_name_ip>
status
<table_name_status>
timestamp
<table_name_timestamp>
OS
<table_name_OS>


In [28]:
# Write the dimension tables and the fact table into MSSQL
for dim_table in dim_tables:
    dim_col = dim_table.columns[0]
    dim_table.coalesce(1).write.format("jdbc").options(url=url, driver="com.microsoft.sqlserver.jdbc.SQLServerDriver", dbtable=f"table_name_{dim_col}", user=username, password=password).mode("overwrite").save()
    fact_table.coalesce(1).write.format("jdbc").options(url=url, driver="com.microsoft.sqlserver.jdbc.SQLServerDriver", dbtable="table_name_fact", user=username, password=password).mode("overwrite").save()

                                                                                