# Data Collection

In [1]:
! wget https://docs.google.com/presentation/d/1-wZ-CZCstmCW-U8BmYl834xPd--f857V26WbKCilVeQ/edit#

--2022-01-24 15:06:49--  https://docs.google.com/presentation/d/1-wZ-CZCstmCW-U8BmYl834xPd--f857V26WbKCilVeQ/edit
Resolving docs.google.com (docs.google.com)... 209.85.145.139, 209.85.145.100, 209.85.145.138, ...
Connecting to docs.google.com (docs.google.com)|209.85.145.139|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘edit.2’

edit.2                  [ <=>                ] 631.92K  --.-KB/s    in 0.1s    

2022-01-24 15:06:50 (5.00 MB/s) - ‘edit.2’ saved [647083]



In [2]:
! head -3 nasa.dat

199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985
199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] "GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0" 200 4085


In [3]:
! wc -l nasa.dat

1891714 nasa.dat


# Put them into HDFS

In [4]:
! hdfs dfs -mkdir -p /rawzone/

! hdfs dfs -put nasa.dat /rawzone/

! hdfs dfs -ls /rawzone/nasa.dat

put: `/rawzone/nasa.dat': File exists
-rw-r--r--   2 root hadoop  205242368 2022-01-24 11:26 /rawzone/nasa.dat


In [5]:
raw_rdd = sc.textFile('/rawzone/nasa.dat')

raw_rdd.take(3)

                                                                                

['199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245',
 'unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985',
 '199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] "GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0" 200 4085']

In [6]:
import time
import datetime
import re
from pyspark.sql import Row

APACHE_ACCESS_LOG_PATTERN = '(\S*) - - \[(\d{2})\/(\S*)\/(\d{4}):(\d{2}):(\d{2}):(\d{2}) (\S*)\]'


In [7]:
def bejoindate(year,month,date):
    s = '-'
    seq = (year,month,date)
    return s.join(seq)

def bejointime(hour,minute,second):
    s = ':'
    seq = (hour,minute,second)
    return s.join(seq)

def bejoindatetime(date_name,time_name):
    s = ' '
    seq = (date_name,time_name)
    return s.join(seq)

def totimestamp(dt): ###24Jan2022
    return dt

In [8]:
def parse_apache_log_line(logline):
    pattern = re.compile(APACHE_ACCESS_LOG_PATTERN)
    result = pattern.match(logline)
    if result is None:
        return Row(
        datetime_stamp = None,
        ip_addr = None,
        day_of_month = None,
        month = None,
        year = None,
        hour = None,
        minute = None,
        second = None,
        timezone = None
        )
    else:
        return Row(
        datetime_stamp = totimestamp(bejoindatetime(bejoindate(result.group(4).zfill(2),result.group(3),result.group(2).zfill(2)),bejointime(result.group(5),result.group(6),result.group(7)))),
        ip_addr = result.group(1),
        day_of_month = result.group(2),
        month = result.group(3),
        year = result.group(4),
        hour = result.group(5),
        minute = result.group(6),
        second = result.group(7),
        timezone = result.group(8)
        )

In [9]:
parsed_rdd = raw_rdd.map(parse_apache_log_line)

parsed_rdd.take(1)


                                                                                

[Row(datetime_stamp='1995-Jul-01 00:00:01', ip_addr='199.72.81.55', day_of_month='01', month='Jul', year='1995', hour='00', minute='00', second='01', timezone='-0400')]

In [10]:
raw_df = parsed_rdd.toDF()

In [11]:
raw_df.printSchema()

root
 |-- datetime_stamp: string (nullable = true)
 |-- ip_addr: string (nullable = true)
 |-- day_of_month: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- minute: string (nullable = true)
 |-- second: string (nullable = true)
 |-- timezone: string (nullable = true)



# Transform ข้อมูลต่อไป เพื่อให้เขียนลง Hive ได้

In [12]:
from pyspark.sql.types import IntegerType, DecimalType, TimestampType

In [13]:
###24Jan2022
from pyspark.sql import functions as sparkf

final_df = raw_df.withColumn('datetime_stamp',\
            sparkf.to_timestamp(sparkf.col('datetime_stamp')\
                                ,'yyyy-MMM-dd HH:mm:ss'))

final_df.printSchema()

root
 |-- datetime_stamp: timestamp (nullable = true)
 |-- ip_addr: string (nullable = true)
 |-- day_of_month: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- minute: string (nullable = true)
 |-- second: string (nullable = true)
 |-- timezone: string (nullable = true)



In [14]:
final_df.write.mode('overwrite').saveAsTable('nasa_webaccesslog')

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used
22/01/24 15:07:24 WARN org.apache.hadoop.hive.ql.session.SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


In [15]:
! hive -e "show tables"

Hive Session ID = cfbac1e3-c7f2-4559-8113-35e2117a5d7c

Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j2.properties Async: true
Hive Session ID = 7e2769bc-1fb3-4634-9405-554e9c50994a
OK
nasa_webaccesslog
weblogtest
Time taken: 1.383 seconds, Fetched: 2 row(s)


In [16]:
! hive -e "describe nasa_webaccesslog"

Hive Session ID = 6fc8ab12-c2ba-4134-a4f1-f486f03a878f

Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j2.properties Async: true
Hive Session ID = 9a5022a3-2b1c-45a7-9bd8-1fd8fbccef24
OK
datetime_stamp      	timestamp           	                    
ip_addr             	string              	                    
day_of_month        	string              	                    
month               	string              	                    
year                	string              	                    
hour                	string              	                    
minute              	string              	                    
second              	string              	                    
timezone            	string              	                    
Time taken: 1.384 seconds, Fetched: 9 row(s)


In [17]:
! hive -e "select * from nasa_webaccesslog where datetime_stamp between '1995-07-01' and '1995-08-01' limit 10"

Hive Session ID = 093212e0-d7e1-4635-9c98-722962cfd8ac

Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j2.properties Async: true
Hive Session ID = 1a404950-85b9-4701-afed-d76990735a8b
Query ID = root_20220124150748_ff2f5333-8125-48df-826e-618a74c22ae1
Total jobs = 1
Launching Job 1 out of 1
Status: Running (Executing on YARN cluster with App id application_1643023309979_0021)

[2K----------------------------------------------------------------------------------------------
[2K[36;1m        VERTICES      MODE        STATUS  TOTAL  COMPLETED  RUNNING  PENDING  FAILED  KILLED  
[22;0m[2K----------------------------------------------------------------------------------------------
[2KMap 1            container        INITED      1          0        0        1       0       0  
[2K----------------------------------------------------------------------------------------------
[2K[31;1mVERTICES: 00/01  [>>--------------------------] 0%    ELAPSED TIME: 0.