In [49]:
%%writefile query1.hql

-- Step 1: expose the raw posts files as an external Hive table.
-- Each input line is parsed by RegexSerDe. The capture groups of the regex
-- feed the columns in declaration order:
--   row_id        <- digits of the Id attribute
--   post_type_id  <- digits of the PostTypeId attribute
--   year, month   <- first two numeric parts of the CreationDate attribute
-- Lookaheads are used so the attributes may appear in any order on the line.
USE stackoverflow_;

-- Recreate the table from scratch so the script can be rerun.
-- Dropping an external table presumably leaves the files under LOCATION in place
-- (standard Hive behaviour, TODO confirm for this deployment).
DROP TABLE IF EXISTS posts_sample_external;

CREATE EXTERNAL TABLE posts_sample_external (
    row_id       STRING,
    post_type_id STRING,
    year         STRING,
    month        STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    "input.regex" = ".*?(?=.*\\bId=\"(\\d+)\")(?=.*\\bPostTypeId=\"(\\d+)\")(?=.*\\bCreationDate=\"(\\d+)-(\\d+)).*"
)
LOCATION '/data/stackexchange1000/posts';

Overwriting query1.hql


In [50]:
%%writefile query2.hql

-- Step 2: create the managed target table for the per-month post counts.
-- The only data column is the count. The year and month values live in the
-- partition keys, so each (year, month) pair gets its own partition.
USE stackoverflow_;

-- Recreate the table from scratch so the script can be rerun.
DROP TABLE IF EXISTS posts_sample;

CREATE TABLE posts_sample (
    count INT
)
PARTITIONED BY (
    year  STRING,
    month STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';

Overwriting query2.hql


In [51]:
%%writefile query3.hql

-- Step 3: fill posts_sample from posts_sample_external with one partition
-- per (year, month) pair, created dynamically from the query result.

-- Turn on dynamic partitioning. Nonstrict mode is needed because neither
-- partition column is given a static value in the PARTITION clause below.
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;

-- Limits on how many partitions and files a single statement may create.
SET hive.exec.max.dynamic.partitions=2000;
SET hive.exec.max.dynamic.partitions.pernode=1000;
SET hive.exec.max.created.files=10000;

-- Fail instead of silently succeeding when a dynamic partition insert yields nothing.
SET hive.error.on.empty.partition=true;

USE stackoverflow_;

-- Count posts per month. The month partition value is written as year-month
-- (for example 2010-06). Rows whose year is NULL are skipped, presumably lines
-- that the RegexSerDe pattern did not match (TODO confirm).
-- The last two SELECT expressions feed the year and month partition columns.
INSERT OVERWRITE TABLE posts_sample
PARTITION (year, month)
SELECT
    COUNT(*) AS count,
    year,
    CONCAT(year, "-", month) AS month
FROM posts_sample_external
WHERE year IS NOT NULL
GROUP BY
    year,
    CONCAT(year, "-", month);

Overwriting query3.hql


In [52]:
%%writefile final_query.hql

-- Step 4: report the third (year, month) partition in ascending order
-- together with its post count.
USE stackoverflow_;

-- Number every row of posts_sample by year, then month, and keep row 3.
-- posts_sample is loaded with one row per (year, month), so the numbering
-- is deterministic as long as that load is the only writer.
WITH final_table AS (
    SELECT
        year,
        month,
        count,
        ROW_NUMBER() OVER (ORDER BY year ASC, month ASC) AS row_num
    FROM posts_sample
)
SELECT
    final_table.year,
    final_table.month,
    final_table.count
FROM final_table
WHERE final_table.row_num = 3;

Overwriting final_query.hql


In [53]:
! hive -f query1.hql


Logging initialized using configuration in jar:file:/usr/local/apache-hive-1.1.0-bin/lib/hive-common-1.1.0.jar!/hive-log4j.properties
OK
Time taken: 1.124 seconds
OK
Time taken: 1.657 seconds
OK
Time taken: 0.597 seconds


In [54]:
! hive -f query2.hql


Logging initialized using configuration in jar:file:/usr/local/apache-hive-1.1.0-bin/lib/hive-common-1.1.0.jar!/hive-log4j.properties
OK
Time taken: 1.117 seconds
OK
Time taken: 5.036 seconds
OK
Time taken: 0.745 seconds


In [55]:
! hive -f query3.hql


Logging initialized using configuration in jar:file:/usr/local/apache-hive-1.1.0-bin/lib/hive-common-1.1.0.jar!/hive-log4j.properties
OK
Time taken: 1.229 seconds
Query ID = jovyan_20180515121616_c41f6824-3cb8-43a6-870d-97def8ee30e6
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1526339522385_0011, Tracking URL = http://e42372799f14:8088/proxy/application_1526339522385_0011/
Kill Command = /opt/hadoop/bin/hadoop job  -kill job_1526339522385_0011
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2018-05-15 12:16:58,453 Stage-1 map = 0%,  reduce = 0%
2018-05-15 12:17:17,003 Stage-1 map = 36%,  

Partition stackoverflow_.posts_sample{year=2010, month=2010-06} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2010, month=2010-07} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2010, month=2010-08} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2010, month=2010-09} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2010, month=2010-10} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2010, month=2010-11} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2010, month=2010-12} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2011, month=2011-01} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stacko

MapReduce Jobs Launched: 
Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 36.48 sec   HDFS Read: 60007726 HDFS Write: 7502 SUCCESS
Total MapReduce CPU Time Spent: 36 seconds 480 msec
OK
Time taken: 82.23 seconds


In [56]:
! hive -f final_query.hql


Logging initialized using configuration in jar:file:/usr/local/apache-hive-1.1.0-bin/lib/hive-common-1.1.0.jar!/hive-log4j.properties
OK
Time taken: 1.13 seconds
Query ID = jovyan_20180515121818_fb71fbe6-abb2-4eed-b609-d48d0d38ebd8
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1526339522385_0012, Tracking URL = http://e42372799f14:8088/proxy/application_1526339522385_0012/
Kill Command = /opt/hadoop/bin/hadoop job  -kill job_1526339522385_0012
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2018-05-15 12:18:34,054 Stage-1 map = 0%,  reduce = 0%
2018-05-15 12:18:41,826 Stage-1 map = 100%,  