In [22]:
%%writefile query1.hql

USE stackoverflow_;

DROP TABLE if exists posts_sample_external; 

CREATE EXTERNAL TABLE posts_sample_external 
(year string,
month string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' 
WITH SERDEPROPERTIES (
    "input.regex" = '.*?(?=.*\\bCreationDate=\"(\\d+)-(\\d+)).*'
)
STORED AS TEXTFILE
LOCATION '/data/stackexchange1000/posts/';

Overwriting query1.hql


In [23]:
%%writefile query2.hql

set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.max.dynamic.partitions=2000;
set hive.exec.max.dynamic.partitions.pernode=1000;

USE stackoverflow_;

DROP TABLE if exists posts_sample; 

CREATE TABLE posts_sample 
(count int) 
PARTITIONED BY (year string, month string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';

FROM posts_sample_external
INSERT OVERWRITE TABLE posts_sample
PARTITION (year, month)
SELECT count(*) as count, year, concat(year,"-",month) as month
GROUP BY year, month;

Overwriting query2.hql


In [24]:
%%writefile query3.hql

USE stackoverflow_;

SELECT year, month, count FROM (
    SELECT * FROM (
        SELECT
            count,
            year,
            month,
            ROW_NUMBER() OVER(ORDER BY year, month ASC) as rowid
        FROM posts_sample
    ) AS q1
    ORDER BY rowid ASC
) AS q2
WHERE rowid='3';

Overwriting query3.hql


In [25]:
! hive -f query1.hql


Logging initialized using configuration in jar:file:/usr/local/apache-hive-1.1.0-bin/lib/hive-common-1.1.0.jar!/hive-log4j.properties
OK
Time taken: 1.165 seconds
OK
Time taken: 1.603 seconds
OK
Time taken: 0.574 seconds


In [26]:
! hive -f query2.hql


Logging initialized using configuration in jar:file:/usr/local/apache-hive-1.1.0-bin/lib/hive-common-1.1.0.jar!/hive-log4j.properties
OK
Time taken: 1.151 seconds
OK
Time taken: 5.291 seconds
OK
Time taken: 0.74 seconds
Query ID = jovyan_20180517071414_fd55eab9-d9a5-4f5b-adc5-5785c5597aaa
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1526532225389_0022, Tracking URL = http://d26f10570c8c:8088/proxy/application_1526532225389_0022/
Kill Command = /opt/hadoop/bin/hadoop job  -kill job_1526532225389_0022
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2018-05-17 07:14:28,738 Stage-1 map = 0%, 

Partition stackoverflow_.posts_sample{year=2010, month=2010-03} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2010, month=2010-04} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2010, month=2010-05} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2010, month=2010-06} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2010, month=2010-07} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2010, month=2010-08} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2010, month=2010-09} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2010, month=2010-10} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stacko

Partition stackoverflow_.posts_sample{year=2016, month=2016-09} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2016, month=2016-10} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2016, month=2016-11} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=2016, month=2016-12} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
Partition stackoverflow_.posts_sample{year=__HIVE_DEFAULT_PARTITION__, month=__HIVE_DEFAULT_PARTITION__} stats: [numFiles=1, numRows=1, totalSize=4, rawDataSize=3]
MapReduce Jobs Launched: 
Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 27.69 sec   HDFS Read: 60007299 HDFS Write: 7619 SUCCESS
Total MapReduce CPU Time Spent: 27 seconds 690 msec
OK
Time taken: 70.966 seconds


In [27]:
! hive -f query3.hql 2> out.log

2008	2008-10	73
