In [15]:
%%writefile create_db.hql

-- Rebuild the stackoverflow database from scratch.
-- CASCADE drops the database together with the tables it contains.
DROP DATABASE IF EXISTS stackoverflow CASCADE;

-- Database files are kept under the explicit location given below.
CREATE DATABASE stackoverflow
LOCATION '/user/hdjudge/store/stackoverflow';

-- Switch the current session to the new database.
USE stackoverflow;

Overwriting create_db.hql


In [16]:
%%writefile external_table.hql

-- Jars registered for this session before the table is defined.
ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar;
ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-serde.jar;

DROP TABLE IF EXISTS posts_sample_external;

-- External table over the raw posts dump, parsed line by line with RegexSerDe.
-- Capture groups map to columns by position, so the column order must match
-- the group order in input.regex: Id, PostTypeId, CreationDate, OwnerUserId,
-- ParentId, Score, FavoriteCount, Tags.
-- The OwnerUserId, ParentId, FavoriteCount and Tags lookaheads are optional.
-- Id, PostTypeId, CreationDate and Score are mandatory, and a line that does
-- not match the regex is read as a row of NULL columns.
-- The Score group accepts an optional leading minus sign. With a digits-only
-- group every post with a negative score failed the whole regex and was lost.
CREATE EXTERNAL TABLE posts_sample_external (
    id INT,
    post_type_id TINYINT,
    date STRING,
    owner_user_id INT,
    parent_id INT,
    score INT,
    favorite_count INT,
    tags STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' WITH SERDEPROPERTIES
('input.regex'='.*(?=\\bId=\"(\\d+)\")(?=.*\\bPostTypeId=\"(\\d+)\")(?=.*\\bCreationDate=\"(.{23})\")(?=.*\\bOwnerUserId=\"(\\d+)\")?(?=.*\\bParentId=\"(\\d+)\")?(?=.*\\bScore=\"(-?\\d+)\")(?=.*\\bFavoriteCount=\"(\\d+)\")?(?=.*\\bTags=\"(&lt\;.*gt\;)\")?.*')
LOCATION '/data/stackexchange1000/posts';


Overwriting external_table.hql


In [26]:
%%writefile sample.hql

-- Dynamic partitioning settings for the INSERT below, where both partition
-- columns are computed per row. Each SET must end with its own terminator.
-- Without it the CLI reads the following DROP TABLE text as part of the
-- property value, so the table is never dropped and the mode is not nonstrict.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

DROP TABLE IF EXISTS posts_sample;

-- Managed copy of the parsed posts, partitioned by creation year and month.
CREATE TABLE posts_sample (
    id INT,
    post_type_id TINYINT,
    owner_user_id INT,
    parent_id INT,
    score INT,
    favorite_count INT,
    tags ARRAY<STRING>
)
PARTITIONED BY (year STRING, month STRING)
STORED AS SEQUENCEFILE;

-- Rows whose id is NULL are skipped.
-- The last two select expressions feed the dynamic partition columns in
-- order: year, then month. substr is 1-based, and this assumes the date
-- string starts with YYYY-MM (TODO confirm against the source data).
FROM posts_sample_external se
INSERT OVERWRITE TABLE posts_sample PARTITION(year, month)
    SELECT
        se.id,
        se.post_type_id,
        se.owner_user_id,
        se.parent_id,
        se.score,
        se.favorite_count,
        split(se.tags,"&lt\;|&gt\;"),
        substr(se.date,1,4),
        substr(se.date,6,2)
    WHERE se.id IS NOT NULL;


Overwriting sample.hql


In [27]:
%%writefile count.hql

-- Number of posts per (year, month) partition, written as tab-separated text
-- files into the local directory named result.
-- ORDER BY makes LIMIT 3 deterministic. Without it the three groups returned
-- are arbitrary. The sort key is the year-month string, which sorts
-- chronologically as long as year and month are fixed-width.
-- NOTE(review): ascending year-month order is assumed to be the intended one.
INSERT OVERWRITE LOCAL DIRECTORY 'result'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
SELECT year, concat(year, "-", month) AS year_month, count(*) AS posts_count
FROM posts_sample
GROUP BY year, month
ORDER BY year_month
LIMIT 3;

Overwriting count.hql


In [None]:
# Run the generated scripts in dependency order: database, external table,
# partitioned copy, then the aggregate that writes into ./result.
# NOTE(review): every `hive -f` call is a separate CLI process, so session
# state such as the `USE stackoverflow` in create_db.hql presumably does not
# carry over to the later scripts -- confirm which database the tables land in.
!hive -f create_db.hql
!hive -f external_table.hql
!hive -f sample.hql
!hive -f count.hql
# Show only the first line of the concatenated result files.
!cat result/*|head -1


Logging initialized using configuration in jar:file:/usr/local/apache-hive-1.1.0-bin/lib/hive-common-1.1.0.jar!/hive-log4j.properties
OK
Time taken: 1.683 seconds
OK
Time taken: 0.348 seconds
OK
Time taken: 0.025 seconds

Logging initialized using configuration in jar:file:/usr/local/apache-hive-1.1.0-bin/lib/hive-common-1.1.0.jar!/hive-log4j.properties
Added [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar] to class path
Added resources: [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar]
Added [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-serde.jar] to class path
Added resources: [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-serde.jar]
OK
Time taken: 2.885 seconds
OK
Time taken: 0.595 seconds

Logging initialized using configuration in jar:file:/usr/local/apache-hive-1.1.0-bin/lib/hive-common-1.1.0.jar!/hive-log4j.properties
