In [1]:
from IPython.core.magic import Magics, cell_magic, line_magic, magics_class
from pexpect import spawn

TIMEOUT = 300
PROG = "hive"
PROMPT = ["\r\n    > ", "\r\nhive> "]
QUIT = "quit;"


@magics_class
class Magic(Magics):
    def __init__(self, shell):
        super().__init__(shell)
        self.app = spawn(PROG, timeout=60)
        self.app.expect(PROMPT)

    @cell_magic
    def hive(self, line, cell):
        cell_lines = [cell_line.strip() for cell_line in cell.split("\n")]
        cell_lines = [cell_line for cell_line in cell_lines if cell_line != ""]
        for cell_line in cell_lines:
            self.app.sendline(cell_line)
            self.app.expect(PROMPT, timeout=TIMEOUT)
            output = self.app.before.decode()
            output = output.replace("\r\n", "\n")
            output = output.split("\n")
            output = [output_line.strip() for output_line in output]
            for output_line in output:
                if output_line not in cell_lines:
                    print(output_line)
        return None

    @line_magic
    def quit(self, line):
        self.app.sendline(QUIT)


def load_ipython_extension(ip):
    ip.register_magics(Magic(ip))


load_ipython_extension(ip=get_ipython())

In [2]:
!hdfs dfs -copyFromLocal pregunta_02/data.tsv /tmp

copyFromLocal: `/tmp/data.tsv': File exists


In [3]:
!hdfs dfs -ls /tmp

Found 3 items
-rw-r--r--   1 root supergroup        677 2023-05-31 02:45 /tmp/data.tsv
drwxrwx---   - root supergroup          0 2023-05-30 16:19 /tmp/hadoop-yarn
drwxrwxrwx   - root supergroup          0 2023-05-30 16:21 /tmp/hive


In [None]:
!hdfs dfs -rm /tmp/data.tsv

In [4]:
%%hive
DROP TABLE IF EXISTS data;

OK
Time taken: 8.376 seconds


In [5]:
%%hive
CREATE TABLE data (
letter STRING,
date_event STRING,
value INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY  '\t';

OK
Time taken: 0.671 seconds


In [6]:
%%hive
LOAD DATA INPATH '/tmp/data.tsv' OVERWRITE INTO TABLE data;
SELECT * FROM data;

Loading data to table default.data
OK
Time taken: 0.759 seconds
OK
B	1999-08-28	14
E	1999-12-06	12
E	1993-07-21	17
C	1991-02-12	13
E	1995-04-25	16
A	1992-08-22	14
B	1999-06-11	12
E	1993-01-27	13
E	1999-09-10	11
E	1990-05-03	16
E	1994-02-14	5
A	1988-04-27	12
A	1990-10-06	10
E	1985-02-12	16
E	1998-09-14	16
B	1994-08-30	17
A	1997-12-15	13
B	1995-08-23	10
B	1998-11-22	13
B	1997-04-09	14
E	1993-12-27	18
E	1999-01-14	15
A	1992-09-19	18
B	1993-03-02	14
B	1999-10-21	13
A	1990-08-31	12
C	1994-01-25	6
E	1990-02-09	18
A	1990-09-26	14
A	1993-05-08	16
B	1995-09-06	14
E	1991-02-18	14
A	1993-01-11	14
A	1990-07-22	18
C	1994-09-09	15
C	1994-07-27	7
D	1990-10-10	15
A	1990-09-05	11
B	1991-10-01	15
A	1994-10-25	13
Time taken: 1.645 seconds, Fetched: 40 row(s)


In [7]:
%%hive
SELECT DISTINCT(value)
FROM data
ORDER BY value LIMIT 5;

Query ID = root_20230531174657_4d312f1b-d5c0-4bfe-919e-8ecb31d069d6
Total jobs = 2
Launching Job 1 out of 2
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1685463534098_0002, Tracking URL = http://a0b990b2c7b2:8088/proxy/application_1685463534098_0002/
Kill Command = /opt/hadoop/bin/hadoop job  -kill job_1685463534098_0002
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2023-05-31 17:47:09,957 Stage-1 map = 0%,  reduce = 0%
2023-05-31 17:47:15,211 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 2.42 sec
2023-05-31 17:47:21,466 Stage-1 map = 100%,  reduce = 100%, Cumulative CPU 4.83 sec
MapReduce Total cumulative CPU time: 4 seconds 830

In [9]:
!hdfs dfs -copyFromLocal pregunta_04/data0.csv /tmp

In [11]:
!hdfs dfs -copyFromLocal pregunta_04/data1.csv /tmp

In [12]:
!hdfs dfs -ls /tmp

Found 5 items
-rw-r--r--   1 root supergroup        677 2023-05-31 18:31 /tmp/data.tsv
-rw-r--r--   1 root supergroup        397 2023-05-31 18:31 /tmp/data0.csv
-rw-r--r--   1 root supergroup        282 2023-05-31 18:32 /tmp/data1.csv
drwxrwx---   - root supergroup          0 2023-05-30 16:19 /tmp/hadoop-yarn
drwxrwxrwx   - root supergroup          0 2023-05-30 16:21 /tmp/hive


In [19]:
%%hive
DROP TABLE IF EXISTS tbl0;
CREATE TABLE tbl0 (
    c1 INT,
    c2 STRING,
    c3 INT,
    c4 DATE,
    c5 ARRAY<CHAR(1)>, 
    c6 MAP<STRING, INT>
)
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY ':'
MAP KEYS TERMINATED BY '#'
LINES TERMINATED BY '\n';
LOAD DATA INPATH '/tmp/data0.csv' INTO TABLE tbl0;

OK
Time taken: 0.079 seconds
OK
Time taken: 0.068 seconds
Loading data to table default.tbl0
OK
Time taken: 0.231 seconds


In [16]:
%%hive
SHOW TABLES;

OK
data
tbl0
Time taken: 0.045 seconds, Fetched: 2 row(s)


In [80]:
%%hive
SELECT SUBSTRING(c4,0,4) as c4, letras, count(1) FROM tbl0
LATERAL VIEW
    explode(c5) tbl0 as letras
GROUP BY c4, letras;

Query ID = root_20230531223206_871e5d43-516b-4ecc-aa48-ccda0f88a75a
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1685463534098_0008, Tracking URL = http://a0b990b2c7b2:8088/proxy/application_1685463534098_0008/
Kill Command = /opt/hadoop/bin/hadoop job  -kill job_1685463534098_0008
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2023-05-31 22:32:12,885 Stage-1 map = 0%,  reduce = 0%
2023-05-31 22:32:25,527 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 2.82 sec
2023-05-31 22:32:30,762 Stage-1 map = 100%,  reduce = 100%, Cumulative CPU 7.13 sec
MapReduce Total cumulative CPU time: 7 seconds 130

In [48]:
%%hive
SELECT DISTINCT(letras) as let
FROM(
SELECT
explode(c5) as letras
FROM
tbl0
) w;
ORDER BY let

Query ID = root_20230531213843_eab47cdf-4e55-4069-91cd-184bbe68bab3
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1685463534098_0006, Tracking URL = http://a0b990b2c7b2:8088/proxy/application_1685463534098_0006/
Kill Command = /opt/hadoop/bin/hadoop job  -kill job_1685463534098_0006
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2023-05-31 21:38:48,784 Stage-1 map = 0%,  reduce = 0%
2023-05-31 21:38:55,126 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 4.16 sec
2023-05-31 21:39:01,382 Stage-1 map = 100%,  reduce = 100%, Cumulative CPU 6.65 sec
MapReduce Total cumulative CPU time: 6 seconds 650

In [42]:
%%hive
SELECT explode(split(c5, ',')) AS word FROM tbl0 LIMIT 5;

FAILED: ClassCastException org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector cannot be cast to org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector
