# HITS (Hyperlink-Induced Topic Search) 
Outputs the authority score and hub score of every vertex, where authority estimates the value of the content of the page and hub estimates the value of its links to other pages.  Added in MADlib 1.13.

In [1]:
%load_ext sql

In [2]:
# Greenplum Database 5.x on GCP - via tunnel
%sql postgresql://gpadmin@localhost:8000/madlib
        
# PostgreSQL local
#%sql postgresql://fmcquillan@localhost:5432/madlib

In [3]:
%sql select madlib.version();
#%sql select version();

1 rows affected.


version
"MADlib version: 1.18.0, git revision: rel/v1.17.0-102-g42d51bc, cmake configuration time: Sun Apr 18 20:26:28 UTC 2021, build type: release, build system: Linux-3.10.0-1160.24.1.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5"


# 1.  Create vertex and edge tables

In [4]:
%%sql
DROP TABLE IF EXISTS vertex, edge;

CREATE TABLE vertex(
        id INTEGER
        );

CREATE TABLE edge(
        src INTEGER,
        dest INTEGER,
        user_id INTEGER
        );

INSERT INTO vertex VALUES
(0),
(1),
(2),
(3),
(4),
(5),
(6);

INSERT INTO edge VALUES
(0, 1, 1),
(0, 2, 1),
(0, 4, 1),
(1, 2, 1),
(1, 3, 1),
(2, 3, 1),
(2, 5, 1),
(2, 6, 1),
(3, 0, 1),
(4, 0, 1),
(5, 6, 1),
(6, 3, 1);

SELECT * from edge ORDER BY src, dest;

Done.
Done.
Done.
7 rows affected.
12 rows affected.
12 rows affected.


src,dest,user_id
0,1,1
0,2,1
0,4,1
1,2,1
1,3,1
2,3,1
2,5,1
2,6,1
3,0,1
4,0,1


# 2.  Compute the HITS scores

In [5]:
%%sql
DROP TABLE IF EXISTS hits_out, hits_out_summary;

SELECT madlib.hits(
             'vertex',             -- Vertex table
             'id',                 -- Vertex id column
             'edge',               -- Edge table
             'src=src, dest=dest', -- Comma delimited string of edge arguments
             'hits_out');          -- Output table of HITS

SELECT * FROM hits_out ORDER BY id;

Done.
1 rows affected.
7 rows affected.


id,authority,hub
0,8.43871829093e-07,0.338306115083
1,0.158459587238,0.527865350448
2,0.40562796969,0.675800764728
3,0.721775835522,3.95111934817e-07
4,0.158459587238,3.95111934817e-07
5,0.316385413093,0.189719957843
6,0.405199928761,0.337944978189


Look at the summary table:

In [6]:
%%sql
SELECT * FROM hits_out_summary;

1 rows affected.


__iterations__
17


# 3. Different number of iterations
Results in different hub and authority scores.

In [7]:
%%sql
DROP TABLE IF EXISTS hits_out, hits_out_summary;

SELECT madlib.hits(
             'vertex',             -- Vertex table
             'id',                 -- Vertex id column
             'edge',               -- Edge table
             'src=src, dest=dest', -- Comma delimited string of edge arguments
             'hits_out',           -- Output table of HITS
             3);                   -- Max iteration

SELECT * FROM hits_out ORDER BY id;

Done.
1 rows affected.
7 rows affected.


id,authority,hub
0,0.0865332738778,0.375721659592
1,0.18388320699,0.533118571043
2,0.432666369389,0.654974244425
3,0.703082850257,0.0406185577938
4,0.18388320699,0.0406185577938
5,0.302866458572,0.182783510072
6,0.38939973245,0.330025782074


Look at the summary table: 

In [8]:
%%sql
SELECT * FROM hits_out_summary;

1 rows affected.


__iterations__
3


# 4. Different theshold
Running HITS with a low threshold of 0.00001 results in more iterations for convergence

In [9]:
%%sql
DROP TABLE IF EXISTS hits_out, hits_out_summary;

SELECT madlib.hits(
             'vertex',             -- Vertex table
             'id',                 -- Vertex id column
             'edge',               -- Edge table
             'src=src, dest=dest', -- Comma delimited string of edge arguments
             'hits_out',           -- Output table of HITS
             NULL,                 -- Default max_iter
             0.5);             -- Threshold

SELECT * FROM hits_out ORDER BY id;

Done.
1 rows affected.
7 rows affected.


id,authority,hub
0,0.194028500029,0.39062401003
1,0.194028500029,0.528491307688
2,0.436564125065,0.643380722403
3,0.679099750102,0.0919115317719
4,0.194028500029,0.0919115317719
5,0.291042750044,0.183823063544
6,0.388057000058,0.321690361202


Look at the summary table:

In [10]:
%%sql
SELECT * FROM hits_out_summary;

1 rows affected.


__iterations__
2


# 5.  Different number of iterations and threshold

In [11]:
%%sql
DROP TABLE IF EXISTS hits_out, hits_out_summary;

SELECT madlib.hits(
             'vertex',             -- Vertex table
             'id',                 -- Vertex id column
             'edge',               -- Edge table
             'src=src, dest=dest', -- Comma delimited string of edge arguments
             'hits_out',           -- Output table
             20,                   -- Default max_iter
             0.00001);             -- Threshold

SELECT * FROM hits_out ORDER BY id;

Done.


InternalError: (psycopg2.errors.InternalError_) plpy.Error: Graph HITS: Vertex table (vertex) is empty! (plpython.c:5038)
CONTEXT:  Traceback (most recent call last):
  PL/Python function "hits", line 21, in <module>
    return hits.hits(**globals())
  PL/Python function "hits", line 98, in hits
  PL/Python function "hits", line 58, in validate_hits_args
  PL/Python function "hits", line 93, in validate_graph_coding
  PL/Python function "hits", line 105, in _assert
PL/Python function "hits"

[SQL: SELECT madlib.hits(
             'vertex',             -- Vertex table
             'id',                 -- Vertex id column
             'edge',               -- Edge table
             'src=src, dest=dest', -- Comma delimited string of edge arguments
             'hits_out',           -- Output table
             20,                   -- Default max_iter
             0.00001);             -- Threshold]
(Background on this error at: http://sqlalche.me/e/2j85)

Look at the summary table.  The algorithm stopped at 20 iterations even though the convergence for threshold of 0.00001 is at 25 iterations. This is because max_iter was set to 20.

In [None]:
%%sql
SELECT * FROM hits_out_summary;

# 6. Grouping
Running HITS with grouping column and default values for max_iter and threshold. Add more rows to the edge table to create different graphs based on the user_id column.

In [None]:
%%sql
INSERT INTO edge VALUES
(0, 1, 2),
(0, 2, 2),
(0, 4, 2),
(1, 2, 2),
(1, 3, 2),
(2, 3, 2),
(3, 0, 2),
(4, 0, 2),
(5, 6, 2),
(6, 3, 2);

DROP TABLE IF EXISTS hits_out, hits_out_summary;

SELECT madlib.hits(
             'vertex',             -- Vertex table
             'id',                 -- Vertex id column
             'edge',               -- Edge table
             'src=src, dest=dest', -- Comma delimited string of edge arguments
             'hits_out',           -- Output table
             NULL,                 -- Default max_iter
             NULL,                 -- Threshold
             'user_id');           -- Grouping column

SELECT * FROM hits_out ORDER BY user_id, id;

# 7. Other
Let's check against the output from p. 8 of http://www.cis.hut.fi/Opinnot/T-61.6020/2008/pagerank_hits.pdf

In [None]:
%%sql
DROP TABLE IF EXISTS vertex, edge;

CREATE TABLE vertex(
        id INTEGER
        );

CREATE TABLE edge(
        src INTEGER,
        dest INTEGER,
        user_id INTEGER
        );

INSERT INTO vertex VALUES
(0),
(1),
(2),
(3);

INSERT INTO edge VALUES
(0, 1, 1),
(0, 2, 1),
(0, 3, 1),
(1, 2, 1),
(1, 3, 1),
(2, 1, 1);

SELECT * from edge ORDER BY src, dest;

In [None]:
%%sql
SELECT * FROM hits_out_summary order by user_id;

In [None]:
%%sql
DROP TABLE IF EXISTS hits_out, hits_out_summary;

SELECT madlib.hits(
             'vertex',             -- Vertex table
             'id',                 -- Vertex id column
             'edge',               -- Edge table
             'src=src, dest=dest', -- Comma delimited string of edge arguments
             'hits_out',           -- Output table of HITS
             100);                   -- Max iteration

SELECT * FROM hits_out ORDER BY id;

Yes ^^^ matches the results from the reference.