# PageRank
The PageRank algorithm produces a probability distribution representing the likelihood that a person randomly traversing a graph will arrive at any particular vertex. PageRank was added in MADlib 1.11.

In [2]:
%load_ext sql

  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")


In [3]:
# Greenplum 4.3.10.0
#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib
        
# PostgreSQL local
%sql postgresql://fmcquillan@localhost:5432/madlib

# Greenplum 4.2.3.0
#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib

u'Connected: fmcquillan@madlib'

In [3]:
%sql select madlib.version();
#%sql select version();

1 rows affected.


version
"MADlib version: 1.11-dev, git revision: rc/v1.9alpha-rc1-138-gcc5ce09, cmake configuration time: Tue Apr 11 20:47:30 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0"


# 1.  Create vertex and edge tables

In [4]:
%%sql 
DROP TABLE IF EXISTS vertex, edge;

CREATE TABLE vertex(
        id INTEGER
        );

CREATE TABLE edge(
        src INTEGER,
        dest INTEGER,
        user_id INTEGER
        );

INSERT INTO vertex VALUES
(0),
(1),
(2),
(3),
(4),
(5),
(6);

INSERT INTO edge VALUES
(0, 1, 1),
(0, 2, 1),
(0, 4, 1),
(1, 2, 1),
(1, 3, 1),
(2, 3, 1),
(2, 5, 1),
(2, 6, 1),
(3, 0, 1),
(4, 0, 1),
(5, 6, 1),
(6, 3, 1),
(0, 1, 2),
(0, 2, 2),
(0, 4, 2),
(1, 2, 2),
(1, 3, 2),
(2, 3, 2),
(3, 0, 2),
(4, 0, 2),
(5, 6, 2),
(6, 3, 2);

SELECT * from edge ORDER BY src;

Done.
Done.
Done.
7 rows affected.
22 rows affected.
22 rows affected.


src,dest,user_id
0,1,1
0,2,2
0,2,1
0,4,2
0,4,1
0,1,2
1,3,1
1,3,2
1,2,2
1,2,1


# 2.  Calculate the PageRank

In [17]:
%%sql
DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;

SELECT madlib.pagerank(
                         'vertex',             -- Vertex table
                         'id',                 -- Vertix id column
                         'edge',               -- Edge table
                         'src=src, dest=dest', -- Comma delimted string of edge arguments
                         'pagerank_out');      -- Output table of PageRank

SELECT * FROM pagerank_out ORDER BY pagerank DESC;

Done.
1 rows affected.
7 rows affected.


id,pagerank
0,0.287537493412
3,0.210169889019
2,0.146626834541
4,0.102896143842
1,0.102896143842
6,0.0972863776889
5,0.0525871176569


Look at the summary table:

In [18]:
%%sql
SELECT * FROM pagerank_out_summary;

1 rows affected.


__iterations__
16


Now run PageRank with a damping factor of 0.5 which results in different final values:

In [21]:
%%sql
DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
SELECT madlib.pagerank(
                         'vertex',             -- Vertex table
                         'id',                 -- Vertix id column
                         'edge',               -- Edge table
                         'src=src, dest=dest', -- Comma delimted string of edge arguments
                         'pagerank_out',       -- Output table of PageRank
                         0.5);                 -- Damping factor
SELECT * FROM pagerank_out ORDER BY pagerank DESC;

Done.
1 rows affected.
7 rows affected.


id,pagerank
0,0.225477161441
3,0.199090328587
2,0.136261327206
6,0.132691559968
4,0.10900929141
1,0.10900929141
5,0.0884610399788


# 3. Grouping

Now compute the PageRank distribution separately for each user using the grouping feature:

In [22]:
%%sql
DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;

SELECT madlib.pagerank(
                         'vertex',             -- Vertex table
                         'id',                 -- Vertix id column
                         'edge',               -- Edge table
                         'src=src, dest=dest', -- Comma delimted string of edge arguments
                         'pagerank_out',       -- Output table of PageRank
                         NULL,                 -- Default damping factor (0.85)
                         NULL,                 -- Default max iters (100)
                         0.00000001,           -- Threshold
                         'user_id');           -- Grouping column name

SELECT * FROM pagerank_out ORDER BY user_id, pagerank DESC;

Done.
1 rows affected.
14 rows affected.


user_id,id,pagerank
1,0,0.278254883886
1,3,0.201881146671
1,2,0.142881123461
1,6,0.114536378321
1,4,0.100267456154
1,1,0.100267456154
1,5,0.0619115553529
2,0,0.318546250042
2,3,0.237866867733
2,2,0.159148764894


In [24]:
%%sql
SELECT * FROM pagerank_out_summary ORDER BY user_id;

2 rows affected.


user_id,__iterations__
1,27
2,31
