In [2]:
import pandas as pd
import numpy as np
from scipy import sparse

from fast_pagerank import pagerank_power

In [3]:
cleaned_df = pd.read_csv("outputs/cleaned_matched_df.csv")
cleaned_df

Unnamed: 0,ReversedDomain,Content,Topic,ID
0,ac.accent,accent services uk based full service commerci...,Business,362
1,ac.accent,accent services a full service commercial and ...,Regional,362
2,ac.acs,anderson county schools public schools in the ...,Regional,383
3,ac.aikido,langnau aikidogruppe in zusammenarbeit mit der...,World,424
4,ac.apec,apec birmingham practice provides information ...,Business,536
...,...,...,...,...
1888976,zw.gov.parlzim,zimbabwe parliament provides current bills ord...,Regional,91033765
1888977,zw.org.csz,computer society of zimbabwe aims to encourage...,Regional,91033860
1888978,zw.org.nascoh,national association of societies for the care...,Society,91033952
1888979,zw.org.zlhr,zimbabwe lawyers for human rights zlhr non-pro...,Regional,91034088


In [4]:
edges_df = pd.read_csv("outputs/filtered_edges_df.csv")
edges_df

Unnamed: 0,from_node,to_node
0,362,13833969
1,362,38847411
2,362,88039175
3,362,88492518
4,383,14597310
...,...,...
13868917,91033670,78099047
13868918,91033670,91031377
13868919,91033765,49584102
13868920,91033952,43485988


In [5]:
valid_nodes = cleaned_df["ID"].drop_duplicates().reset_index(drop=True)
valid_nodes

0               362
1               383
2               424
3               536
4               548
             ...   
1808224    91033670
1808225    91033765
1808226    91033860
1808227    91033952
1808228    91034088
Name: ID, Length: 1808229, dtype: int64

## Create graph

In [6]:
# Create indexing for graph (convert ID to IND)
id_node_series = pd.Series(valid_nodes.index, index=valid_nodes.values)
id_node_series

362               0
383               1
424               2
536               3
548               4
             ...   
91033670    1808224
91033765    1808225
91033860    1808226
91033952    1808227
91034088    1808228
Length: 1808229, dtype: int64

In [7]:
dim = len(valid_nodes)
G = sparse.csr_matrix((np.ones(len(edges_df)), 
                       (id_node_series.reindex(edges_df['from_node']), 
                        id_node_series.reindex(edges_df['to_node']))), shape=(dim, dim))

## Create topic biased pagerank vectors (+ personalization vectors)

In [8]:
pr_vectors = valid_nodes.to_frame()
personal_vectors = valid_nodes.to_frame()
personal_vectors

Unnamed: 0,ID
0,362
1,383
2,424
3,536
4,548
...,...
1808224,91033670
1808225,91033765
1808226,91033860
1808227,91033952


In [9]:
alpha = 0.85

In [10]:
for topic, group in cleaned_df.groupby("Topic"):
    personalization_vector = np.zeros(dim, dtype=float)
    personalization_vector[id_node_series.reindex(group["ID"])] = 1 / len(group)
    
    personal_vectors[topic] = personalization_vector

    pr_vectors[topic] = pagerank_power(G, p=alpha, personalize=personalization_vector, tol=1e-9)

In [11]:
personal_vectors

Unnamed: 0,ID,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
0,362,0.0,0.000007,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
1,383,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
2,424,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000001
3,536,0.0,0.000007,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
4,548,0.0,0.000000,0.0,0.0,0.000045,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1808224,91033670,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
1808225,91033765,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
1808226,91033860,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
1808227,91033952,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000014,0.0,0.000000


In [12]:
pr_vectors

Unnamed: 0,ID,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
0,362,0.000000e+00,2.867844e-06,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,5.790263e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,383,3.552078e-07,2.488685e-07,1.595801e-07,1.558921e-07,4.332948e-07,3.751190e-07,4.472234e-07,2.191070e-07,8.088928e-07,2.022850e-06,3.894907e-07,2.471411e-07,3.662677e-07,2.151713e-07,7.745668e-08
2,424,2.559853e-11,1.772450e-11,1.752886e-11,2.302014e-11,1.102922e-11,1.174043e-11,2.047072e-11,3.880868e-11,1.250364e-11,1.929843e-11,1.920116e-11,3.092021e-11,1.731083e-11,2.109874e-11,6.310186e-07
3,536,7.311693e-11,2.867865e-06,2.769328e-11,3.008064e-11,1.097441e-10,1.080446e-10,2.328975e-11,2.769076e-10,2.336614e-11,3.920863e-09,3.699444e-11,7.537096e-11,4.380096e-11,2.568643e-11,1.368259e-11
4,548,1.151250e-08,1.518159e-08,9.996378e-09,6.776062e-09,1.500085e-05,1.267889e-08,2.493297e-08,8.153355e-09,1.656175e-08,6.066478e-08,1.208807e-08,9.676475e-09,4.752566e-08,1.078235e-08,3.964186e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1808224,91033670,5.578693e-07,2.849916e-07,6.358736e-07,3.031884e-07,2.670043e-07,3.447225e-07,8.843363e-07,2.582276e-07,5.597177e-07,1.426333e-06,7.059648e-07,2.773599e-07,5.215272e-07,1.501077e-06,3.392927e-07
1808225,91033765,5.142124e-07,4.583613e-07,5.854430e-07,4.543513e-07,5.477401e-07,5.744776e-07,1.074367e-06,4.573495e-07,1.536479e-06,1.217541e-06,1.122675e-06,4.459816e-07,1.285956e-06,4.575942e-07,5.541250e-07
1808226,91033860,1.228591e-07,1.068073e-07,2.786091e-07,1.468843e-07,1.096799e-07,1.318228e-07,1.433357e-07,1.154340e-07,1.156378e-07,9.380712e-07,1.068380e-07,1.187885e-07,1.691245e-07,1.294425e-07,1.089917e-07
1808227,91033952,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,4.348546e-06,0.000000e+00,0.000000e+00


## Create unbiased pagerank vector

In [13]:
pr_vectors["Unbiased"] = pagerank_power(G, p=alpha, tol=1e-9)

In [14]:
pr_vectors

Unnamed: 0,ID,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World,Unbiased
0,362,0.000000e+00,2.867844e-06,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,5.790263e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.011731e-07
1,383,3.552078e-07,2.488685e-07,1.595801e-07,1.558921e-07,4.332948e-07,3.751190e-07,4.472234e-07,2.191070e-07,8.088928e-07,2.022850e-06,3.894907e-07,2.471411e-07,3.662677e-07,2.151713e-07,7.745668e-08,8.037431e-07
2,424,2.559853e-11,1.772450e-11,1.752886e-11,2.302014e-11,1.102922e-11,1.174043e-11,2.047072e-11,3.880868e-11,1.250364e-11,1.929843e-11,1.920116e-11,3.092021e-11,1.731083e-11,2.109874e-11,6.310186e-07,2.582948e-07
3,536,7.311693e-11,2.867865e-06,2.769328e-11,3.008064e-11,1.097441e-10,1.080446e-10,2.328975e-11,2.769076e-10,2.336614e-11,3.920863e-09,3.699444e-11,7.537096e-11,4.380096e-11,2.568643e-11,1.368259e-11,2.025562e-07
4,548,1.151250e-08,1.518159e-08,9.996378e-09,6.776062e-09,1.500085e-05,1.267889e-08,2.493297e-08,8.153355e-09,1.656175e-08,6.066478e-08,1.208807e-08,9.676475e-09,4.752566e-08,1.078235e-08,3.964186e-09,2.380104e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1808224,91033670,5.578693e-07,2.849916e-07,6.358736e-07,3.031884e-07,2.670043e-07,3.447225e-07,8.843363e-07,2.582276e-07,5.597177e-07,1.426333e-06,7.059648e-07,2.773599e-07,5.215272e-07,1.501077e-06,3.392927e-07,7.658747e-07
1808225,91033765,5.142124e-07,4.583613e-07,5.854430e-07,4.543513e-07,5.477401e-07,5.744776e-07,1.074367e-06,4.573495e-07,1.536479e-06,1.217541e-06,1.122675e-06,4.459816e-07,1.285956e-06,4.575942e-07,5.541250e-07,8.163580e-07
1808226,91033860,1.228591e-07,1.068073e-07,2.786091e-07,1.468843e-07,1.096799e-07,1.318228e-07,1.433357e-07,1.154340e-07,1.156378e-07,9.380712e-07,1.068380e-07,1.187885e-07,1.691245e-07,1.294425e-07,1.089917e-07,4.044275e-07
1808227,91033952,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,4.348546e-06,0.000000e+00,0.000000e+00,2.011731e-07


In [15]:
pr_vectors.to_csv("outputs/pr_vectors_df.csv", index=False)

In [16]:
personal_vectors.to_csv("outputs/personalization_vectors_df.csv", index=False)

In [28]:
edges_df.iloc[np.where(edges_df["to_node"] == 77324560)]

Unnamed: 0,from_node,to_node
116,650,77324560
305,5158,77324560
312,9336,77324560
351,10001,77324560
369,10017,77324560
...,...,...
13868831,91031923,77324560
13868850,91032988,77324560
13868894,91033467,77324560
13868914,91033668,77324560


In [33]:
91032988 in valid_nodes

False

In [36]:
s = []
for n in edges_df["from_node"]:
    if n not in valid_nodes:
        s.append(n)
        
set(s)

{8388608,
 25165824,
 85983235,
 29360135,
 23068680,
 71303178,
 2097166,
 29360142,
 12582928,
 2097169,
 65011730,
 25165844,
 83886101,
 6291478,
 88080404,
 77594648,
 37748762,
 27263004,
 31457309,
 35651614,
 54525980,
 6291490,
 31457318,
 77594664,
 39845930,
 31457330,
 6291508,
 20971575,
 29360183,
 33554487,
 14680122,
 77594680,
 85983291,
 14680125,
 29360189,
 10485823,
 77594689,
 29360199,
 77594695,
 10485833,
 2097230,
 4194382,
 27263056,
 54526034,
 77594709,
 77594710,
 35651677,
 77594717,
 10485855,
 23068768,
 77594721,
 52428898,
 35651683,
 54526051,
 29360229,
 71303269,
 77594730,
 88080490,
 69206124,
 88080492,
 77594734,
 71303279,
 35651698,
 77594740,
 71303285,
 71303288,
 29360249,
 77594749,
 33554558,
 77594751,
 54526090,
 35651726,
 88080526,
 71303313,
 50331796,
 37748885,
 77594778,
 29360288,
 29360290,
 29360291,
 46137506,
 23068837,
 6291628,
 2097326,
 35651759,
 50331825,
 14680247,
 35651768,
 41943228,
 6291646,
 77594817,
 52428995,

In [26]:
np.where(~edges_df['to_node'].isin(valid_nodes))

(array([], dtype=int64),)

In [23]:
edges_df

Unnamed: 0,from_node,to_node
0,362,13833969
1,362,38847411
2,362,88039175
3,362,88492518
4,383,14597310
...,...,...
13868917,91033670,78099047
13868918,91033670,91031377
13868919,91033765,49584102
13868920,91033952,43485988


In [5]:
# import rustworkx

# graph = rx.PyDiGraph()
# graph.add_nodes_from(valid_nodes)

# Need edges list to be in terms of the indicies 
# edges_list = list(zip(id_node_series.reindex(edges_df['from_node']), id_node_series.reindex(edges_df['to_node'])))

# graph.add_edges_from_no_data(edges_list)
# len(graph.nodes())

In [32]:
weights = np.array([0.000000, 0.059181, 0.036517, 0.000000, 0.051956, 0.000000, 0.000000, 0.000000, 0.129435, 
                    0.017112, 0.044625, 0.075449, 0.583880, 0.000000, 0.001845])
weights

array([0.      , 0.059181, 0.036517, 0.      , 0.051956, 0.      ,
       0.      , 0.      , 0.129435, 0.017112, 0.044625, 0.075449,
       0.58388 , 0.      , 0.001845])

In [25]:
pr_vectors

Unnamed: 0,ID,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World,Unbiased
0,362,0.000000e+00,1.498642e-06,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.361890e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.148684e-07
1,383,6.393060e-06,8.917722e-06,7.320605e-06,6.801204e-06,8.280223e-06,7.133076e-06,6.111280e-06,8.253913e-06,6.925709e-06,1.019222e-05,6.780863e-06,8.121686e-06,8.750813e-06,8.716358e-06,7.783090e-06,8.619491e-06
2,390,1.522791e-06,9.504839e-07,1.386516e-06,2.670425e-05,9.543138e-07,1.112552e-06,1.063457e-06,1.087097e-06,1.118466e-06,9.674741e-07,1.121593e-06,9.912928e-07,1.109983e-06,9.798989e-07,9.918484e-07,1.151542e-06
3,424,7.858411e-09,2.881648e-09,4.524512e-09,3.438162e-09,2.493980e-09,2.619673e-09,2.556108e-09,2.942114e-09,3.070549e-09,3.185356e-09,3.382968e-09,2.767585e-09,3.680720e-09,2.767798e-09,2.956895e-07,1.304056e-07
4,442,1.443802e-06,1.489337e-06,6.973969e-06,1.549083e-06,1.361431e-06,1.391323e-06,1.199559e-06,1.415722e-06,1.389417e-06,1.483011e-06,1.370871e-06,1.407167e-06,1.465477e-06,1.396929e-06,1.721420e-06,1.719387e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2089095,91033860,1.671104e-07,2.085876e-07,2.424794e-07,1.832631e-07,1.738028e-07,1.624322e-07,1.438726e-07,1.834418e-07,1.850094e-07,5.641621e-07,1.972626e-07,1.731439e-07,1.850381e-07,1.742519e-07,2.078488e-07,3.266362e-07
2089096,91033932,2.967283e-07,1.181393e-07,1.885021e-07,2.755727e-07,2.075908e-07,5.334217e-07,1.543356e-07,2.163607e-07,1.573319e-07,5.270430e-07,2.704944e-07,1.793535e-07,3.065950e-07,1.449189e-07,1.133435e-07,2.802482e-07
2089097,91033952,4.177959e-08,4.817609e-08,4.460005e-08,3.859092e-08,5.765525e-08,5.778306e-08,4.950349e-08,4.947143e-08,6.118235e-08,7.779341e-08,6.265003e-08,4.068223e-08,2.217057e-06,4.861505e-08,3.217385e-08,1.656155e-07
2089098,91034085,1.317181e-07,1.437084e-07,4.772864e-06,1.388089e-07,1.302309e-07,1.312568e-07,1.257381e-07,1.280443e-07,1.363033e-07,4.752886e-07,1.342529e-07,1.374969e-07,1.574873e-07,1.293131e-07,1.439012e-07,2.590354e-07


In [38]:
lc_ranks = (weights * pr_vectors.drop(columns=["ID", "Unbiased"])).sum(axis=1)
lc_ranks

0          9.444397e-08
1          8.335285e-06
2          1.092558e-06
3          3.971704e-09
4          1.644943e-06
               ...     
2089095    1.941197e-07
2089096    2.588704e-07
2089097    1.317145e-06
2089098    3.239215e-07
2089099    1.796194e-06
Length: 2089100, dtype: float64

In [44]:
cleaned_df["Ranks"] = lc_ranks
cleaned_df

0                                                        362
1                                                        383
2                                                        390
3                                                        424
4                                                        442
                                 ...                        
2089096                                             91033932
2089097                                             91033952
2089098                                             91034085
2089099                                             91034088
Ranks      0          9.444397e-08
1          8.335285e-0...
Name: ID, Length: 2089101, dtype: object

In [39]:
lc_personal = (weights * personal_vectors.drop(columns=["ID"])).sum(axis=1)
lc_personal

0          3.757246e-07
1          2.501202e-08
2          0.000000e+00
3          1.943470e-09
4          7.113747e-07
               ...     
2089095    2.501202e-08
2089096    2.501202e-08
2089097    5.814032e-06
2089098    7.363868e-07
2089099    5.839044e-06
Length: 2089100, dtype: float64

In [42]:
pagerank_power(G, p=alpha, personalize=lc_personal.to_numpy(), tol=1e-12)

array([8.26899357e-08, 8.32494696e-06, 1.09390504e-06, ...,
       1.33026948e-06, 3.17852633e-07, 1.81072941e-06], shape=(2089100,))

In [58]:
ranked_nodes = lc_ranks.rename(index=valid_nodes).sort_values(ascending=False)
ranked_df = cleaned_df.set_index('ID').loc[ranked_nodes.index]

21860195    0.021713
48714451    0.015185
45612695    0.014011
24230475    0.013406
24229864    0.009535
              ...   
88093163    0.000000
37179248    0.000000
19874554    0.000000
45604211    0.000000
1695963     0.000000
Length: 2089100, dtype: float64

In [60]:
ranked_df = cleaned_df.set_index('ID').loc[ranked_nodes.index]

In [61]:
ranked_df.head(20)

Unnamed: 0,Topic,ReversedDomain,Content
21860195,Arts,com.facebook,"['comic', 'kings', 'a', 'virginia', 'beach', '..."
21860195,Business,com.facebook,"['abc', 'farms', 'pvt', 'ltd', 'dairy', 'farmi..."
21860195,Computers,com.facebook,"['euphnet', 'silicon', 'valley', 'internet', '..."
21860195,Games,com.facebook,"['syracuse', 'board', 'games', 'and', 'shenani..."
21860195,Health,com.facebook,"['riverstone', 'veterinary', 'hospital', 'list..."
21860195,Home,com.facebook,"['parkhill', 'area', 'horticultural', 'society..."
21860195,News,com.facebook,"['impact', 'newspaper', 'published', 'by', 'th..."
21860195,Recreation,com.facebook,"['typewriter', 'collectors', 'alliance', 'face..."
21860195,Reference,com.facebook,"['nus', 'taekwondo', 'club', 'flash', 'animate..."
21860195,Regional,com.facebook,"['collectif', 'arcenciel', 'facebook', 'group'..."
