In [1]:
import pandas as pd
import numpy as np
from scipy import sparse

from fast_pagerank import pagerank_power

In [55]:
# Load the cleaned domain table. The preview shows ReversedDomain, Content, Topic
# and ID columns; the same ID can appear on several rows (e.g. once per topic).
cleaned_df = pd.read_csv(filepath_or_buffer="outputs/cleaned_matched_df.csv")
cleaned_df

Unnamed: 0,ReversedDomain,Content,Topic,ID
0,ac.accent,accent services uk based full service commerci...,Business,362
1,ac.accent,accent services a full service commercial and ...,Regional,362
2,ac.acs,anderson county schools public schools in the ...,Regional,383
3,ac.aikido,langnau aikidogruppe in zusammenarbeit mit der...,World,424
4,ac.apec,apec birmingham practice provides information ...,Business,536
...,...,...,...,...
1887205,zw.gov.parlzim,zimbabwe parliament provides current bills ord...,Regional,91033765
1887206,zw.org.csz,computer society of zimbabwe aims to encourage...,Regional,91033860
1887207,zw.org.nascoh,national association of societies for the care...,Society,91033952
1887208,zw.org.zlhr,zimbabwe lawyers for human rights zlhr non-pro...,Regional,91034088


In [56]:
# Load the edge list: one row per link, given as a (from_node, to_node) pair of IDs.
edges_df = pd.read_csv(filepath_or_buffer="outputs/filtered_edges_df.csv")
edges_df

Unnamed: 0,from_node,to_node
0,362,13833969
1,362,38847411
2,362,88039175
3,362,88492518
4,383,20737647
...,...,...
10710334,91033547,91031677
10710335,91033668,78311432
10710336,91033670,30651887
10710337,91033670,78099047


In [57]:
# Distinct node IDs in first-appearance order, re-indexed 0..n-1. That positional
# index is what later serves as each node's row/column in the graph matrix.
valid_nodes = (
    cleaned_df.loc[:, "ID"]
    .drop_duplicates()
    .reset_index(drop=True)
)
valid_nodes

0               362
1               383
2               424
3               536
4               548
             ...   
1806658    91033670
1806659    91033765
1806660    91033860
1806661    91033952
1806662    91034088
Name: ID, Length: 1806663, dtype: int64

## Create graph

In [58]:
# Lookup table for the graph: node ID (index) -> matrix position (value).
# It is the inverse of `valid_nodes`, which maps position -> ID.
id_node_series = pd.Series(
    data=valid_nodes.index,
    index=valid_nodes.values,
)
id_node_series

362               0
383               1
424               2
536               3
548               4
             ...   
91033670    1806658
91033765    1806659
91033860    1806660
91033952    1806661
91034088    1806662
Length: 1806663, dtype: int64

In [59]:
# Build the (dim x dim) adjacency matrix with G[i, j] = 1 for an edge from node i to node j,
# where i and j are the matrix positions given by `id_node_series`.
dim = len(valid_nodes)

# Translate edge endpoints from node IDs to matrix positions. `reindex` does not fail on
# IDs that are absent from `id_node_series`; it returns NaN (and a float dtype) instead,
# which must not reach the sparse constructor as an index. Check explicitly.
from_positions = id_node_series.reindex(edges_df["from_node"]).to_numpy()
to_positions = id_node_series.reindex(edges_df["to_node"]).to_numpy()

unmapped_edges = pd.isna(from_positions) | pd.isna(to_positions)
if unmapped_edges.any():
    raise ValueError(
        f"{int(unmapped_edges.sum())} edge(s) reference node IDs that are missing from valid_nodes"
    )

# NOTE(review): scipy sums duplicate (row, col) entries when building a CSR matrix, so a
# repeated edge in edges_df would get weight > 1 — confirm edges_df holds no duplicates.
G = sparse.csr_matrix(
    (
        np.ones(len(edges_df)),
        (from_positions.astype(np.int64), to_positions.astype(np.int64)),
    ),
    shape=(dim, dim),
)

## Create topic-biased PageRank vectors (+ personalization vectors)

In [60]:
# Two independent result tables, each starting with just the ID column (one row per node).
# Columns are appended to them later: personalization weights and PageRank scores.
personal_vectors = valid_nodes.to_frame()
pr_vectors = valid_nodes.to_frame()
personal_vectors

Unnamed: 0,ID
0,362
1,383
2,424
3,536
4,548
...,...
1806658,91033670
1806659,91033765
1806660,91033860
1806661,91033952


In [77]:
# Value passed as `p` to every pagerank_power call in this notebook
# (presumably the PageRank damping factor — confirm against the fast_pagerank docs).
alpha = 0.85

In [78]:
# For every topic, build a personalization vector that is uniform over the nodes labelled
# with that topic (zero elsewhere), store it, and run personalized PageRank with it.
for topic, group in cleaned_df.groupby("Topic"):
    # Matrix positions of the *distinct* nodes carrying this topic. Counting rows
    # (len(group)) instead would under-weight the vector whenever an ID occurs more
    # than once within a topic, leaving it summing to less than 1.
    topic_positions = id_node_series.reindex(group["ID"].unique()).to_numpy()

    personalization_vector = np.zeros(dim, dtype=float)
    personalization_vector[topic_positions] = 1 / len(topic_positions)

    personal_vectors[topic] = personalization_vector

    # One score column per topic, aligned row-for-row with the ID column.
    pr_vectors[topic] = pagerank_power(G, p=alpha, personalize=personalization_vector, tol=1e-9)

In [79]:
# Show the personalization table: the ID column plus one weight column per topic.
personal_vectors

Unnamed: 0,ID,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
0,362,0.0,0.000007,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
1,383,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
2,424,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000001
3,536,0.0,0.000007,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
4,548,0.0,0.000000,0.0,0.0,0.000045,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1806658,91033670,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
1806659,91033765,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
1806660,91033860,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
1806661,91033952,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000014,0.0,0.000000


In [80]:
# Show the topic-biased PageRank table (ID plus one score column per topic).
# NOTE(review): the saved output of this cell already lists an "Unbiased" column, although
# that column is only assigned in a later cell, and its values differ from the later
# output. The output reflects leftover kernel state — restart the kernel and run all
# cells before trusting it.
pr_vectors

Unnamed: 0,ID,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World,Unbiased
0,362,0.000000e+00,3.514615e-06,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,7.120904e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,4.843064e-07
1,383,5.597053e-07,3.552723e-07,2.010359e-07,2.133804e-07,7.863006e-07,2.626718e-07,6.337888e-07,2.583277e-07,1.164248e-06,2.959443e-06,4.822334e-07,3.753983e-07,6.146538e-07,2.808560e-07,7.601794e-08,7.135922e-07
2,424,3.112269e-13,1.677991e-13,2.404075e-13,4.515706e-13,1.509383e-13,1.555003e-13,1.106721e-13,3.578369e-13,2.173899e-12,1.893824e-13,3.505343e-13,9.316526e-14,1.611459e-13,6.858822e-14,7.504387e-07,5.246871e-07
3,536,1.272421e-10,3.514664e-06,6.091308e-11,6.060924e-11,2.573289e-10,1.912966e-10,4.462740e-11,6.526737e-10,4.837806e-11,5.608013e-09,5.369414e-11,1.703050e-10,9.858454e-11,5.930094e-11,1.870162e-11,4.852900e-07
4,548,1.795956e-08,2.632592e-08,1.475142e-08,8.555678e-09,1.796263e-05,1.950480e-08,3.549078e-08,1.172918e-08,2.637398e-08,9.671950e-08,2.012616e-08,1.443037e-08,9.495881e-08,1.840844e-08,3.019084e-09,4.972454e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1806658,91033670,6.493215e-07,3.230436e-07,5.175131e-07,1.873141e-07,2.103202e-07,4.300507e-07,2.505627e-06,2.079758e-07,4.048397e-07,1.809694e-06,8.359374e-07,1.769373e-07,1.071875e-06,3.029044e-06,3.745815e-07,6.067811e-07
1806659,91033765,4.875689e-07,4.617514e-07,5.634861e-07,3.415081e-07,5.329493e-07,6.543673e-07,2.022528e-06,4.234362e-07,2.130218e-06,1.485457e-06,1.343769e-06,4.607379e-07,1.803882e-06,4.203868e-07,5.478937e-07,5.441593e-07
1806660,91033860,1.406841e-07,1.051764e-07,5.643548e-07,3.364911e-07,1.296299e-07,9.773312e-08,8.401201e-08,9.460601e-08,1.006267e-07,1.150795e-06,9.514858e-08,1.341724e-07,2.637692e-07,1.960867e-07,1.042871e-07,5.509637e-07
1806661,91033952,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,5.306737e-06,0.000000e+00,0.000000e+00,4.843064e-07


## Create unbiased PageRank vector

In [81]:
# PageRank without a `personalize` argument, stored alongside the per-topic columns.
unbiased_scores = pagerank_power(G, p=alpha, tol=1e-9)
pr_vectors["Unbiased"] = unbiased_scores

In [82]:
# Show the table again now that the "Unbiased" column has been assigned.
pr_vectors

Unnamed: 0,ID,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World,Unbiased
0,362,0.000000e+00,3.514615e-06,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,7.120904e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.430100e-07
1,383,5.597053e-07,3.552723e-07,2.010359e-07,2.133804e-07,7.863006e-07,2.626718e-07,6.337888e-07,2.583277e-07,1.164248e-06,2.959443e-06,4.822334e-07,3.753983e-07,6.146538e-07,2.808560e-07,7.601794e-08,1.140551e-06
2,424,3.112269e-13,1.677991e-13,2.404075e-13,4.515706e-13,1.509383e-13,1.555003e-13,1.106721e-13,3.578369e-13,2.173899e-12,1.893824e-13,3.505343e-13,9.316526e-14,1.611459e-13,6.858822e-14,7.504387e-07,3.119907e-07
3,536,1.272421e-10,3.514664e-06,6.091308e-11,6.060924e-11,2.573289e-10,1.912966e-10,4.462740e-11,6.526737e-10,4.837806e-11,5.608013e-09,5.369414e-11,1.703050e-10,9.858454e-11,5.930094e-11,1.870162e-11,2.449658e-07
4,548,1.795956e-08,2.632592e-08,1.475142e-08,8.555678e-09,1.796263e-05,1.950480e-08,3.549078e-08,1.172918e-08,2.637398e-08,9.671950e-08,2.012616e-08,1.443037e-08,9.495881e-08,1.840844e-08,3.019084e-09,3.001433e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1806658,91033670,6.493215e-07,3.230436e-07,5.175131e-07,1.873141e-07,2.103202e-07,4.300507e-07,2.505627e-06,2.079758e-07,4.048397e-07,1.809694e-06,8.359374e-07,1.769373e-07,1.071875e-06,3.029044e-06,3.745815e-07,9.642342e-07
1806659,91033765,4.875689e-07,4.617514e-07,5.634861e-07,3.415081e-07,5.329493e-07,6.543673e-07,2.022528e-06,4.234362e-07,2.130218e-06,1.485457e-06,1.343769e-06,4.607379e-07,1.803882e-06,4.203868e-07,5.478937e-07,9.307183e-07
1806660,91033860,1.406841e-07,1.051764e-07,5.643548e-07,3.364911e-07,1.296299e-07,9.773312e-08,8.401201e-08,9.460601e-08,1.006267e-07,1.150795e-06,9.514858e-08,1.341724e-07,2.637692e-07,1.960867e-07,1.042871e-07,4.817021e-07
1806661,91033952,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,5.306737e-06,0.000000e+00,0.000000e+00,2.430100e-07


In [83]:
# Persist the PageRank scores (ID + per-topic + Unbiased columns); the row index is not written.
pr_vectors.to_csv(path_or_buf="outputs/pr_vectors_df.csv", index=False)

In [84]:
# Persist the per-topic personalization vectors (ID + one column per topic); the row index is not written.
personal_vectors.to_csv(path_or_buf="outputs/personalization_vectors_df.csv", index=False)