In [1]:
import pandas as pd
import numpy as np
from scipy import sparse

from fast_pagerank import pagerank_power
# import rustworkx as rx

In [2]:
# Load the page/topic table. The displayed columns are ID, Topic, ReversedDomain
# and Content; the same ID can appear on several rows, once per topic.
cleaned_df = pd.read_csv("cleaned_matched_df.csv")
cleaned_df

Unnamed: 0,ID,Topic,ReversedDomain,Content
0,362,Business,ac.accent,"['accent', 'services', 'uk', 'based', 'full', ..."
1,362,Regional,ac.accent,"['accent', 'services', 'a', 'full', 'service',..."
2,383,Regional,ac.acs,"['anderson', 'county', 'schools', 'public', 's..."
3,390,Games,ac.adamcadre,"['allows', 'the', 'game', 'to', 'be', 'played'..."
4,424,World,ac.aikido,"['langnau', 'aikidogruppe', 'in', 'zusammenarb..."
...,...,...,...,...
2289392,91033952,Society,zw.org.nascoh,"['national', 'association', 'of', 'societies',..."
2289393,91034085,Computers,zw.org.zispa,"['zw', 'domain', 'zimbabwe', 'nic', 'for', 'zw..."
2289394,91034085,Regional,zw.org.zispa,"['zimbabwe', 'internet', 'service', 'providers..."
2289395,91034088,Regional,zw.org.zlhr,"['zimbabwe', 'lawyers', 'for', 'human', 'right..."


In [3]:
# Load the edge list: one row per link, with both endpoints given as node IDs
# (from_node -> to_node).
edges_df = pd.read_csv("filtered_edges_df.csv")
edges_df

Unnamed: 0,from_node,to_node
0,362,13833969
1,362,21183718
2,362,24229864
3,362,30617333
4,362,34548414
...,...,...
68294058,91034088,45612695
68294059,91034088,48714451
68294060,91034088,66662916
68294061,91034088,67027618


In [4]:
# Every distinct page ID in order of first appearance, renumbered 0..n-1.
# The fresh positional index is what later cells use as the node's matrix position.
valid_nodes = (
    cleaned_df["ID"]
    .drop_duplicates()
    .reset_index(drop=True)
)
valid_nodes

0               362
1               383
2               390
3               424
4               442
             ...   
2089095    91033860
2089096    91033932
2089097    91033952
2089098    91034085
2089099    91034088
Name: ID, Length: 2089100, dtype: int64

## Create graph

In [5]:
# Create indexing for graph (convert ID to IND)
id_node_series = pd.Series(valid_nodes.index, index=valid_nodes.values)
id_node_series

362               0
383               1
390               2
424               3
442               4
             ...   
91033860    2089095
91033932    2089096
91033952    2089097
91034085    2089098
91034088    2089099
Length: 2089100, dtype: int64

In [6]:
dim = len(valid_nodes)

# Translate both endpoints of every edge from raw node ID to matrix position.
src_idx = id_node_series.reindex(edges_df['from_node'])
dst_idx = id_node_series.reindex(edges_df['to_node'])

# reindex() yields NaN for any ID that is missing from valid_nodes. Fail here with a
# clear message instead of handing NaN index values to scipy.
n_unmapped = int(src_idx.isna().sum() + dst_idx.isna().sum())
if n_unmapped:
    raise ValueError(
        f"{n_unmapped} edge endpoint(s) reference node IDs that are not in valid_nodes"
    )

# G[i, j] holds the number of edges from node i to node j: each edge contributes 1.0
# and the (data, (row, col)) constructor sums duplicate (i, j) pairs.
G = sparse.csr_matrix(
    (
        np.ones(len(edges_df)),
        (src_idx.to_numpy(dtype=np.int64), dst_idx.to_numpy(dtype=np.int64)),
    ),
    shape=(dim, dim),
)

# The per-edge index Series are only needed to build G; drop them so they do not
# stay resident in the kernel.
del src_idx, dst_idx

## Create topic biased pagerank vectors (+ personalization vectors)

In [7]:
# Two result tables, each starting with only the node ID column:
#   pr_vectors       - receives one PageRank column per topic
#   personal_vectors - receives the personalization vector used for that topic
pr_vectors = valid_nodes.to_frame()
personal_vectors = valid_nodes.to_frame()
personal_vectors

Unnamed: 0,ID
0,362
1,383
2,390
3,424
4,442
...,...
2089095,91033860
2089096,91033932
2089097,91033952
2089098,91034085


In [8]:
# Passed as `p` to every pagerank_power call in this notebook
# (presumably the damping factor; confirm against the fast_pagerank docs).
alpha = 0.85

In [9]:
# Build one personalization vector and one PageRank vector per topic.
for topic, topic_rows in cleaned_df.groupby("Topic"):
    # Matrix positions of the pages listed under this topic.
    member_positions = id_node_series.reindex(topic_rows["ID"]).to_numpy()

    # Same value (1 / number of rows in the topic) on every member, zero elsewhere.
    personalization_vector = np.zeros(dim, dtype=float)
    personalization_vector[member_positions] = 1 / len(topic_rows)

    # Keep the personalization vector so it can be blended per query later.
    personal_vectors[topic] = personalization_vector

    pr_vectors[topic] = pagerank_power(
        G, p=alpha, personalize=personalization_vector, tol=1e-9
    )

In [10]:
pr_vectors

Unnamed: 0,ID,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
0,362,0.000000e+00,1.498642e-06,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.361890e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,383,6.393060e-06,8.917722e-06,7.320605e-06,6.801204e-06,8.280223e-06,7.133076e-06,6.111280e-06,8.253913e-06,6.925709e-06,1.019222e-05,6.780863e-06,8.121686e-06,8.750813e-06,8.716358e-06,7.783090e-06
2,390,1.522791e-06,9.504839e-07,1.386516e-06,2.670425e-05,9.543138e-07,1.112552e-06,1.063457e-06,1.087097e-06,1.118466e-06,9.674741e-07,1.121593e-06,9.912928e-07,1.109983e-06,9.798989e-07,9.918484e-07
3,424,7.858411e-09,2.881648e-09,4.524512e-09,3.438162e-09,2.493980e-09,2.619673e-09,2.556108e-09,2.942114e-09,3.070549e-09,3.185356e-09,3.382968e-09,2.767585e-09,3.680720e-09,2.767798e-09,2.956895e-07
4,442,1.443802e-06,1.489337e-06,6.973969e-06,1.549083e-06,1.361431e-06,1.391323e-06,1.199559e-06,1.415722e-06,1.389417e-06,1.483011e-06,1.370871e-06,1.407167e-06,1.465477e-06,1.396929e-06,1.721420e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2089095,91033860,1.671104e-07,2.085876e-07,2.424794e-07,1.832631e-07,1.738028e-07,1.624322e-07,1.438726e-07,1.834418e-07,1.850094e-07,5.641621e-07,1.972626e-07,1.731439e-07,1.850381e-07,1.742519e-07,2.078488e-07
2089096,91033932,2.967283e-07,1.181393e-07,1.885021e-07,2.755727e-07,2.075908e-07,5.334217e-07,1.543356e-07,2.163607e-07,1.573319e-07,5.270430e-07,2.704944e-07,1.793535e-07,3.065950e-07,1.449189e-07,1.133435e-07
2089097,91033952,4.177959e-08,4.817609e-08,4.460005e-08,3.859092e-08,5.765525e-08,5.778306e-08,4.950349e-08,4.947143e-08,6.118235e-08,7.779341e-08,6.265003e-08,4.068223e-08,2.217057e-06,4.861505e-08,3.217385e-08
2089098,91034085,1.317181e-07,1.437084e-07,4.772864e-06,1.388089e-07,1.302309e-07,1.312568e-07,1.257381e-07,1.280443e-07,1.363033e-07,4.752886e-07,1.342529e-07,1.374969e-07,1.574873e-07,1.293131e-07,1.439012e-07


## Create unbiased pagerank vector

In [11]:
# PageRank with no `personalize` argument, stored next to the topic-biased columns.
pr_vectors["Unbiased"] = pagerank_power(G, p=alpha, tol=1e-9)

In [12]:
pr_vectors

Unnamed: 0,ID,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World,Unbiased
0,362,0.000000e+00,1.498642e-06,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.361890e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.148684e-07
1,383,6.393060e-06,8.917722e-06,7.320605e-06,6.801204e-06,8.280223e-06,7.133076e-06,6.111280e-06,8.253913e-06,6.925709e-06,1.019222e-05,6.780863e-06,8.121686e-06,8.750813e-06,8.716358e-06,7.783090e-06,8.619491e-06
2,390,1.522791e-06,9.504839e-07,1.386516e-06,2.670425e-05,9.543138e-07,1.112552e-06,1.063457e-06,1.087097e-06,1.118466e-06,9.674741e-07,1.121593e-06,9.912928e-07,1.109983e-06,9.798989e-07,9.918484e-07,1.151542e-06
3,424,7.858411e-09,2.881648e-09,4.524512e-09,3.438162e-09,2.493980e-09,2.619673e-09,2.556108e-09,2.942114e-09,3.070549e-09,3.185356e-09,3.382968e-09,2.767585e-09,3.680720e-09,2.767798e-09,2.956895e-07,1.304056e-07
4,442,1.443802e-06,1.489337e-06,6.973969e-06,1.549083e-06,1.361431e-06,1.391323e-06,1.199559e-06,1.415722e-06,1.389417e-06,1.483011e-06,1.370871e-06,1.407167e-06,1.465477e-06,1.396929e-06,1.721420e-06,1.719387e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2089095,91033860,1.671104e-07,2.085876e-07,2.424794e-07,1.832631e-07,1.738028e-07,1.624322e-07,1.438726e-07,1.834418e-07,1.850094e-07,5.641621e-07,1.972626e-07,1.731439e-07,1.850381e-07,1.742519e-07,2.078488e-07,3.266362e-07
2089096,91033932,2.967283e-07,1.181393e-07,1.885021e-07,2.755727e-07,2.075908e-07,5.334217e-07,1.543356e-07,2.163607e-07,1.573319e-07,5.270430e-07,2.704944e-07,1.793535e-07,3.065950e-07,1.449189e-07,1.133435e-07,2.802482e-07
2089097,91033952,4.177959e-08,4.817609e-08,4.460005e-08,3.859092e-08,5.765525e-08,5.778306e-08,4.950349e-08,4.947143e-08,6.118235e-08,7.779341e-08,6.265003e-08,4.068223e-08,2.217057e-06,4.861505e-08,3.217385e-08,1.656155e-07
2089098,91034085,1.317181e-07,1.437084e-07,4.772864e-06,1.388089e-07,1.302309e-07,1.312568e-07,1.257381e-07,1.280443e-07,1.363033e-07,4.752886e-07,1.342529e-07,1.374969e-07,1.574873e-07,1.293131e-07,1.439012e-07,2.590354e-07


In [13]:
# Persist the ID column plus every PageRank column; index=False leaves the
# positional row index out of the file.
pr_vectors.to_csv("pr_vectors_df.csv", index=False)

In [5]:
# graph = rx.PyDiGraph()
# graph.add_nodes_from(valid_nodes)

# The edge list needs to be expressed in terms of the indices
# edges_list = list(zip(id_node_series.reindex(edges_df['from_node']), id_node_series.reindex(edges_df['to_node'])))

# graph.add_edges_from_no_data(edges_list)
# len(graph.nodes())

In [14]:
# Load the term table: one row per term ("Terms") and one integer count column per topic.
term_vectors_df = pd.read_csv("term_vectors.csv")
term_vectors_df

Unnamed: 0,Terms,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
0,accent,19,76,6,0,10,0,0,1,2,137,3,41,5,2,110
1,services,779,30568,8031,97,7371,400,134,1478,1857,156151,3359,878,12404,820,16133
2,uk,1876,5894,1014,164,877,214,123,1274,246,8841,1187,1648,1636,1434,168
3,based,5339,12949,4371,552,1385,245,308,2478,442,14008,1493,1599,4172,2415,149
4,full,1577,3723,832,162,1253,136,44,806,222,11928,672,575,919,475,377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1428960,sterkinekor,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1428961,tanganda,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1428962,utande,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1428963,nascoh,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [15]:
# Per-topic grand total of the counts, summed over every term.
# The "Terms" label column is left out so only the topic count columns are summed.
term_vector_totals = term_vectors_df[term_vectors_df.columns.drop("Terms")].sum()
term_vector_totals

Arts           2752549
Business       3439790
Computers      1254315
Games           380098
Health          827734
Home            288867
News            110580
Recreation     1223466
Reference       624425
Regional      16027153
Science        1167547
Shopping        971404
Society        3034022
Sports         1213856
World         18595112
dtype: int64

In [16]:
# Distinct topic labels in order of first appearance in cleaned_df.
# get_weights only uses how many there are (for its uniform 1 / len(topics) factor).
topics = cleaned_df["Topic"].unique()
topics

array(['Business', 'Regional', 'Games', 'World', 'Computers', 'Health',
       'Science', 'Sports', 'Society', 'Arts', 'Recreation', 'Shopping',
       'Reference', 'Home', 'News'], dtype=object)

In [23]:
def get_weights(query):
    """Return unnormalized per-topic weights for a whitespace-separated query.

    For each topic j the weight is

        P(c_j) * prod_t ( D_j_t / sum_i D_j_i )

    where the product runs over the query terms found in ``term_vectors_df``,
    ``D_j_t`` is the count of term t under topic j, ``sum_i D_j_i`` is that
    topic's entry in ``term_vector_totals``, and ``P(c_j)`` is the uniform
    prior ``1 / len(topics)``.

    Parameters
    ----------
    query : str
        Query text; terms are separated by whitespace and matched exactly
        against the ``Terms`` column.

    Returns
    -------
    pandas.Series
        Float weights indexed by topic column name. They are not normalized.
        A topic with a zero count for any matched term gets weight 0. If no
        query term is found, every topic gets the prior ``1 / len(topics)``.
    """
    # split() with no argument collapses runs of whitespace, so repeated or
    # leading/trailing spaces do not produce empty tokens.
    query_terms = query.split()
    is_query_term = term_vectors_df['Terms'].isin(query_terms)
    query_rows = term_vectors_df[is_query_term].drop(columns=['Terms'])

    # Divide each count by its topic total BEFORE multiplying. The counts and totals
    # are int64, and both prod() of the raw counts and totals ** n wrap around
    # silently once they exceed the int64 range (a total of ~1.9e7 cubed already
    # does). Per-term ratios are floats in [0, 1], so their product cannot overflow.
    term_likelihoods = query_rows.div(term_vector_totals, axis="columns")

    weights = (1 / len(topics)) * term_likelihoods.prod()
    return weights

In [24]:
# Unnormalized per-topic weights for an example two-term query.
my_weights = get_weights("affirmative action") 
my_weights

Arts          0.000000e+00
Business      1.751160e-11
Computers     1.080527e-11
Games         0.000000e+00
Health        1.537391e-11
Home          0.000000e+00
News          0.000000e+00
Recreation    0.000000e+00
Reference     3.829977e-11
Regional      5.063528e-12
Science       1.320455e-11
Shopping      2.232524e-11
Society       1.727702e-10
Sports        0.000000e+00
World         5.460148e-13
dtype: float64

In [25]:
# Normalize the topic weights so they sum to 1.
weight_total = my_weights.sum()
if weight_total > 0:
    my_weights = my_weights / weight_total
else:
    # Every topic has weight 0 (each one has a zero count for at least one query
    # term). Dividing by 0 would turn all weights into NaN and poison the
    # personalization vector, so fall back to equal weight per topic.
    my_weights = pd.Series(1 / len(my_weights), index=my_weights.index)
my_weights

Arts          0.000000
Business      0.059181
Computers     0.036517
Games         0.000000
Health        0.051956
Home          0.000000
News          0.000000
Recreation    0.000000
Reference     0.129435
Regional      0.017112
Science       0.044625
Shopping      0.075449
Society       0.583880
Sports        0.000000
World         0.001845
dtype: float64

In [29]:
my_weights.sum()

np.float64(1.0)

In [26]:
# Blend the per-topic personalization vectors into a single vector:
# scale each topic column by that topic's weight, then add across topics per node.
combined_personalization = (
    personal_vectors
    .drop(columns=["ID"])
    .mul(my_weights, axis="columns")
    .sum(axis=1)
)
combined_personalization

0          3.757238e-07
1          2.501244e-08
2          0.000000e+00
3          1.943751e-09
4          7.113673e-07
               ...     
2089095    2.501244e-08
2089096    2.501244e-08
2089097    5.814035e-06
2089098    7.363798e-07
2089099    5.839047e-06
Length: 2089100, dtype: float64

In [27]:
# NumPy array form of the blended vector; this is what gets passed to
# pagerank_power as `personalize`.
cp = combined_personalization.to_numpy()
cp

array([3.75723810e-07, 2.50124446e-08, 0.00000000e+00, ...,
       5.81403460e-06, 7.36379756e-07, 5.83904704e-06], shape=(2089100,))

In [28]:
# Query-specific PageRank: same graph and alpha as before, personalized with the
# query-weighted blend of the topic vectors.
a= pagerank_power(G, p=alpha, personalize=cp, tol=1e-9)
a

array([8.26897543e-08, 8.32494819e-06, 1.09390492e-06, ...,
       1.33027002e-06, 3.17851026e-07, 1.81073010e-06], shape=(2089100,))

In [22]:
# Scale the scores by the node count so that 1.0 corresponds to the average node
# (assumes the scores sum to 1; TODO confirm for pagerank_power output).
a * dim

array([ 0.17274717, 17.39164927,  2.28527676, ...,  2.77906711,
        0.66402258,  3.78279626], shape=(2089100,))