In [1]:
import pandas as pd
import numpy as np

# Load in preprocessed data

In [5]:
# Page metadata table; the saved output shows columns ReversedDomain, URL, Title, Description and ID.
# get_ranking() indexes this frame by ID to turn ranked node IDs into readable rows.
pagelookup_df = pd.read_csv("outputs/pagelookup_df.csv")
pagelookup_df

Unnamed: 0,ReversedDomain,URL,Title,Description,ID
0,ac.accent,http://www.accent.ac/,Accent Services,UK based full service commercial / industrial ...,362
1,ac.acs,http://www.acs.ac/,Anderson County Schools,K-12 public schools in the county (not includi...,383
2,ac.aikido,http://aikido.ac/langnau/index.php,Langnau - Aikidogruppe,Einführungskurse in Zusammenarbeit mit der Vol...,424
3,ac.apec,http://www.apec.ac/,APEC,Birmingham practice provides information on co...,536
4,ac.apt,http://www.apt.ac/,The Association for Psychological Therapies,An independent organization offering accredite...,548
...,...,...,...,...,...
1808224,zw.co.zol,http://www.zol.co.zw/,Zimbabwe Online,Dial-up ISP.,91033670
1808225,zw.gov.parlzim,http://www.parlzim.gov.zw/,Zimbabwe Parliament,"Provides current Bills, order paper, Hansard t...",91033765
1808226,zw.org.csz,http://www.csz.org.zw/,Computer Society of Zimbabwe,Aims to encourage research and development in ...,91033860
1808227,zw.org.nascoh,http://www.nascoh.org.zw/,National Association of Societies for the Care...,The umbrella body for organisations of and for...,91033952


In [6]:
# Cleaned page text with a Topic label and page ID.
# Per the saved output, the same ReversedDomain/ID can appear on several rows, one per Topic.
cleaned_df = pd.read_csv("outputs/cleaned_matched_df.csv")
cleaned_df

Unnamed: 0,ReversedDomain,Content,Topic,ID
0,ac.accent,accent services uk based full service commerci...,Business,362
1,ac.accent,accent services a full service commercial and ...,Regional,362
2,ac.acs,anderson county schools public schools in the ...,Regional,383
3,ac.aikido,langnau aikidogruppe in zusammenarbeit mit der...,World,424
4,ac.apec,apec birmingham practice provides information ...,Business,536
...,...,...,...,...
1888976,zw.gov.parlzim,zimbabwe parliament provides current bills ord...,Regional,91033765
1888977,zw.org.csz,computer society of zimbabwe aims to encourage...,Regional,91033860
1888978,zw.org.nascoh,national association of societies for the care...,Society,91033952
1888979,zw.org.zlhr,zimbabwe lawyers for human rights zlhr non-pro...,Regional,91034088


In [7]:
# Distinct page IDs in first-appearance order, re-indexed 0..n-1.
# get_ranking() uses this Series as a position -> ID mapping when relabelling score vectors,
# so its order must match the row order of the score table.
valid_nodes = cleaned_df["ID"].drop_duplicates().reset_index(drop=True)
valid_nodes

0               362
1               383
2               424
3               536
4               548
             ...   
1808224    91033670
1808225    91033765
1808226    91033860
1808227    91033952
1808228    91034088
Name: ID, Length: 1808229, dtype: int64

In [8]:
# Distinct topic labels; len(topics) supplies the uniform topic prior in get_queried_rank().
topics = cleaned_df["Topic"].unique()
topics

array(['Business', 'Regional', 'World', 'Health', 'Science', 'Sports',
       'Society', 'Arts', 'Computers', 'Recreation', 'Shopping',
       'Reference', 'Games', 'Home', 'News'], dtype=object)

In [9]:
# Term-by-topic table: a Terms column plus one integer column per topic (see the saved output).
# NOTE(review): presumably each cell is the number of occurrences of the term within that
# topic's pages - confirm against the preprocessing step that wrote this file.
term_vectors_df = pd.read_csv("outputs/term_vectors.csv")
term_vectors_df

Unnamed: 0,Terms,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
0,accent,18,69,6,0,4,0,0,1,2,113,2,33,3,2,70
1,services,556,24694,5796,80,5845,294,77,1119,785,126547,2527,762,8888,653,8714
2,uk,1352,4714,708,114,486,143,56,922,126,5788,594,1360,788,535,91
3,based,3808,10849,2961,268,1071,154,210,1910,264,10029,951,1384,3024,1780,124
4,full,265,3095,408,42,636,30,12,644,92,9464,218,475,401,371,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1198962,government-controlled,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1198963,tanganda,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1198964,utande,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1198965,nascoh,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [10]:
# Column-wise sum over all terms: one total per topic.
# get_queried_rank() uses these totals as the denominator of each term's per-topic ratio.
term_vector_totals = term_vectors_df.drop(columns=["Terms"]).sum()
term_vector_totals

Arts           1026946
Business       2673147
Computers       639011
Games           122926
Health          484173
Home            109627
News             60290
Recreation      818176
Reference       243254
Regional      10746814
Science         452854
Shopping        808629
Society        1401758
Sports          634039
World         12130959
dtype: int64

In [11]:
# Table keyed by ID with one float column per topic (see the saved output).
# NOTE(review): not referenced again in the visible part of this notebook.
personal_vectors_df = pd.read_csv("outputs/personalization_vectors_df.csv")
personal_vectors_df

Unnamed: 0,ID,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
0,362,0.0,0.000007,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
1,383,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
2,424,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000001
3,536,0.0,0.000007,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
4,548,0.0,0.000000,0.0,0.0,0.000045,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1808224,91033670,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
1808225,91033765,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
1808226,91033860,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000002,0.0,0.0,0.000000,0.0,0.000000
1808227,91033952,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000014,0.0,0.000000


In [12]:
# Score table: ID, one float column per topic, and an Unbiased column (see the saved output).
# get_queried_rank() blends the topic columns; ID and Unbiased are dropped there.
# NOTE(review): presumably topic-biased and unbiased PageRank scores - confirm against the
# notebook that wrote this file.
pr_vectors_df = pd.read_csv("outputs/pr_vectors_df.csv")
pr_vectors_df

Unnamed: 0,ID,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World,Unbiased
0,362,0.000000e+00,2.867844e-06,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,5.790263e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.011731e-07
1,383,3.552078e-07,2.488685e-07,1.595801e-07,1.558921e-07,4.332948e-07,3.751190e-07,4.472234e-07,2.191070e-07,8.088928e-07,2.022850e-06,3.894907e-07,2.471411e-07,3.662677e-07,2.151713e-07,7.745668e-08,8.037431e-07
2,424,2.559853e-11,1.772450e-11,1.752886e-11,2.302014e-11,1.102922e-11,1.174043e-11,2.047072e-11,3.880868e-11,1.250364e-11,1.929843e-11,1.920116e-11,3.092021e-11,1.731083e-11,2.109874e-11,6.310186e-07,2.582948e-07
3,536,7.311693e-11,2.867865e-06,2.769328e-11,3.008064e-11,1.097441e-10,1.080446e-10,2.328975e-11,2.769076e-10,2.336614e-11,3.920863e-09,3.699444e-11,7.537096e-11,4.380096e-11,2.568643e-11,1.368259e-11,2.025562e-07
4,548,1.151250e-08,1.518159e-08,9.996378e-09,6.776062e-09,1.500085e-05,1.267889e-08,2.493297e-08,8.153355e-09,1.656175e-08,6.066478e-08,1.208807e-08,9.676475e-09,4.752566e-08,1.078235e-08,3.964186e-09,2.380104e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1808224,91033670,5.578693e-07,2.849916e-07,6.358736e-07,3.031884e-07,2.670043e-07,3.447225e-07,8.843363e-07,2.582276e-07,5.597177e-07,1.426333e-06,7.059648e-07,2.773599e-07,5.215272e-07,1.501077e-06,3.392927e-07,7.658747e-07
1808225,91033765,5.142124e-07,4.583613e-07,5.854430e-07,4.543513e-07,5.477401e-07,5.744776e-07,1.074367e-06,4.573495e-07,1.536479e-06,1.217541e-06,1.122675e-06,4.459816e-07,1.285956e-06,4.575942e-07,5.541250e-07,8.163580e-07
1808226,91033860,1.228591e-07,1.068073e-07,2.786091e-07,1.468843e-07,1.096799e-07,1.318228e-07,1.433357e-07,1.154340e-07,1.156378e-07,9.380712e-07,1.068380e-07,1.187885e-07,1.691245e-07,1.294425e-07,1.089917e-07,4.044275e-07
1808227,91033952,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,4.348546e-06,0.000000e+00,0.000000e+00,2.011731e-07


In [13]:
# # pr_vectors_df = pd.read_csv("pr_vectors_df.csv")
# pr_vectors_df * 2089099

Unnamed: 0,ID,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World,Unbiased
0,756253838,0.000000,5.991210,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.209643,0.000000,0.000000,0.000000,0.000000,0.000000,0.420271
1,800124917,0.742064,0.519911,0.333379,0.325674,0.905196,0.783661,0.934294,0.457736,1.689857,4.225934,0.813685,0.516302,0.765170,0.449514,0.161815,1.679099
2,885777976,0.000053,0.000037,0.000037,0.000048,0.000023,0.000025,0.000043,0.000081,0.000026,0.000040,0.000040,0.000065,0.000036,0.000044,1.318260,0.539603
3,1119757064,0.000153,5.991253,0.000058,0.000063,0.000229,0.000226,0.000049,0.000578,0.000049,0.008191,0.000077,0.000157,0.000092,0.000054,0.000029,0.423160
4,1144826252,0.024051,0.031716,0.020883,0.014156,31.338269,0.026487,0.052087,0.017033,0.034599,0.126735,0.025253,0.020215,0.099286,0.022525,0.008282,0.497227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1808224,190178348963330,1.165444,0.595376,1.328403,0.633391,0.557798,0.720159,1.847466,0.539463,1.169306,2.979752,1.474830,0.579432,1.089522,3.135899,0.708816,1.599988
1808225,190178547427735,1.074241,0.957562,1.223048,0.949185,1.144283,1.200141,2.244458,0.955448,3.209858,2.543565,2.345380,0.931700,2.686489,0.955960,1.157622,1.705453
1808226,190178745892140,0.256665,0.223131,0.582042,0.306856,0.229132,0.275391,0.299442,0.241153,0.241579,1.959724,0.223195,0.248161,0.353318,0.270418,0.227694,0.844889
1808227,190178938089248,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9.084542,0.000000,0.000000,0.420271


# Query 

In [31]:
def get_queried_rank(query):
    """Compute query-sensitive topic weights and the blended score for every page.

    Every whitespace-separated query term that occurs in ``term_vectors_df['Terms']``
    contributes its per-topic count; terms that are not in the vocabulary are ignored.
    The weight of topic ``j`` is the uniform prior ``1 / len(topics)`` multiplied by the
    product, over the matched terms, of ``count[term, j] / term_vector_totals[j]``; the
    weights are then normalized to sum to 1.

    Parameters
    ----------
    query : str
        Free-text query. Terms are split on any run of whitespace.

    Returns
    -------
    weights : pandas.Series
        Normalized weight per topic, indexed by topic column name. Falls back to
        uniform weights when no topic has a non-zero likelihood for the query.
    ts_pagerank : pandas.Series
        For each row of ``pr_vectors_df``, the weighted sum of its topic columns
        (the ``ID`` and ``Unbiased`` columns are excluded).
    """
    # split() without an argument collapses repeated whitespace, so "a  b" does not
    # produce empty-string terms.
    query_list = query.split()
    query_rows = term_vectors_df[term_vectors_df['Terms'].isin(query_list)].drop(columns=['Terms'])

    # =  P(c_j) * (D_j_t1 / Sum_i D_ji) * (D_j_t2 / Sum_i D_ji) * ... * (D_j_tn / Sum_i D_ji)
    # Divide per term, in floating point, *before* multiplying. The algebraically equal
    # prod(counts) / totals ** n is evaluated in int64 and silently wraps around once
    # the query has three or more matched terms.
    term_likelihoods = query_rows.astype(float) / term_vector_totals.astype(float)
    weights = (1 / len(topics)) * term_likelihoods.prod()

    total = weights.sum()
    if total > 0:
        weights = weights / total
    else:
        # The matched terms never co-occur in any topic: every product is zero and
        # normalizing would yield NaN weights. Use uniform weights instead, which is
        # also what a query with no matched terms produces.
        weights = pd.Series(1.0 / len(weights), index=weights.index)

    # The Series index (topic names) is aligned against the frame's columns.
    topic_scores = pr_vectors_df.drop(columns=["ID", "Unbiased"])
    ts_pagerank = topic_scores.mul(weights, axis="columns").sum(axis=1)

    return weights, ts_pagerank

In [20]:
def get_ranking(ts_pr):
    """Return the page lookup rows ordered from the highest to the lowest score.

    ``ts_pr`` is a score Series whose index labels are looked up in ``valid_nodes``
    to obtain page IDs; the rows of ``pagelookup_df`` (indexed by ``ID``) are then
    returned in descending score order.
    """
    lookup_by_id = pagelookup_df.set_index('ID')

    # Relabel the scores with page IDs, then order them best-first.
    scores_by_id = ts_pr.rename(index=valid_nodes)
    best_first_ids = scores_by_id.sort_values(ascending=False).index

    return lookup_by_id.loc[best_first_ids]

In [24]:
# NOTE(review): in the visible notebook ts_pagerank is first assigned by the "chess" query
# cell further down, so on a fresh kernel this cell raises NameError when run in order.
ts_pagerank

0          5.372796e-09
1          1.682945e-07
2          3.883714e-10
3          3.060389e-09
4          7.143277e-09
               ...     
1808224    3.090619e-07
1808225    4.612963e-07
1808226    1.493460e-07
1808227    1.742240e-09
1808228    4.687004e-06
Length: 1808229, dtype: float64

In [32]:
# Score every page for the single-term query "chess", then look up readable page details
# in descending score order.
weights, ts_pagerank = get_queried_rank("chess")
# weights, ts_pagerank
results = get_ranking(ts_pagerank)
results

Unnamed: 0,ReversedDomain,URL,Title,Description
77324560,org.gmpg,http://gmpg.org/xfn/,XFN,XHTML Friends Network hyperlink protocol for b...
24402307,com.gravatar,http://www.gravatar.com/,Gravatar,Lets weblogs and similar sites display user-pr...
2532360,be.youtu,https://youtu.be/p9jEL_LgAm4,Qu'est-ce qu'un mind map?,"Tony Buzan, l'inventeur de la méthode, présent..."
61412448,gl.goo,https://goo.gl/nTXC3f,"Giambiasi, Ludovic",Consultant formateur en informatique.
42264752,com.statcounter,http://www.statcounter.com/,StatCounter.com,Detailed visitor statistics and a choice of gr...
...,...,...,...,...
21709718,com.exer-fit,http://www.exer-fit.com/,Exer-Fit,Provides resources and expertise for planning ...
36139673,com.par3tours,http://par3tours.com/,The Par 3 Tour LLC,Results and information for events held only o...
36139820,com.para-equestrian,http://www.para-equestrian.com/,Para Equestrian Rider,"Profile of Kathryn Wheelock, her horses, perfo..."
12278458,com.audraerwin,http://audraerwin.com/,Audra Erwin,A certified coach that specializes in fertilit...


In [30]:
# Print the whitespace-tokenized Content of every cleaned_df row for the org.gmpg domain.
for t in cleaned_df.loc[cleaned_df["ReversedDomain"] == "org.gmpg", "Content"].str.split():
    print(t)

['xfn', 'xhtml', 'friends', 'network', 'hyperlink', 'protocol', 'for', 'blogs', 'and', 'blogrolls']


In [22]:
# First 30 rows of the ranking held in `results`.
results.head(30)

Unnamed: 0,ReversedDomain,URL,Title,Description
77324560,org.gmpg,http://gmpg.org/xfn/,XFN,XHTML Friends Network hyperlink protocol for b...
2532360,be.youtu,https://youtu.be/p9jEL_LgAm4,Qu'est-ce qu'un mind map?,"Tony Buzan, l'inventeur de la méthode, présent..."
24402307,com.gravatar,http://www.gravatar.com/,Gravatar,Lets weblogs and similar sites display user-pr...
61412448,gl.goo,https://goo.gl/nTXC3f,"Giambiasi, Ludovic",Consultant formateur en informatique.
42264752,com.statcounter,http://www.statcounter.com/,StatCounter.com,Detailed visitor statistics and a choice of gr...
42124715,com.squarespace,http://www.squarespace.com/,"Squarespace, Inc.",Focus on writing weblog entries and publishing...
47938851,com.wufoo,http://wufoo.com/,Wufoo,"Online survey, form and poll building software."
14906471,com.brsgolf,http://www.brsgolf.com/,BRS Golf,Offers software products for golf clubs. Inclu...
43485988,com.technorati,http://www.technorati.com/,Technorati,Real-time search for user-generated media (inc...
78353324,org.networkadvertising,http://www.networkadvertising.org/,Network Advertising Initiative,"""NAI is a group of third party network adverti..."


In [37]:
# Positional row indices of cleaned_df whose ID equals the page under inspection.
# A stray literal `69309242` used to trail the call on the same line, which made the cell a
# SyntaxError; it is kept here only as a note (possibly another ID to inspect).
a = np.where(cleaned_df["ID"] == 21860195)
a

(array([481025, 481026, 481027, 481028, 481029, 481030, 481031, 481032,
        481033, 481034, 481035, 481036, 481037, 481038, 481039]),)

In [50]:
# Content of the row at position 481025, the first position listed in the saved output of
# the np.where lookup cell above.
b = cleaned_df.iloc[481025]["Content"]
b

"['comic', 'kings', 'a', 'virginia', 'beach', 'virginia', 'retailer', 'selling', 'comic', 'books', 'and', 'other', 'pop', 'culture', 'merchandise', 'new', 'hampshire', 'potter', 's', 'guild', 'membership', 'information', 'upcoming', 'workshops', 'and', 'events', 'newsletters', 'and', 'officers', 'and', 'bylaws', 'ventura', 'county', 'potters', 'guild', 'a', 'non-profit', 'organization', 'promoting', 'ceramic', 'art', 'and', 'crafts', 'in', 'the', 'community', 'member', 's', 'gallery', 'and', 'ceramic', 'links', 'california', 'north', 'carolina', 'society', 'of', 'goldsmiths', 'non-profit', 'organization', 'supporting', 'north', 'carolina', 'area', 'metalsmiths', 'zeck', 'dana', 'a', 'weaver', 'in', 'south', 'new', 'jersey', 'includes', 'background', 'upcoming', 'shows', 'and', 'shops', 'where', 'items', 'can', 'be', 'found', 'woodturners', 'of', 'st', 'louis', 'local', 'chapter', 'of', 'the', 'american', 'association', 'of', 'woodturners', 'includes', 'meeting', 'details', 'and', 'sche

In [116]:
# get_queried_rank("affirmative action")
# NOTE(review): the only live statement in this cell is the bare float literal below, so the
# Series shown in the saved output cannot have been produced by the cell as written.
9.444376e-08

0          5.838182e-11
1          5.152582e-09
2          6.753812e-10
3          2.455216e-12
4          1.016845e-09
               ...     
2089095    1.199981e-10
2089096    1.600247e-10
2089097    8.142133e-10
2089098    2.002359e-10
2089099    1.110345e-09
Length: 2089100, dtype: float64

In [72]:
# Results cache keyed by query string; starts empty.
queried_tspr = dict()

# Benchmark queries, one per line.
queries = [
    "affirmative action",
    "alcoholism",
    "amusement parks",
    "architecture",
    "bicycling",
    "blues",
    "cheese",
    "citrus groves",
    "classical guitar",
    "computer vision",
    "cruises",
    "death valley",
    "field hockey",
    "gardening",
    "graphic design",
    "gulf war",
    "hiv",
    "java",
    "lipari",
    "lyme disease",
    "mutual funds",
    "national parks",
    "parallel architecture",
    "recycling cans",
    "rock climbing",
    "san francisco",
    "shakespeare",
    "stamp collecting",
    "sushi",
    "table tennis",
    "telecommuting",
    "vintage cars",
    "volcano",
    "zen buddhism",
    "zener",
]

In [73]:
# Run every benchmark query and store what get_queried_rank returns - a
# (weights, ts_pagerank) tuple - under the query string.
for query in queries:
    queried_tspr[query] = get_queried_rank(query)

In [75]:
# NOTE(review): get_queried_rank returns a (weights, ts_pagerank) tuple, so this entry is a
# tuple; the single Series in the saved output presumably dates from an earlier version.
queried_tspr["alcoholism"]

0          1.928142e-07
1          1.819529e-04
2          2.111412e-05
3          5.692279e-08
4          2.996200e-05
               ...     
2089095    4.029783e-06
2089096    4.772189e-06
2089097    2.444343e-06
2089098    3.068040e-06
2089099    1.139420e-05
Length: 2089100, dtype: float64

In [78]:
# Scratch check of the weight formula: select the term-vector rows for "test" and "me".
test = term_vectors_df[term_vectors_df['Terms'].isin(["test", "me"])]
test

Unnamed: 0,Terms,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
1064,test,130,1260,608,32,123,205,0,133,142,1160,500,107,163,74,866
4046,me,438,58,566,105,50,15,4,43,18,293,19,71,139,21,520


In [79]:
# Keep only the numeric topic columns. errors='ignore' makes the cell safe to re-run:
# without it a second execution raises KeyError because 'Terms' has already been dropped.
test = test.drop(columns=['Terms'], errors='ignore')

In [80]:
# Display the frame after the Terms column was dropped.
test

Unnamed: 0,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
1064,130,1260,608,32,123,205,0,133,142,1160,500,107,163,74,866
4046,438,58,566,105,50,15,4,43,18,293,19,71,139,21,520


In [90]:
# Per-topic likelihood of the scratch query: product of the term counts divided by
# (topic total) ** (number of query terms).
# `query_rows` exists only as a local inside get_queried_rank, so referencing it here raised
# NameError on a fresh kernel; the number of query terms is the number of rows in `test`.
vecs = test.prod() / (term_vector_totals ** len(test))
vecs

Arts          7.515318e-09
Business      6.176390e-09
Computers     2.187292e-07
Games         2.325670e-08
Health        8.976221e-09
Home          3.685100e-08
News          0.000000e+00
Recreation    3.820641e-09
Reference     6.555416e-09
Regional      1.323161e-09
Science       6.969071e-09
Shopping      8.050861e-09
Society       2.461302e-09
Sports        1.054670e-09
World         1.302338e-09
dtype: float64

In [91]:
# Apply the uniform topic prior. The prior is derived from len(topics), as in
# get_queried_rank, instead of hard-coding the current number of topics (15).
(1 / len(topics)) * vecs

Arts          5.010212e-10
Business      4.117593e-10
Computers     1.458195e-08
Games         1.550447e-09
Health        5.984147e-10
Home          2.456733e-09
News          0.000000e+00
Recreation    2.547094e-10
Reference     4.370278e-10
Regional      8.821076e-11
Science       4.646047e-10
Shopping      5.367241e-10
Society       1.640868e-10
Sports        7.031135e-11
World         8.682252e-11
dtype: float64