In [1]:
import os
import numpy as np
import pickle
from collections import Counter

# This script downloads and reformats the Wikipedia link dataset found at: http://snap.stanford.edu/data/index.html

Run third

1. Iterate through the edges and, for each edge whose two endpoint pages were both retained, record a link from every category of the source page to every category of the target page.
2. Reduce this to a count per target category linked to.

### Input
`wiki-topcats.txt`
This file lists hyperlinks between page indices. Each line is a single string of the form `301 606`, which indicates a hyperlink (directed edge) from page 301 to page 606.

`categories_to_names2020.p`
Dictionary that maps each new category index to category names.

`pages_to_categories2020.p`
Dictionary that maps each page idx to included category idxs, reindexed as in generate_domains.ipynb


### Output
`categories_to_categories2020.p`
Dict that contains the hyperlink information, abstracted to category level. Each key is a source category index (start of an edge);
each value is a dictionary mapping a target category index (end of an edge) to the number of hyperlinks.


In [2]:
# Top-level working directory; all other paths are built relative to it.
wDir = '.'
# Directory the data is read from and stored in.
# The files are large: unlink them from git if necessary by adding this
# directory to the .gitignore in wDir.
dataDir = '{0}/WIKIdata'.format(wDir)

# Show what is currently in the data directory.
os.listdir(dataDir)

['pages_to_categories2020.p',
 'wiki-topcats.txt',
 'wiki-topcats-categories.txt',
 '.DS_Store',
 'numDomainsPerCatRaw.npy',
 'categories_to_domains2020.p',
 'matrices2020',
 'categories_to_names2020.p',
 'wiki-topcats-page-names.txt',
 'categories_to_categories2020.p',
 'domains_to_names2020.p',
 'domains2020.npy',
 'figs2020']

In [3]:
# Edge list of page-to-page hyperlinks (one "source target" pair per line).
linkFile = '{0}/wiki-topcats.txt'.format(dataDir)

# Mapping from page index to the (re-indexed) categories kept for that page.
with open('{0}/pages_to_categories2020.p'.format(dataDir), 'rb') as pfile:
    pagesCatsMappings = pickle.load(pfile)

# Mapping from (filtered) category index to category name.
with open('{0}/categories_to_names2020.p'.format(dataDir), 'rb') as pfile:
    categoryNames = pickle.load(pfile)

# Sorted indices of the kept categories; they must be exactly 0..N-1.
categoryIdxs = np.sort(np.int_(list(categoryNames)))
assert np.all(categoryIdxs == np.arange(len(categoryIdxs)))

# Number of nodes in the graph (see the SNAP website, or count them).
r = 1791488

In [4]:
# linkDict maps each source (base) category to a list of the target categories
# that its pages hyperlink to. The lists are non-unique: every retained edge
# contributes one entry per (source category, target category) pair.
linkDict = {}
for category in categoryIdxs:
    linkDict[category] = []
validEdges = 0  # number of edges whose two endpoints are both retained pages

# iterate over all hyperlinks
with open(linkFile, 'r') as f:
    for i, line in enumerate(f):
        # for monitoring, print every 1,000,000 lines (edges)
        if i % 1000000 == 0:
            print('at line: {0}'.format(i))

        # Split the edge into its endpoints. Splitting on any whitespace also
        # strips the trailing newline; blank or malformed lines are skipped.
        tokens = line.split()
        if len(tokens) != 2:
            continue
        # Built-in int replaces np.int, which was only an alias for it and has
        # been removed from NumPy (deprecated in 1.20, removed in 1.24).
        edge = [int(tokens[0]), int(tokens[1])]

        # Keep the edge only if both nodes are in the retained pages. Only the
        # page lookup is guarded, so unrelated errors are no longer hidden.
        try:
            baseCats = pagesCatsMappings[edge[0]]  # all bases (source page)
            targetCats = pagesCatsMappings[edge[1]]  # all targets (as directed)
        except KeyError:
            continue

        # Link every base category to every target category. A category that is
        # missing from linkDict now raises KeyError instead of being swallowed
        # after a partial update.
        for bCat in baseCats:
            linkDict[bCat].extend(targetCats)
        validEdges += 1

at line: 0
at line: 1000000
at line: 2000000
at line: 3000000
at line: 4000000
at line: 5000000
at line: 6000000
at line: 7000000
at line: 8000000
at line: 9000000
at line: 10000000
at line: 11000000
at line: 12000000
at line: 13000000
at line: 14000000
at line: 15000000
at line: 16000000
at line: 17000000
at line: 18000000
at line: 19000000
at line: 20000000
at line: 21000000
at line: 22000000
at line: 23000000
at line: 24000000
at line: 25000000
at line: 26000000
at line: 27000000
at line: 28000000


In [5]:
# Collapse each category's list of linked categories into per-category counts.
linkDictCounter = {
    baseCat: Counter(linkedCats) for baseCat, linkedCats in linkDict.items()
}

# Save the dict that maps category idx (key) to {category index: hyperlink count}.
outPath = '{0}/categories_to_categories2020.p'.format(dataDir)
with open(outPath, 'wb') as pfile:
    pickle.dump(linkDictCounter, pfile)

[950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0, 0, 950, 950, 0,

[28, 1333, 1360, 92, 92, 117, 92, 92, 92, 92, 117, 93, 117, 92, 117, 92, 92, 92, 117, 1468, 1469, 1468, 1469, 1468, 93, 85, 85, 85, 92, 92, 92, 92, 92, 92, 117, 92, 92, 92, 117, 117, 648, 92, 92, 117, 92, 92, 1516, 92, 117, 117, 92, 92, 92, 117, 92, 1518, 117, 117, 117, 92, 117, 92, 117, 92, 117, 92, 1516, 92, 92, 92, 117, 117, 92, 117, 92, 117, 92, 117, 92, 117, 92, 117, 92, 117, 117, 1519, 92, 92, 117, 117, 441, 1316, 1080, 1099, 94, 692, 85, 582, 85, 85, 92, 92, 117, 92, 117, 65, 92, 117, 92, 94, 117, 92, 117, 117, 94, 734, 1468, 734, 735, 692, 120, 92, 1516, 1516, 692, 117, 1080, 852, 1516, 1333, 18, 65, 93, 31, 85, 85, 28, 65, 85, 85, 582, 85, 30, 85, 85, 85, 30, 85, 92, 92, 92, 117, 92, 92, 65, 92, 92, 117, 92, 92, 1516, 92, 92, 92, 92, 1516, 92, 92, 94, 92, 117, 92, 930, 93, 92, 94, 96, 1704, 1516, 92, 95, 92, 96, 561, 1519, 95, 92, 117, 117, 118, 117, 734, 1468, 1468, 1469, 734, 1468, 804, 94, 120, 692, 1516, 92, 1516, 117, 734, 1468, 1469, 1618, 1468, 82, 82, 85, 852, 85, 82, 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Counter({449: 597, 797: 316, 1589: 203, 491: 110, 505: 105, 507: 105, 734: 67, 1136: 63, 1468: 58, 375: 56, 1919: 56, 1649: 56, 796: 50, 1776: 31, 220: 24, 450: 23, 1317: 14, 1774: 12, 287: 12, 426: 12, 563: 8, 452: 8, 468: 5, 1316: 5, 1652: 5, 466: 5, 568: 5, 428: 5, 430: 5, 878: 4, 579: 4, 420: 4, 1651: 4, 1778: 4, 1779: 3, 1519: 3, 376: 3, 1756: 3, 564: 3, 1622: 3, 1309: 3, 567: 2, 1650: 2, 1773: 2, 490: 2, 1953: 2, 1660: 2, 2065: 2, 1655: 2, 616: 2, 916: 2, 1768: 2, 1044: 2, 497: 1, 582: 1, 580: 1, 1451: 1, 1680: 1, 561: 1, 164: 1, 784: 1, 870: 1, 1754: 1, 1757: 1, 1517: 1, 1366: 1, 1559: 1, 1524: 1, 1162: 1, 134: 1, 256: 1, 1460: 1, 97: 1, 1941: 1, 917: 1, 1659: 1, 1105: 1, 1165: 1, 1784: 1, 596: 1, 1708: 1, 1526: 1, 33: 1, 1811: 1, 959: 1, 966: 1, 1205: 1, 1450: 1, 1787: 1, 1777: 1, 986: 1, 365: 1, 1291: 1, 1292: 1, 1428: 1, 1469: 1, 1138: 1, 1626: 1, 987: 1})
[220, 450, 797, 505, 507, 797, 796, 797, 450, 450, 450, 420, 616, 450, 450, 450, 420, 1776, 1777, 1779, 1780, 375, 220, 

[546, 546, 545, 546, 544, 546, 547, 547, 547, 547, 534, 828, 546, 546, 546, 546, 547, 547, 538, 546, 546, 546, 547, 547, 547, 534, 828, 546, 546, 547, 546, 547, 546, 546, 545, 546, 546, 546, 546, 544, 546, 547, 547, 547, 547, 534, 828, 546, 546, 546, 546, 547, 546, 547, 533, 546, 546, 547, 546, 546, 546, 546, 544, 546, 547, 547, 547, 546, 547, 534, 828, 546, 448, 734, 530, 534, 828, 546, 546, 1647, 919, 448, 542, 546, 448, 380, 533, 546, 534, 828, 534, 919, 546, 547, 546, 546, 547, 545, 546, 546, 546, 546, 544, 546, 547, 547, 547, 547, 546, 547, 534, 828, 546, 546, 547, 544, 546, 546, 547, 547, 546, 546, 546, 547, 530, 546, 547, 546, 547, 547, 546, 547, 534, 828, 546, 544, 546, 546, 547, 546, 546, 547, 546, 547, 546, 530, 1652, 546, 547, 546, 547, 544, 546, 546, 547, 534, 828, 546, 546, 546, 546, 546, 547, 546, 546, 547, 547, 547, 545, 547, 547, 547, 546, 547, 534, 828, 546, 547, 546, 544, 546, 546, 547, 547, 546, 546, 530, 547, 547, 546, 547, 534, 828, 546, 546, 546, 546, 546, 547, 54

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Counter({649: 1059, 220: 390, 115: 379, 650: 360, 1526: 223, 7: 160, 20: 159, 13: 138, 1704: 108, 797: 102, 60: 93, 116: 89, 12: 86, 33: 86, 376: 74, 1468: 74, 796: 69, 687: 64, 79: 62, 780: 60, 94: 59, 1030: 58, 59: 57, 1461: 55, 6: 54, 988: 52, 661: 51, 177: 48, 715: 46, 1729: 46, 586: 45, 167: 42, 34: 42, 62: 41, 782: 39, 779: 38, 1667: 37, 9: 36, 734: 36, 164: 36, 2: 35, 684: 35, 5: 34, 1042: 34, 10: 33, 1469: 33, 38: 31, 651: 31, 85: 30, 662: 30, 166: 30, 1529: 30, 1034: 28, 170: 28, 1524: 28, 15: 27, 1707: 27, 856: 26, 721: 26, 1080: 25, 14: 24, 648: 23, 1043: 23, 1031: 22, 1525: 22, 1260: 21, 96: 21, 182: 21, 185: 21, 1332: 20, 1779: 20, 168: 20, 8: 19, 2053: 19, 66: 18, 16: 18, 178: 18, 774: 18, 11: 17, 49: 17, 253: 17, 924: 16, 21: 15, 854: 15, 19: 14, 660: 14, 1690: 14, 1928: 14, 1911: 13, 2043: 13, 43: 13, 1770: 13, 183: 13, 679: 13, 1196: 13, 505: 13, 795: 13, 1521: 12, 1335: 12, 80: 12, 84: 12, 813: 12, 17: 12, 771: 12, 720: 12, 588: 12, 265: 12, 668: 12, 52: 11, 735: 11,

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Counter({856: 324, 220: 308, 854: 265, 855: 200, 387: 191, 1912: 89, 1911: 68, 376: 64, 1375: 58, 1377: 51, 167: 50, 782: 48, 1334: 45, 178: 44, 173: 43, 1667: 42, 1461: 42, 1521: 41, 185: 41, 649: 37, 1369: 33, 780: 32, 844: 31, 1389: 29, 1388: 27, 166: 26, 1164: 25, 1468: 25, 408: 25, 180: 25, 1922: 25, 390: 24, 1376: 24, 192: 23, 34: 22, 391: 22, 182: 21, 1928: 20, 165: 19, 867: 18, 164: 18, 1260: 18, 179: 16, 175: 16, 174: 16, 1366: 16, 1526: 15, 991: 15, 981: 15, 183: 15, 621: 15, 779: 15, 850: 15, 188: 14, 171: 14, 866: 14, 840: 14, 988: 14, 96: 13, 65: 13, 389: 12, 170: 12, 1469: 12, 838: 12, 1558: 12, 650: 12, 583: 12, 1525: 11, 393: 11, 177: 11, 1934: 11, 848: 11, 1910: 11, 184: 10, 617: 10, 789: 10, 1335: 10, 849: 10, 14: 9, 797: 9, 1332: 9, 1339: 9, 172: 9, 304: 9, 1923: 9, 909: 9, 960: 9, 115: 9, 1196: 9, 2032: 9, 1080: 9, 908: 9, 1524: 8, 85: 8, 1527: 8, 1337: 8, 189: 8, 168: 8, 187: 8, 620: 8, 1350: 8, 1197: 8, 7: 8, 94: 8, 15: 7, 1920: 7, 1652: 7, 190: 7, 1361: 7, 784: 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Counter({220: 461, 1291: 386, 1292: 255, 1468: 253, 1642: 159, 1643: 122, 1469: 101, 734: 90, 1288: 66, 986: 65, 1299: 62, 992: 44, 1461: 42, 1290: 33, 782: 30, 88: 29, 1444: 28, 1297: 26, 1286: 25, 376: 24, 579: 23, 987: 22, 1791: 22, 182: 22, 580: 21, 618: 21, 616: 20, 157: 20, 622: 20, 788: 20, 1657: 20, 1264: 20, 1135: 19, 183: 19, 1296: 19, 1647: 18, 1452: 18, 334: 17, 402: 17, 1919: 17, 1287: 17, 323: 16, 639: 16, 184: 16, 787: 16, 195: 16, 779: 16, 1294: 16, 1300: 16, 1835: 15, 1741: 15, 304: 15, 1298: 15, 1274: 15, 1284: 14, 391: 14, 1315: 14, 784: 14, 1295: 14, 2008: 14, 164: 13, 1652: 13, 721: 13, 1934: 13, 1205: 12, 145: 12, 1837: 12, 989: 12, 260: 12, 903: 12, 276: 11, 148: 11, 1838: 11, 1779: 11, 1997: 11, 1942: 11, 1305: 11, 1430: 11, 1989: 11, 1937: 10, 1260: 10, 1455: 10, 174: 10, 110: 10, 329: 10, 192: 10, 248: 10, 1462: 10, 1474: 10, 259: 10, 1267: 10, 1559: 9, 1457: 9, 638: 9, 880: 9, 783: 9, 258: 9, 652: 9, 864: 9, 1640: 9, 1167: 9, 1946: 9, 1282: 9, 1311: 9, 792: 

[87, 87, 85, 1167, 87, 87, 87, 1132, 87, 220, 1132, 1309, 469, 87, 1291, 1299, 158, 1451, 579, 580, 1453, 157, 992, 734, 1468, 388, 395, 426, 734, 1468, 1468, 1468, 220, 30, 1338, 87, 391, 376, 1339, 1461, 1543, 1334, 1364, 728, 148, 1339, 1350, 1461, 220, 376, 1461, 1366, 878, 1291, 1292, 365, 1659, 377, 473, 878, 1736, 167, 174, 840, 1776, 445, 643, 1779, 1164, 1978, 1981, 845, 380, 1650, 1645, 1136, 426, 427, 97, 420, 632, 1891, 1316, 1316, 1316, 734, 1468, 185, 220, 989, 323, 616, 618, 976, 426, 1651, 734, 1468, 563, 1654, 1891, 426, 734, 1468, 852, 427, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1288, 1291, 1316, 1171, 1654, 1292, 1291, 507, 220, 1638, 986, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1317, 1288, 1291, 1316, 1654, 734, 1468, 1638, 2065, 1291, 1317, 429, 429, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1654, 1317, 1316, 1171, 1648, 734, 1468, 1638, 2065, 1291, 986, 1468, 1654, 1654, 1654, 1654, 1654, 1654,

[649, 649, 94, 146, 220, 897, 1747, 220, 1747, 220, 220, 924, 220, 220, 167, 734, 1468, 220, 924, 1340, 95, 162, 1346, 1340, 1346, 1346, 220, 1461, 436, 1525, 890, 1770, 1770, 649, 23, 1349, 220, 1747, 220, 1747, 220, 1747, 220, 220, 220, 1781, 122, 162, 639, 185, 780, 260, 701, 734, 1468, 220, 220, 1643, 220, 896, 1770, 1777, 734, 1468, 246, 1099, 1328, 1680, 649, 220, 1747, 926, 924, 649, 162, 890, 734, 1468, 448, 734, 1704, 734, 1468, 1291, 1346, 220, 1525, 649, 924, 220, 220, 220, 220, 650, 649, 177, 220, 1746, 734, 1468, 1776, 1776, 1779, 567, 220, 1745, 1746, 1778, 1779, 1778, 1779, 1779, 1780, 1776, 1779, 1776, 1779, 1777, 1779, 1780, 1780, 162, 924, 296, 639, 436, 220, 1747, 1774, 94, 162, 639, 447, 1746, 1773, 1776, 1774, 1776, 1779, 220, 1747, 890, 623, 734, 1468, 890, 220, 33, 116, 13, 12, 2, 72, 1332, 72, 220, 1332, 40, 408, 964, 22, 80, 582, 858, 2053, 162, 924, 1286, 1291, 162, 868, 734, 1468, 220, 650, 220, 157, 1747, 246, 268, 28, 1326, 162, 2053, 220, 276, 1394, 1737, 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

