In [None]:
def get_files():
    """Open the three wiki-topcats data files for reading.

    The files are opened in text mode by bare file name, so they are looked
    up relative to the current working directory. The open handles are
    returned as-is; this function does not close them.

    Returns:
        A 3-tuple of open file objects, in this order:
        the edge list of all connected nodes ('wiki-topcats.txt'),
        the page names of every node ('wiki-topcats-page-names.txt'),
        and the categories with all pages within each category
        ('wiki-topcats-categories.txt').
    """
    file_names = (
        'wiki-topcats.txt',
        'wiki-topcats-page-names.txt',
        'wiki-topcats-categories.txt',
    )
    topcats, topcats_page_names, topcats_categories = [
        open(file_name, 'r') for file_name in file_names
    ]
    return topcats, topcats_page_names, topcats_categories

def load_data(topcats):
    """Parse the edge-list source into a NumPy integer array.

    Args:
        topcats: Whatever ``np.loadtxt`` accepts as its input; the startup
            cell in this file passes the open 'wiki-topcats.txt' handle.

    Returns:
        The array produced by ``np.loadtxt(topcats, dtype=int)``.
    """
    return np.loadtxt(topcats, dtype=int)

#Function to collect every category name into a list
def all_categories(cat_list, lines):
    """Append the category name found on each line to ``cat_list``.

    The category name is the text before the first space of the line after
    leading/trailing whitespace has been stripped.

    Args:
        cat_list: List that receives the names; it is modified in place.
        lines: Iterable of text lines, one category per line.

    Returns:
        The same ``cat_list`` object, with one name appended per line.
    """
    cat_list.extend(raw.strip().partition(' ')[0] for raw in lines)
    return cat_list

#Function to convert categories (key) & pages in each category (value) into a dictionary
def lines_to_dict(dictionary, lines):
    """Fill ``dictionary`` with ``category -> [page ids]`` parsed from ``lines``.

    Each line is stripped and split on single spaces. The first token is the
    category name (the key); every remaining token is parsed as an integer
    page id.

    Args:
        dictionary: Mapping that receives the entries; it is modified in place.
            A category that appears on several lines keeps the last line's ids.
        lines: Iterable of text lines, one category per line.

    Returns:
        The same ``dictionary`` object.

    Raises:
        ValueError: If a page-id token is not a valid integer.
    """
    for line in lines:
        key, *tokens = line.strip().split(' ')
        # int() instead of eval(): the tokens come straight from a data file,
        # so they must never be executed as Python. int() also accepts
        # zero-padded ids, which eval() rejects as a syntax error.
        # Empty tokens (produced by repeated spaces) are skipped.
        dictionary[key] = [int(token) for token in tokens if token]

    return dictionary

def make_catlist_catdict(topcats_categories):
    """Build the category name list and the category -> pages dictionary.

    All lines are read from ``topcats_categories`` once and handed to
    ``all_categories`` and then to ``lines_to_dict``.

    Args:
        topcats_categories: Object with a ``readlines()`` method; the startup
            cell passes the open 'wiki-topcats-categories.txt' handle.

    Returns:
        A ``(cat_list, cat_dict)`` tuple: the list filled by
        ``all_categories`` and the dictionary filled by ``lines_to_dict``.
    """
    raw_lines = topcats_categories.readlines()
    names = all_categories([], raw_lines)
    pages_by_category = lines_to_dict({}, raw_lines)
    return names, pages_by_category

#Only necessary if you want to see the page names/articles
def get_pages(topcats_page_names):
    """Build a dictionary mapping page number to page name.

    Each line is stripped; the text before the first space is converted to
    ``int`` and used as the key, and everything after that first space is the
    value (an empty string when the line has no space).

    Args:
        topcats_page_names: Object with a ``readlines()`` method that yields
            one page per line.

    Returns:
        A dict of ``int`` page number to ``str`` page name.

    Raises:
        ValueError: If the first token of a line is not an integer.
    """
    pages = {}

    for raw in topcats_page_names.readlines():
        page_id, _, title = raw.strip().partition(' ')
        pages[int(page_id)] = title

    return pages

def make_digraph(data):
    """Create a directed graph from an edge list.

    Args:
        data: Iterable of edges passed unchanged to
            ``nx.DiGraph.add_edges_from``; the startup cell passes the array
            returned by ``load_data``.

    Returns:
        A new ``nx.DiGraph`` containing those edges.
    """
    graph = nx.DiGraph()
    graph.add_edges_from(data)
    return graph

#Returns the indices of two random categories
def get_rand_cats(cat_list):
    """Pick two random indices into ``cat_list`` and print them.

    The two indices are drawn independently, so they may be equal.

    Args:
        cat_list: Sequence of category names; only its length is used.

    Returns:
        A list of two ints, each in ``range(len(cat_list))``.

    Raises:
        ValueError: If ``cat_list`` is empty (there is no valid index).
    """
    # randrange excludes the upper bound. random.randint(0, len(cat_list))
    # includes it and could therefore return len(cat_list), which is one past
    # the last valid index.
    cat_inds = [random.randrange(len(cat_list)) for _ in range(2)]

    print(cat_inds)
    return cat_inds

#Returns the pages stored for a category in the category dictionary
def get_cat_pages(category, cat_dict):
    """Look up the pages stored under ``category``.

    Args:
        category: Key to look up.
        cat_dict: Mapping of category to its pages.

    Returns:
        The value stored for ``category``, or ``None`` when the key is absent.
    """
    pages = cat_dict.get(category)
    return pages

def checks(data, cat_list, DG, *, expected_edges=28511807,
           expected_categories=17364, expected_nodes=1791489):
    """Verify that the loaded data has the expected sizes, printing the result.

    The checks run in order (edge list, category list, graph) and stop at the
    first mismatch, so later objects are not inspected once a check fails.

    Args:
        data: Loaded edge list; ``len(data)`` is compared to ``expected_edges``.
        cat_list: Category name list; its length is compared to
            ``expected_categories``.
        DG: Graph object exposing ``number_of_nodes()``; the result is
            compared to ``expected_nodes``.
        expected_edges: Expected number of rows in ``data``.
        expected_categories: Expected number of categories.
        expected_nodes: Expected number of graph nodes.
            The three defaults are the values this notebook has always
            checked against; override them to validate a different dataset.

    Returns:
        ``True`` when every check passes, ``False`` at the first failure.
    """
    if len(data) != expected_edges:
        print('Data Transferred Incorrectly\n')
        return False
    print('Data Transferred Correctly\n')

    if len(cat_list) != expected_categories:
        print('Category List Transferred Incorrectly\n')
        return False
    print('Category List Transferred Correctly\n')

    if DG.number_of_nodes() != expected_nodes:
        print('DiGraph Made Incorrectly\n')
        return False
    print('DiGraph Made Correctly\n')

    print('All Check Passed')
    return True

In [None]:
#Start Up - will take a few minutes

#Imports
import pandas as pd
import numpy as np
import networkx as nx
import random

#Files: open handles for the edge list, the page names, and the categories
#(nothing in this cell closes them; topcats_page_names is not read here)
topcats, topcats_page_names, topcats_categories = get_files()
#Data/Edge List: parsed from the edge-list handle
data = load_data(topcats)
#Category Name List & Category Page Dictionary, both built from the categories handle
cat_list, cat_dict = make_catlist_catdict(topcats_categories)
#Make Graph: directed graph built from the rows of data
DG = make_digraph(data)
#Checks: prints whether the edge, category, and node counts match the expected values
checks(data, cat_list, DG)

In [None]:
#Path Functions

#Returns length of the shortest path between Node 1 and Node 2
def shortest_path_nodes(DG, node1, node2):
    """Return the length, in edges, of the shortest path from node1 to node2.

    Args:
        DG: networkx graph to search.
        node1: Source node.
        node2: Target node.

    Returns:
        The number of edges on a shortest path (0 when both nodes are the
        same).

    Raises:
        networkx.NetworkXNoPath: If ``node2`` cannot be reached from ``node1``.
        networkx.NodeNotFound: If either node is not in ``DG``.
    """
    # shortest_path_length counts edges. len(nx.shortest_path(...)) counts the
    # nodes on the path and is therefore one larger than the path length.
    return nx.shortest_path_length(DG, node1, node2)

#Returns Average Shortest Path Length of One Page from the First Category to Every Page From Another Category
def avg_shortest_path_nodes(DG, cat1_page, cat2):
    """Average the shortest-path values from one page to a whole category.

    The target pages are looked up through the module-level ``cat_list`` and
    ``cat_dict``: ``cat_dict.get(cat_list[cat2])``.

    Args:
        DG: Graph passed through to ``shortest_path_nodes``.
        cat1_page: Source page (node) from the first category.
        cat2: Index into ``cat_list`` of the target category.

    Returns:
        ``np.average`` of the values ``shortest_path_nodes`` returns for
        ``cat1_page`` paired with each page of the target category.
    """
    targets = cat_dict.get(cat_list[cat2])
    lengths = [shortest_path_nodes(DG, cat1_page, target) for target in targets]
    return np.average(np.array(lengths))

#Returns the Average Shortest Path Length Between Two Categories
def avg_shortest_path_cats(DG, cat1, cat2):
    """Average, over every page of cat1, its average path value to cat2.

    The source pages are looked up through the module-level ``cat_list`` and
    ``cat_dict``: ``cat_dict.get(cat_list[cat1])``.

    Args:
        DG: Graph passed through to ``avg_shortest_path_nodes``.
        cat1: Index into ``cat_list`` of the source category.
        cat2: Index into ``cat_list`` of the target category.

    Returns:
        ``np.average`` of the per-page averages returned by
        ``avg_shortest_path_nodes``.
    """
    sources = cat_dict.get(cat_list[cat1])
    per_page = [avg_shortest_path_nodes(DG, source, cat2) for source in sources]
    return np.average(np.array(per_page))

In [None]:
def find_correlations(DG, cat1, cat_list):
    """Compare one category against every other category.

    Args:
        DG: Graph passed through to ``avg_shortest_path_cats``.
        cat1: Index into ``cat_list`` of the input category.
        cat_list: List of category names.

    Returns:
        A dict with one entry per category other than ``cat1``. Each key
        names the input category and the category it is compared to; each
        value is the average shortest path between the two categories as
        returned by ``avg_shortest_path_cats``.
    """
    correlations = {}

    for cat2, cat2_name in enumerate(cat_list):
        #Does not compare the category against itself
        if cat2 == cat1:
            continue

        label = 'Category 1: ' + cat_list[cat1] + '-> Category 2: ' + cat2_name
        correlations[label] = avg_shortest_path_cats(DG, cat1, cat2)

    return correlations

In [None]:
#Draw two random indices into cat_list (get_rand_cats also prints them)
cats = get_rand_cats(cat_list)

cat1 = cats[0]
cat2 = cats[1]

#Show each chosen index, its category name, and the pages stored for it in cat_dict
print(f'{cat1}:{cat_list[cat1]}\nPages:{cat_dict.get(cat_list[cat1])}')
print()
print(f'{cat2}:{cat_list[cat2]}\nPages:{cat_dict.get(cat_list[cat2])}')

In [None]:
# correlations = find_correlations(DG, cat1, cat_list)
# correlations

In [None]:
#Min Correlation = Largest Number, meaning largest average shortest path between two categories
#Max Correlation = Smallest Number, meaning smallest average shortest path between two categories

def min_max_correlation(DG, cat1, cat_list):
    """Print the least and the most correlated category relative to ``cat1``.

    Correlation is the inverse of distance here: the pair with the largest
    average shortest path is the minimum correlation, and the pair with the
    smallest average shortest path is the maximum correlation. When several
    pairs tie, the first one in the dictionary is reported.

    Args:
        DG: Graph passed through to ``find_correlations``.
        cat1: Index into ``cat_list`` of the input category.
        cat_list: List of category names.

    Returns:
        None. The result is printed.
    """
    correlations = find_correlations(DG, cat1, cat_list)

    #Nothing to report when there is no other category to compare against
    if not correlations:
        print('No other categories to compare against')
        return

    #max()/min() each scan every entry. A single if/elif pass skips the
    #"smallest" test for any entry that just became the largest, which can
    #miss the true smallest value or leave its key unassigned.
    min_key = max(correlations, key=correlations.get)
    max_key = min(correlations, key=correlations.get)

    print(f'Minimum Correlated Categories: {min_key}\nAverage Shortest Path Between Categories: {correlations[min_key]}')
    print(f'Maximum Correlated Categories: {max_key}\nAverage Shortest Path Between Categories: {correlations[max_key]}')

In [None]:
def every_cat_correlations(DG, cat_list):
    
    #Every i becomes a number, number represents index of cat_list, cat1 becomes that number
    for i in range(len(cat_list)):
        cat1 = i
        
 