In [1]:
%matplotlib inline
import os
import matplotlib.pyplot as plt
import math
import numpy as np

In [2]:
def draw_multinomial(num_entities, zeta):
    # 
    draws = np.random.multinomial(1, pvals=zeta, size=num_entities)
    z = np.argmax(draws, axis = 1)
    return z

def draw_interaction(eta):
    return np.random.binomial(1, eta)

def draw_system_parameters(config):
    print('drawing wiki parameters')
    systemParams = np.load(config['load_path'], allow_pickle=True)
    eta = systemParams['intMat']
    zeta = systemParams['freqVec']
    return eta, zeta

def create_dataset(num_entities, interactionMatrix, frequencyVector):
    """Samples 100 start points from frequencyVector
    Samples 100 end points from frequencyVector
    Derives edges between them exhaustively from interactionMatrix
    Shuffles and splits into test and train
    Partitions train according to percentage of data observed."""
    dataMatrix = np.empty((num_entities**2, 3))
    startPoints = draw_multinomial(num_entities, frequencyVector)
    endPoints = draw_multinomial(num_entities, frequencyVector)
    dataMatrix[:, :2] = np.array(np.meshgrid(np.arange(num_entities), np.arange(num_entities))).T.reshape(-1,2) # matrix indexing
    dataMatrix = np.int_(dataMatrix)
    for row_num, (i, j, _) in enumerate(dataMatrix):
        classI = startPoints[i]
        classJ = endPoints[j]
        dataMatrix[row_num, -1] = draw_interaction(interactionMatrix[classI, classJ])
    return dataMatrix

def split_dataset(num_entities, dataMatrix, num_splits=10):
    split_size = np.int((num_entities**2) / num_splits)
    print('working with a split of {} size'.format(split_size))
    incomplete = True
    while incomplete:
        try:
            dataPerm = np.random.permutation(dataMatrix)
            testSet = dataPerm[:split_size]
            firstSet = dataPerm[split_size:(2*split_size)]
            assert testSet.shape[0] == split_size, "testSet not split_size rows: {}".format(testSet.shape[0])
            assert firstSet.shape[0] == split_size, "firstSet not split_size rows: {}".format(firstSet.shape[0])
            testElements = np.unique(testSet[:, :2])
            firstElements = np.unique(firstSet[:, :2])
            assert len(testElements) == num_entities, "not enough test entities"
            assert len(firstElements) == num_entities, "not enough first entities"
            assert (testElements == np.arange(num_entities)).all(), "not all elements included in testSet"
            assert (firstElements == np.arange(num_entities)).all(), "not all elements included in firstSet"
            
        except Exception as e:
            print(e)
            continue
        incomplete = False
        
        return dataPerm, split_size

def save_dataset(dataPerm, config, split_size, num_splits=10):
    upper_ranges = [x*split_size for x in np.arange(2, num_splits+1)]
    trainingSplits = [dataPerm[split_size:u] for u in upper_ranges]
    np.save('{}_test.npy'.format(config['save_path']), dataPerm[:split_size])
    for i, d in enumerate(trainingSplits):
        np.save('{}_train_{}.npy'.format(config['save_path'], i), d)
 


In [3]:
num_entities = 100
loadDir = './WIKIdata/matrices2020'
saveDir = './WIKIdata/domains'
domains = os.listdir(loadDir)
print(domains[:10])
for file in domains:
    filename = file.replace('.npz', '')
    if '.DS' in file:
        continue
    try:
        os.makedirs('{}/{}'.format(saveDir, filename))
    except Exception as e:
        print(e)
        
    loadPath = '{}/{}'.format(loadDir, file)
    savePath = '{}/{}/data'.format(saveDir, filename)
    config = {'num_entities': num_entities, 'save_path': savePath, 'load_path': loadPath}
    
    if os.path.exists('{}_test.npy'.format(savePath)):
        continue
    else:
        eta, zeta = draw_system_parameters(config)
        dM = create_dataset(config['num_entities'], eta, zeta)
        dP, split_size = split_dataset(config['num_entities'], dM)
        save_dataset(dP, config, split_size)

['wikipedia_system_681.npz', 'wikipedia_system_695.npz', 'wikipedia_system_118.npz', 'wikipedia_system_642.npz', 'wikipedia_system_124.npz', 'wikipedia_system_130.npz', 'wikipedia_system_656.npz', 'wikipedia_system_483.npz', 'wikipedia_system_497.npz', 'wikipedia_system_468.npz']
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters

drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki p

drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki p

working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a

drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki p

drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki parameters
working with a split of 1000 size
drawing wiki p

In [4]:
# dM = create_dataset(100, np.random.rand(2,2), np.ones((2))/2)
# dP, split_size = split_dataset(100, dM)
# save_dataset(dP, {'save_path': './WIKIdata/testing'}, split_size)

# # i = np.random.choice(np.arange(9))
# # tM = np.load('./WIKIdata/testing_train_{}.npy'.format(i))
# # print(tM.shape)
# # print(tM[:10])