In [1]:
import os
import random
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from utils import loadWord2Vec, clean_str
from math import log
from sklearn import svm
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
from scipy.spatial.distance import cosine


In [2]:
line_numbers = [30678, 110533, 142793, 72556, 27639, 94748, 18254, 169291, 47676, 160940, 147744, 15127, 78749, 59829, 167317, 73056, 131571, 24531, 118370, 148336, 76346, 165969, 30699, 114240, 32867, 120188, 44423, 106717, 19155, 132223, 12769, 23208, 70517, 75990, 65828, 101146, 27107, 70210, 4574, 89551, 27173, 49976, 149660, 1760, 69609, 49318, 124182, 94595, 123304, 105721, 86350, 159541, 79606, 123028, 107095, 150414, 4310, 64724, 7650, 126763, 135611, 65948, 114278, 94247, 142386, 168140, 128973, 121514, 130405, 34315, 143244, 3719, 61959, 112651, 38171, 148259, 113456, 112232, 73525, 7920, 36853, 116489, 84397, 136277, 123428, 76576, 5151, 56941, 124699, 12533, 18178, 19649, 100381, 78313, 70090, 155329, 135682, 53481, 151208, 130240, 27646, 28675, 79973, 157667, 43100, 53963, 78501, 166603, 151739, 110517, 35089, 132575, 138224, 59489, 90720, 61597, 78225, 10642, 107487, 140943, 126687, 104644, 942, 132278, 85628, 160146, 161788, 39811, 37744, 48316, 985, 152908, 122448, 69097, 16642, 96790, 122168, 524, 74776, 58863, 54978, 62994, 50323, 162333, 96881, 57265, 16134, 20789, 62002, 105838, 152153, 145323, 93092, 116132, 115863, 147098, 123496, 116418, 55670, 120817, 157032, 80543, 158796, 101307, 154457, 151534, 54017, 82708, 117886, 155004, 77893, 15982, 6523, 58206, 103852, 29108, 58625, 94397, 18728, 99859, 62715, 52965, 58689, 121963, 94897, 53898, 81512, 136066, 49889, 149082, 40152, 58837, 131041, 50988, 74057, 27764, 44475, 38533, 138289, 84064, 102196, 109978, 140064, 47479, 90109, 127680, 93724, 49337, 147557, 53527, 128373, 117244, 70030, 83089, 52550, 140977, 114142, 71213, 127228, 25843, 152047, 13642, 39628, 105903, 89399, 70084, 137118, 158140, 53058, 79644, 66067, 107486, 103481, 129807, 87579, 31972, 160747, 92502, 35578, 53687, 111713, 115930, 49946, 102708, 92258, 104793, 18851, 75317, 51949, 78772, 94867, 47699, 27750, 129256, 142778, 165890, 80245, 138370, 162653, 80163, 97246, 84930, 51240, 63005, 73181, 162873, 31275, 52482, 8181, 
133766, 16563, 40273, 22473, 120420, 66459, 22057, 110421, 4386, 129445, 7731, 111820, 118364, 19335, 110146, 117208, 84964, 115633, 17821, 57602, 104389, 38097, 32708, 1940, 21765, 66463, 11322, 114730, 141416, 37185, 111174, 151975, 73360, 10505, 161806, 109711, 150952, 126579, 143405, 31718, 159063, 40738, 3444, 89731, 156970, 38965, 120699, 126218, 113026, 48015, 126121, 120351, 86867, 45357, 113806, 56239, 57148, 119563, 160956, 110548, 14329, 141398, 81217, 138079, 159935, 72031, 119773, 34934, 130674, 5133, 51161, 135554, 8974, 156692, 105032, 48168, 117920, 150380, 31060, 121244, 17846, 104319, 106571, 139625, 17689, 59525, 76775, 30207, 13589, 112158, 96661, 128351, 11309, 92158, 12069, 122006, 41350, 117058, 70012, 150684, 19909, 93258, 141485, 137843, 134985, 153276, 84784, 84805, 95727, 24052, 101626, 22299, 8560, 113108, 65544, 71881, 11888, 71135, 24534, 61945, 39243, 90760, 88081, 109415, 2266, 137984, 108391, 100628, 106410, 146810, 31981, 54712, 144855, 162410, 59210, 49629, 21667, 37301, 108070, 63250, 26566, 30367, 64786, 82006, 19291, 23296, 35682, 159070, 20992, 73269, 138294, 119400, 148263, 166258, 68904, 76733, 56974, 143727, 39279, 109316, 164966, 146212, 48376, 132583, 31246, 80483, 166422, 17722, 130626, 110627, 135333, 23544, 156794, 83462, 114633, 59113, 32585, 86595, 109005, 101416, 41735, 43606, 549, 118645, 127733, 119015, 162556, 106641, 88339, 41887, 23722, 93985, 52222, 20946, 69845, 99136, 13277, 42912, 6571, 149813, 166004, 90832, 72292, 100274, 77171, 108849, 31617, 148612, 80328, 66351, 150962, 57683, 29517, 116579, 3499, 54173, 46024, 165345, 98562, 47297, 147219, 44065, 94530, 116856, 35226, 154686, 93235, 21159, 24034, 138048, 75731, 133355, 49697, 58760, 149248, 94908, 25532, 27460, 103466, 105966, 137628, 38619, 60623, 124177, 47753, 41054, 28387, 49755, 38844, 45018, 104682, 43354, 8635, 103029, 75635, 63071, 99741, 69276, 96445, 159806, 168120, 69646, 27017, 5433, 61270, 21658, 67859, 101960, 6852, 8043, 4213, 132016, 
166854, 12421, 62468, 116552, 103631, 58902, 31871, 93463, 110023, 53033, 116543, 5008, 53472, 138326, 19934, 14665, 104514, 7684, 43672, 93638, 36960, 22043, 155881, 17349, 114984, 52214, 7774, 56961, 79587, 82866, 136818, 118221, 83351, 90404, 6224, 132913, 94511, 58820, 5838, 128425, 82463, 76730, 50824, 159292, 72199, 91175, 77086, 169227, 140834, 20843, 109400, 73219, 85387, 95050, 80883, 54404, 1868, 68150, 128840, 85706, 82428, 140107, 149630, 6871, 46628, 147416, 126403, 125589, 111473, 118468, 138159, 84762, 26394, 116209, 56213, 154427, 113006, 52360, 54202, 8813, 133122, 133835, 90015, 11491, 34938, 129731, 102046, 161229, 34353, 156672, 131278, 86, 144056, 114050, 5980, 39648, 16151, 54779, 3508, 47576, 91046, 116140, 150372, 47284, 33964, 139780, 97493, 132476, 72709, 80390, 71801, 116465, 79318, 144182, 37531, 150148, 32311, 2829, 95747, 101713, 37683, 146339, 159589, 51438, 95230, 163657, 147892, 114187, 144894, 125855, 50746, 33419, 145894, 57745, 2328, 87991, 28374, 89657, 124441, 104479, 11393, 116221, 35517, 69993, 158190, 57610, 57835, 62503, 61953, 22585, 20808, 59276, 68177, 164469, 88875, 18339, 70305, 167877, 47169, 59916, 142723, 146549, 41083, 35486, 26920, 49192, 132281, 4846, 35383, 155497, 99786, 74699, 32146, 274, 21464, 107257, 89835, 16756, 30799, 93143, 113587, 1127, 118139, 17400, 150591, 44786, 20952, 60259, 163534, 131589, 87967, 124208, 14167, 78446, 138881, 26303, 27760, 44641, 116403, 87921, 57832, 95430, 29952, 16471, 49184, 132119, 108557, 71537, 77242, 76300, 148957, 15392, 81570, 113734, 43444, 131652, 4254, 20864, 57875, 116360, 107956, 52106, 165947, 48615, 80836, 120094, 73012, 50801, 100177, 48212, 61614, 140426, 93418, 159883, 96255, 129280, 2859, 4099, 37981, 80550, 7375, 55920, 43047, 86754, 163668, 99719, 106351, 28487, 63603, 99533, 21327, 32217, 7289, 112590, 391, 135115, 103594, 120258, 133074, 97267, 137679, 56368, 84151, 158710, 159204, 49119, 123764, 97415, 110159, 106172, 167753, 17603, 77021, 8576, 92344, 
165212, 45424, 162214, 7588, 113811, 41679, 4401, 51082, 41297, 11770, 148768, 83690, 146905, 127106, 109961, 88908, 25528, 13178, 160222, 92957, 107243, 59179, 75080, 62482, 118435, 132644, 38478, 131584, 108343, 110553, 105993, 71736, 71513, 36810, 138758, 145773, 42537, 5075, 160041, 26738, 61440, 71669, 130603, 144751, 153795, 143055, 73606, 153082, 163789, 91998, 53939, 96922, 135341, 25125, 98910, 150401, 25988, 47515, 34369, 50380, 164094, 275, 69598, 117499, 5200, 39204, 84869, 142496, 152556, 115466, 75752, 66017, 79944, 61009, 80717, 38632, 115446, 37270, 72359, 50060, 146167, 112221, 118904, 128534, 163351, 131247, 147846, 104201, 7972, 89876, 117021, 48315, 109860, 74066, 36909, 110979, 140448, 21280, 59235, 144918, 18120, 124847, 162247, 45961, 100348, 59175, 14079, 139323, 40348, 130168, 32283, 29845, 74328, 144972, 157668, 122277, 166296, 58939, 168814, 102675, 133531, 30870, 109035, 168176, 29466, 66717, 83591, 54120, 162344, 79788, 27248, 113594, 117928, 161569, 71658, 46344, 36540, 7039, 130070, 144759, 156134, 101407, 37281, 129992, 91562, 29195, 101246, 127219, 88106, 153295, 154336, 162031, 150051, 65470, 67594, 22581, 163820, 142078, 110773, 87409, 146180, 58424, 61049, 141686, 155201, 72978, 112239, 110315, 93490, 18665, 164364, 127365, 78934, 120119, 7715, 73584, 135091, 53201, 4527, 107590, 80070, 39682, 118864, 40687, 3739, 73091, 94440, 151414, 127604, 40428, 155272, 142023, 15729, 42530, 147584, 39042, 143064, 136202, 1501, 95524, 115981, 64137, 116411, 84590, 129595, 9652, 25303, 85960, 60079, 162750, 145131, 2272, 4939, 16397, 29686, 74314, 158876, 8756, 58592, 113072, 20771, 28404, 168099, 92508, 123958, 116198, 61573, 143495, 146927, 47557, 158973, 17257, 29789, 25778, 55796, 18377, 154387, 154840, 165105, 118559, 14673, 38040, 99328, 97850, 139335, 3643, 67078, 102739, 23635, 100897, 139110, 66346, 117012, 87820, 978, 10122, 29210, 157615, 123481, 91221, 91902, 58106, 1106, 107031, 4514, 115329, 130905, 26671, 9366, 71799, 103728, 
63971, 98477, 122799, 59010, 103570, 158889, 55883, 148383, 18361, 128313, 26697, 160526, 113354, 80400, 90060, 107030, 93580, 33763, 74561, 124777, 5723, 113820, 142670, 13614, 58387, 20621, 56513, 30436, 129516, 86761, 165812, 24597, 102961, 116756, 11922, 45736, 128536, 76307, 26300, 107246, 41583, 141317, 109212, 162922, 23200, 115211, 166664, 59394, 23305, 143114, 141613, 139322, 167474, 118081, 2206, 78879, 23666, 32632, 96356, 820, 115396, 16576, 19933, 26025, 149518, 38396, 73785, 207, 141776, 13565, 140737, 145619, 70401, 153189, 146364, 147848, 165407, 32506, 120666, 19831, 33491, 100868, 103388, 23147, 40287, 153408, 82258, 94023, 39431, 138497, 161444, 112672, 65590, 162260, 58637, 89100, 119063, 108610, 119526, 19850, 58515, 32962, 81943, 46205, 121017, 84479, 15328, 151502, 93077, 89561, 128028, 58821, 149397, 100156, 51112, 107742, 82415, 39493, 34491, 167476, 47519, 130695, 124485, 121262, 84672, 38610, 2228, 139324, 22695, 81, 116725, 90580, 120422, 117598, 166612, 16329, 100063, 63302, 29533, 32950, 61579, 27156, 164452, 157142, 50625, 169038, 99722, 118907, 69759, 168013, 62476, 145957, 22376, 12154, 106367, 101657, 101645, 162446, 40923, 131467, 135721, 30724, 59193, 132623, 161042, 103922, 138068, 143872, 87032, 165032, 77903, 84639, 163652, 143476, 111617, 1321, 51153, 18235, 151203, 133022, 109828, 79336, 141367, 46290, 154869, 26024, 62411, 117836, 156609, 2964, 156647, 94396, 136380, 103382, 162384, 54540, 93832, 44397, 144892, 96611, 32442, 132497, 71167, 15959, 101754, 105350, 107204, 110675, 132316, 80310, 69148, 42616, 87460, 46045, 107292, 149685, 138471, 18776, 95959, 99774, 25176, 62191, 15443, 152511, 165683, 145215, 69378, 696, 26034, 58597, 25078, 95220, 162935, 76359, 139112, 76433, 39892, 81639, 58673, 113043, 69093, 108906, 146710, 157608, 121944, 47067, 17530, 35034, 96853, 18856, 11635, 129401, 134013, 91310, 62240, 77845, 75077, 3240, 38607, 131751, 90603, 168742, 108335, 107365, 52867, 1457, 168879, 141538, 15942, 101018, 
164890, 96328, 164339, 26098, 101174, 6453, 29387, 144083, 159026, 17914, 167742, 42087, 165845, 62642, 39976, 11273, 111683, 89967, 90536, 22041, 162349, 163436, 120778, 127815, 27735, 112067, 75903, 148479, 60671, 122034, 106582, 35562, 31306, 163315, 131934, 80037, 150631, 63985, 102464, 8806, 164730, 146766, 48166, 44797, 5139, 18097, 81990, 141095, 25880, 161786, 128797, 133206, 98096, 106265, 46954, 51273, 167630, 57482, 40568, 62013, 128127, 33330, 103283, 135370, 122568, 149224, 150228, 115754, 81557, 157238, 5263, 65208, 13145, 147803, 114059, 20982, 39597, 166277, 119018, 79321, 95570, 60130, 164942, 124523, 34307, 81500, 56953, 167399, 4003, 49285, 133161, 47840, 50425, 142233, 158734, 154722, 46914, 22697, 164004, 148862, 26934, 30188, 95846, 11356, 33299, 149527, 160840, 80202, 54709, 155200, 130359, 109072, 145252, 136177, 15670, 61812, 73719, 41399, 130319, 50997, 124562, 100114, 7635, 25583, 146760, 98577, 106545, 7890, 1069, 57630, 108299, 36027, 81598, 73092, 27620, 96660, 28796, 148091, 107500, 163781, 70458, 46963, 114151]

In [3]:
# Turn the 1-based line numbers into 0-based node ids, in ascending order.
node_ids = sorted(line_no - 1 for line_no in line_numbers)

In [4]:
# Sanity check: show the five smallest zero-based node ids.
print(node_ids[:5])

[80, 85, 206, 273, 274]


In [5]:

word_embeddings_dim = 300
# Word -> vector lookup; it is created empty here and no cell shown in this
# notebook fills it, so every "word in word_vector_map" test below is False.
word_vector_map = {}

# shuffling
doc_name_list = []
doc_train_list = []
doc_test_list = []

# A set gives O(1) membership tests; checking against the node_ids list for
# every line of the label file would be quadratic.
selected_ids = set(node_ids)

# Process only the lines with the given indices.
# Each kept line is tab-separated; field 1 holds the split name, which is
# matched by substring ('test' / 'train'). Other splits (e.g. 'val') are kept
# in doc_name_list only.
with open('data/all_labels.txt', 'r') as f:
    for i, line in enumerate(f):
        if i not in selected_ids:
            continue
        entry = line.strip()
        doc_name_list.append(entry)
        split_name = line.split("\t")[1]
        if 'test' in split_name:
            doc_test_list.append(entry)
        elif 'train' in split_name:
            doc_train_list.append(entry)

print(doc_name_list[::500])
print(doc_train_list[::500])
print(doc_test_list[::500])

['80\ttrain\t28', '57601\ttrain\t16', '113455\tval\t2']
['80\ttrain\t28', '80069\ttrain\t24', '162383\ttrain\t27']
['206\ttest\t26']


In [6]:

# Load the cleaned corpus, one stripped entry per line of the file.
# The context manager guarantees the handle is closed even if reading fails.
with open('data/corpus/clean_1500.txt', 'r') as f:
    doc_content_list = [line.strip() for line in f]
print(doc_content_list[::500])


['late weak markov automata weak bisimilarity distribution based equivalence notion markov automata gained popularity reasonable behavioural equivalence markov automata paper studies strictly notion late weak bisimilarity enjoys valuable properties important subclasses trace distribution equivalence partial information , compositionality preserved distributed intersection two scheduler classes thus still reasonable compositional theory markov automata', 'low delay multi party solution paper , attempt revisit problem multi party practical perspective , design space involved problem believe emphasis low end end delays two parties must , source rate session adapt bandwidth availability congestion present , multi party solution specifically designed achieve objectives entirely peer peer \\( \\) , eliminating cost maintaining servers designed deliver video low end end delays , quality levels available network resources arbitrary network topologies network contrast commonly assumed scenarios

In [7]:
# doc_content_list is already loaded by the previous cell; the identical
# re-read that used to sit here was redundant and has been dropped.

# Position of every document inside doc_name_list. setdefault keeps the first
# occurrence, which is what list.index() would return, without the repeated
# linear scans.
name_to_pos = {}
for pos, name in enumerate(doc_name_list):
    name_to_pos.setdefault(name, pos)

# NOTE(review): no random seed is set in the cells shown, so the shuffled
# order (and therefore the index files written below) differs on every run.
train_ids = [name_to_pos[train_name] for train_name in doc_train_list]
print(train_ids)
random.shuffle(train_ids)

# partial labeled data
#train_ids = train_ids[:int(0.2 * len(train_ids))]

train_ids_str = '\n'.join(str(index) for index in train_ids)
# NOTE(review): this path has no 'ind.' prefix while the test index below
# does -- confirm the downstream loader expects both names as written.
with open('data/1500.train.index', 'w') as f:
    f.write(train_ids_str)

test_ids = [name_to_pos[test_name] for test_name in doc_test_list]
print(test_ids)
random.shuffle(test_ids)

test_ids_str = '\n'.join(str(index) for index in test_ids)
with open('data/ind.1500.test.index', 'w') as f:
    f.write(test_ids_str)

# Final document order: all (shuffled) train docs first, then the test docs.
ids = train_ids + test_ids
print(ids)
print(len(ids))

[0, 1, 3, 4, 5, 6, 9, 10, 12, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26, 27, 29, 30, 32, 33, 35, 36, 37, 38, 40, 41, 42, 43, 44, 47, 50, 51, 52, 54, 55, 58, 59, 60, 62, 63, 66, 67, 68, 69, 71, 72, 73, 74, 75, 77, 78, 79, 80, 82, 84, 90, 91, 92, 93, 96, 97, 98, 99, 100, 101, 102, 103, 105, 106, 107, 108, 109, 110, 113, 114, 115, 117, 118, 119, 120, 121, 122, 124, 125, 126, 127, 128, 130, 131, 132, 133, 134, 135, 136, 137, 140, 141, 143, 144, 145, 146, 147, 149, 150, 151, 152, 153, 154, 155, 156, 159, 161, 162, 164, 165, 167, 168, 169, 170, 171, 172, 173, 174, 175, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189, 190, 191, 192, 193, 194, 195, 197, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 210, 211, 212, 216, 217, 218, 219, 220, 221, 222, 224, 226, 227, 228, 229, 230, 232, 233, 234, 235, 236, 239, 240, 245, 246, 247, 249, 250, 253, 254, 255, 256, 258, 259, 261, 262, 264, 265, 266, 267, 268, 269, 270, 271, 274, 276, 277, 281, 282, 283, 284, 285, 286, 289, 290, 292, 293, 294, 295

In [8]:
# Reorder names and contents into the train-then-test order held in `ids`.
shuffle_doc_name_list = [doc_name_list[int(doc_id)] for doc_id in ids]
shuffle_doc_words_list = [doc_content_list[int(doc_id)] for doc_id in ids]
shuffle_doc_name_str = '\n'.join(shuffle_doc_name_list)
shuffle_doc_words_str = '\n'.join(shuffle_doc_words_list)

# Write the complete name listing. The previous `[::500]` slice was applied to
# the joined *string*, so only every 500th character reached the file.
with open('data/shuffle_1500.txt', 'w') as f:
    f.write(shuffle_doc_name_str)

with open('data/corpus/shuffle_1500.txt', 'w') as f:
    f.write(shuffle_doc_words_str)

# Preview only: every 500th entry of the list (not of the string).
print(shuffle_doc_name_list[::500])

['119525\ttrain\t28', '112157\ttrain\t28', '7773\ttrain\t14']


In [9]:

# build vocab: corpus-wide occurrence counts plus the set of distinct words
word_freq = {}
word_set = set()
for doc_words in shuffle_doc_words_list:
    for token in doc_words.split():
        word_set.add(token)
        word_freq[token] = word_freq.get(token, 0) + 1

# The position of a word in `vocab` is used as its word id by later cells.
vocab = list(word_set)
vocab_size = len(vocab)
vocab

['analytical',
 'populations',
 'formulas',
 'lower',
 'fully',
 'suited',
 'added',
 'intuitive',
 'secrecy',
 'pm',
 'functional',
 'final',
 'device',
 'ctl',
 'species',
 'pp',
 'programme',
 'explanations',
 'smooth',
 'minimize',
 'overlay',
 'transactions',
 'riemannian',
 'evaluate',
 'encountered',
 'tagging',
 'public',
 'navigation',
 'comprehension',
 'categories',
 'substantial',
 'scalable',
 'generates',
 'intelligence',
 'incomplete',
 'cyber',
 'weight',
 'justified',
 'dyads',
 'responsibility',
 'lfloor',
 'intra',
 'detection',
 'fourier',
 'viseme',
 'devised',
 'converging',
 'observer',
 'details',
 'disparity',
 'allocated',
 'incentive',
 'slight',
 'due',
 'surprisingly',
 '5x',
 'allows',
 'median',
 'predicting',
 'drift',
 'states',
 'nonconvex',
 'discovered',
 'rotation',
 'illustrate',
 'lifted',
 'cost',
 'smallest',
 'instances',
 'enforce',
 'quantitatively',
 'separable',
 'stochasticity',
 'reducing',
 'interactive',
 'distortions',
 'city',
 'tml',

In [10]:

# word -> list of document indices (in shuffled order) that contain the word.
word_doc_list = {}
for doc_idx, doc_words in enumerate(shuffle_doc_words_list):
    # dict.fromkeys de-duplicates the words of one document while keeping
    # first-appearance order, so each document is recorded once per word.
    for word in dict.fromkeys(doc_words.split()):
        word_doc_list.setdefault(word, []).append(doc_idx)

# word -> number of documents containing it (document frequency).
word_doc_freq = {word: len(doc_list) for word, doc_list in word_doc_list.items()}

# word -> integer id, i.e. its position in `vocab`.
word_id_map = {word: idx for idx, word in enumerate(vocab)}

vocab_str = '\n'.join(vocab)

# Persist the vocabulary, one word per line, in id order.
with open('data/corpus/vocab_1500.txt', 'w') as f:
    f.write(vocab_str)

In [11]:
# label list: distinct values of the third tab-separated field of each entry
label_set = {doc_meta.split('\t')[2] for doc_meta in shuffle_doc_name_list}
# The position of a label in this list is its one-hot column in later cells.
# NOTE(review): iteration order of a set of strings can change between
# interpreter runs (hash randomization), so the column order is only
# guaranteed to match the file written below within the same run.
label_list = list(label_set)

label_list_str = '\n'.join(label_list)
with open('data/corpus/labels_1500.txt', 'w') as f:
    f.write(label_list_str)

In [12]:
# Hold out 10% (rounded down) of the training documents; the remainder is the
# "real" training set used for x / y below.
train_size = len(train_ids)
val_size = int(0.1 * train_size)
real_train_size = train_size - val_size  # - int(0.5 * train_size)
# different training rates

# The real training docs are the first `real_train_size` entries of the
# shuffled list.
real_train_doc_names = shuffle_doc_name_list[:real_train_size]
real_train_doc_names_str = '\n'.join(real_train_doc_names)

with open('data/real_train_1500.name', 'w') as f:
    f.write(real_train_doc_names_str)

In [13]:

def _mean_doc_vectors(doc_word_strings):
    """Build COO triples holding one mean word-vector row per document.

    Args:
        doc_word_strings: sequence of whitespace-separated document strings.

    Returns:
        (data, rows, cols) lists with `word_embeddings_dim` entries per
        document, in row-major order. Words absent from `word_vector_map`
        contribute nothing to the sum but still count towards the length.
    """
    data, rows, cols = [], [], []
    for row_idx, doc_words in enumerate(doc_word_strings):
        words = doc_words.split()
        doc_vec = np.zeros(word_embeddings_dim)
        for word in words:
            if word in word_vector_map:
                doc_vec = doc_vec + np.array(word_vector_map[word])
        # An empty document would otherwise compute 0.0 / 0 and store NaN.
        doc_len = len(words) or 1
        for j in range(word_embeddings_dim):
            rows.append(row_idx)
            cols.append(j)
            data.append(doc_vec[j] / doc_len)
    return data, rows, cols


def _one_hot_labels(doc_metas):
    """Return an array of one-hot label rows for tab-separated doc entries.

    The label is the third tab-separated field; its column is the label's
    position in `label_list`.
    """
    label_pos = {label: idx for idx, label in enumerate(label_list)}
    num_labels = len(label_list)
    encoded = []
    for doc_meta in doc_metas:
        one_hot = [0] * num_labels
        one_hot[label_pos[doc_meta.split('\t')[2]]] = 1
        encoded.append(one_hot)
    return np.array(encoded)


# x / y: features and labels of the real training docs (validation excluded).
data_x, row_x, col_x = _mean_doc_vectors(
    shuffle_doc_words_list[:real_train_size])

# x = sp.csr_matrix((real_train_size, word_embeddings_dim), dtype=np.float32)
x = sp.csr_matrix((data_x, (row_x, col_x)), shape=(
    real_train_size, word_embeddings_dim))

y = _one_hot_labels(shuffle_doc_name_list[:real_train_size])
print(y)

# tx: feature vectors of test docs, no initial features
test_size = len(test_ids)
# Test docs sit directly after the train_size training docs in shuffled order.
test_slice = slice(train_size, train_size + test_size)

data_tx, row_tx, col_tx = _mean_doc_vectors(shuffle_doc_words_list[test_slice])

# tx = sp.csr_matrix((test_size, word_embeddings_dim), dtype=np.float32)
tx = sp.csr_matrix((data_tx, (row_tx, col_tx)),
                   shape=(test_size, word_embeddings_dim))

ty = _one_hot_labels(shuffle_doc_name_list[test_slice])
print(ty)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [14]:
# Word-node features: small random vectors, replaced by the entry from
# word_vector_map for any word that has one.
# NOTE(review): no NumPy seed is set in the cells shown, so these values
# differ between runs.
word_vectors = np.random.uniform(-0.01, 0.01,
                                 (vocab_size, word_embeddings_dim))

for i, word in enumerate(vocab):
    if word in word_vector_map:
        word_vectors[i] = word_vector_map[word]

# Document-node features for ALL training docs (validation included): the
# mean of the available word vectors of each document.
doc_features = np.zeros((train_size, word_embeddings_dim))
for i in range(train_size):
    words = shuffle_doc_words_list[i].split()
    doc_vec = np.zeros(word_embeddings_dim)
    for word in words:
        if word in word_vector_map:
            doc_vec = doc_vec + np.array(word_vector_map[word])
    # An empty document would otherwise compute 0.0 / 0 and store NaN.
    doc_features[i] = doc_vec / (len(words) or 1)

# Rows [0, train_size) are documents, rows [train_size, train_size +
# vocab_size) are words. The triples enumerate every cell in row-major order,
# exactly as the element-by-element loops did.
num_allx_rows = train_size + vocab_size
dense_allx = np.vstack([doc_features, word_vectors])

row_allx = np.repeat(np.arange(num_allx_rows), word_embeddings_dim)
col_allx = np.tile(np.arange(word_embeddings_dim), num_allx_rows)
data_allx = dense_allx.ravel()

allx = sp.csr_matrix(
    (data_allx, (row_allx, col_allx)), shape=(train_size + vocab_size, word_embeddings_dim))

# Labels: one-hot rows for the training docs, all-zero rows for word nodes.
label_pos = {label: idx for idx, label in enumerate(label_list)}
num_labels = len(label_list)

ally = []
for i in range(train_size):
    one_hot = [0] * num_labels
    one_hot[label_pos[shuffle_doc_name_list[i].split('\t')[2]]] = 1
    ally.append(one_hot)

for i in range(vocab_size):
    ally.append([0] * num_labels)

ally = np.array(ally)

print(x.shape, y.shape, tx.shape, ty.shape, allx.shape, ally.shape)



(944, 300) (944, 38) (228, 300) (228, 38) (5780, 300) (5780, 38)


In [15]:

# allx: the feature vectors of both labeled and unlabeled training instances
# (a superset of x)
# unlabeled training instances -> words

'''
Doc word heterogeneous graph
'''

# word co-occurrence with context windows
window_size = 20
windows = []

for doc_words in shuffle_doc_words_list:
    tokens = doc_words.split()
    if len(tokens) <= window_size:
        # Short document: the whole document is a single window.
        windows.append(tokens)
        continue
    # Otherwise slide a fixed-size window one token at a time.
    windows.extend(tokens[start:start + window_size]
                   for start in range(len(tokens) - window_size + 1))

# Number of windows each word occurs in (counted once per window).
word_window_freq = {}
for window in windows:
    for token in dict.fromkeys(window):
        word_window_freq[token] = word_window_freq.get(token, 0) + 1

# Co-occurrence counts keyed by "id_a,id_b"; every unordered pair of
# positions in a window is counted in both directions.
word_pair_count = {}
for window in windows:
    window_ids = [word_id_map[token] for token in window]
    for later in range(1, len(window_ids)):
        id_later = window_ids[later]
        for earlier in range(later):
            id_earlier = window_ids[earlier]
            if id_later == id_earlier:
                continue
            forward_key = str(id_later) + ',' + str(id_earlier)
            word_pair_count[forward_key] = word_pair_count.get(forward_key, 0) + 1
            # two orders
            backward_key = str(id_earlier) + ',' + str(id_later)
            word_pair_count[backward_key] = word_pair_count.get(backward_key, 0) + 1

row = []
col = []
weight = []

# pmi as weights

num_window = len(windows)

for key, count in word_pair_count.items():
    left, right = key.split(',')
    i = int(left)
    j = int(right)
    word_freq_i = word_window_freq[vocab[i]]
    word_freq_j = word_window_freq[vocab[j]]
    pmi = log((1.0 * count / num_window) /
              (1.0 * word_freq_i * word_freq_j/(num_window * num_window)))
    # Only positive PMI becomes an edge.
    if pmi > 0:
        # Word nodes are offset by train_size in the graph's node numbering.
        row.append(train_size + i)
        col.append(train_size + j)
        weight.append(pmi)

In [16]:




# word vector cosine similarity as weights (disabled alternative, kept for
# reference):
#
# for i in range(vocab_size):
#     for j in range(vocab_size):
#         if vocab[i] in word_vector_map and vocab[j] in word_vector_map:
#             vector_i = np.array(word_vector_map[vocab[i]])
#             vector_j = np.array(word_vector_map[vocab[j]])
#             similarity = 1.0 - cosine(vector_i, vector_j)
#             if similarity > 0.9:
#                 print(vocab[i], vocab[j], similarity)
#                 row.append(train_size + i)
#                 col.append(train_size + j)
#                 weight.append(similarity)

# doc word frequency: "doc_id,word_id" -> occurrences of the word in the doc
doc_word_freq = {}

for doc_id, doc_words in enumerate(shuffle_doc_words_list):
    for word in doc_words.split():
        doc_word_str = str(doc_id) + ',' + str(word_id_map[word])
        doc_word_freq[doc_word_str] = doc_word_freq.get(doc_word_str, 0) + 1

# Document -> word edges weighted by TF-IDF. They are collected in their own
# lists instead of being appended to row/col/weight from the PMI cell:
# appending in place made this cell non-idempotent, and since csr_matrix sums
# duplicate (row, col) entries, every re-run inflated the TF-IDF weights.
num_docs = len(shuffle_doc_words_list)
doc_row = []
doc_col = []
doc_weight = []

for i, doc_words in enumerate(shuffle_doc_words_list):
    # Node numbering: training docs, then vocab_size word nodes, then test
    # docs -- so docs at or past train_size are shifted by vocab_size.
    doc_node = i if i < train_size else i + vocab_size
    # One edge per distinct word of the document.
    for word in dict.fromkeys(doc_words.split()):
        j = word_id_map[word]
        freq = doc_word_freq[str(i) + ',' + str(j)]
        idf = log(1.0 * num_docs /
                  word_doc_freq[vocab[j]])
        doc_row.append(doc_node)
        doc_col.append(train_size + j)
        doc_weight.append(freq * idf)

node_size = train_size + vocab_size + test_size
# Word-word (PMI) edges from the previous cell followed by the doc-word edges.
adj = sp.csr_matrix(
    (weight + doc_weight, (row + doc_row, col + doc_col)),
    shape=(node_size, node_size))


In [17]:

# dump objects
# Each object is pickled to "data/ind.<suffix>"; the context manager closes
# the handle even if pickling fails part-way.
# NOTE(review): these paths carry no '1500' tag, unlike
# 'data/ind.1500.test.index' written earlier -- confirm the downstream loader
# expects these names.
for suffix, obj in (
    ('x', x),
    ('y', y),
    ('tx', tx),
    ('ty', ty),
    ('allx', allx),
    ('ally', ally),
    ('adj', adj),
):
    with open("data/ind." + suffix, 'wb') as f:
        pkl.dump(obj, f)