In [1]:
from ogb.nodeproppred import NodePropPredDataset
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
tqdm.pandas()

In [2]:
dataset = NodePropPredDataset(name='ogbn-arxiv', root='./arxiv/')

In [3]:
graph, label = dataset[0] # graph: library-agnostic graph object

In [4]:
split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

In [5]:
nodelabels = pd.Series(label.flatten(), name="label")

In [6]:
nodeid2paperid = pd.read_csv("arxiv/ogbn_arxiv/mapping/nodeidx2paperid.csv", dtype=str)
nodeid2paperid

Unnamed: 0,node idx,paper id
0,0,9657784
1,1,39886162
2,2,116214155
3,3,121432379
4,4,231147053
...,...,...
169338,169338,3011696425
169339,169339,3011708313
169340,169340,3011798063
169341,169341,3012226457


In [7]:
nodeid2paperid2label = pd.merge(nodeid2paperid, nodelabels, left_index=True, right_index=True)
nodeid2paperid2label

Unnamed: 0,node idx,paper id,label
0,0,9657784,4
1,1,39886162,5
2,2,116214155,28
3,3,121432379,8
4,4,231147053,27
...,...,...,...
169338,169338,3011696425,4
169339,169339,3011708313,24
169340,169340,3011798063,10
169341,169341,3012226457,4


In [8]:
paperid2text = pd.read_csv("arxiv/titleabs.tsv", sep="\t", dtype=str, names=["paper id", "title", "abstract"])
paperid2text

Unnamed: 0,paper id,title,abstract
0,200971,ontology as a source for rule generation,This paper discloses the potential of OWL (Web...
1,549074,a novel methodology for thermal analysis a 3 d...,The semiconductor industry is reaching a fasci...
2,630234,spreadsheets on the move an evaluation of mobi...,The power of mobile devices has increased dram...
3,803423,multi view metric learning for multi view vide...,Traditional methods on video summarization are...
4,1102481,big data analytics in future internet of things,Current research on Internet of Things (IoT) m...
...,...,...,...
179715,3012555423,kernel quantization for efficient network comp...,This paper presents a novel network compressio...
179716,3012556759,generating electronic health records with mult...,Sharing electronic health records (EHRs) on a ...
179717,3012557507,vulnerabilities of connectionist ai applicatio...,This article deals with the IT security of con...
179718,3012557525,cross modal multi task learning for graphic re...,Face recognition of realistic visual images ha...


In [9]:
nodeid2text = pd.merge(nodeid2paperid2label, paperid2text, on="paper id")
nodeid2text = nodeid2text.assign(text=nodeid2text["title"].str.cat(nodeid2text["abstract"], sep=" "))
nodeid2text = nodeid2text[["node idx", "label", "text"]]
nodeid2text

Unnamed: 0,node idx,label,text
0,0,4,evasion attacks against machine learning at te...
1,1,5,how hard is computing parity with noisy commun...
2,2,28,on the absence of the rip in real world applic...
3,3,8,a promise theory perspective on data networks ...
4,4,27,analysis of asymptotically optimal sampling ba...
...,...,...,...
169338,169338,4,sentinet detecting localized universal attacks...
169339,169339,24,interpretable mtl from heterogeneous domains u...
169340,169340,10,learning compositional rules via neural progra...
169341,169341,4,certified defenses for adversarial patches Adv...


In [10]:
nodeid2text_train = nodeid2text.loc[train_idx]
nodeid2text_valid = nodeid2text.loc[valid_idx]
nodeid2text_test = nodeid2text.loc[test_idx]

In [11]:
vectorizer = CountVectorizer()
vectorizer.fit(nodeid2text_train["text"])
X_train_counts = vectorizer.transform(nodeid2text_train["text"])

In [12]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train = tf_transformer.transform(X_train_counts)
X_train

<90941x96000 sparse matrix of type '<class 'numpy.float64'>'
	with 8769896 stored elements in Compressed Sparse Row format>

In [13]:
clf = MultinomialNB().fit(X_train, nodeid2text_train["label"])

In [14]:
valid_pred = clf.predict(tf_transformer.transform(vectorizer.transform(nodeid2text_valid["text"])))
test_pred = clf.predict(tf_transformer.transform(vectorizer.transform(nodeid2text_test["text"])))

In [15]:
(valid_pred == nodeid2text_valid["label"]).mean()

0.3182992717876439

In [16]:
(test_pred == nodeid2text_test["label"]).mean()

0.29220418492685635

In [17]:
len(nodeid2text["label"].unique())

40

# Topic Modeling

In [18]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\benmo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
nodeid2text = nodeid2text.assign(words=nodeid2text["text"].progress_apply(lambda mystr: gensim.utils.simple_preprocess(mystr, deacc=True)))
nodeid2text

  0%|          | 0/169343 [00:00<?, ?it/s]

Unnamed: 0,node idx,label,text,words
0,0,4,evasion attacks against machine learning at te...,"[evasion, attacks, against, machine, learning,..."
1,1,5,how hard is computing parity with noisy commun...,"[how, hard, is, computing, parity, with, noisy..."
2,2,28,on the absence of the rip in real world applic...,"[on, the, absence, of, the, rip, in, real, wor..."
3,3,8,a promise theory perspective on data networks ...,"[promise, theory, perspective, on, data, netwo..."
4,4,27,analysis of asymptotically optimal sampling ba...,"[analysis, of, asymptotically, optimal, sampli..."
...,...,...,...,...
169338,169338,4,sentinet detecting localized universal attacks...,"[sentinet, detecting, localized, universal, at..."
169339,169339,24,interpretable mtl from heterogeneous domains u...,"[interpretable, mtl, from, heterogeneous, doma..."
169340,169340,10,learning compositional rules via neural progra...,"[learning, compositional, rules, via, neural, ..."
169341,169341,4,certified defenses for adversarial patches Adv...,"[certified, defenses, for, adversarial, patche..."


In [20]:
def remove_stopwords(words):
    return [word for word in words if word not in stop_words]

nodeid2text = nodeid2text.assign(words_clean=nodeid2text["words"].progress_apply(remove_stopwords))

  0%|          | 0/169343 [00:00<?, ?it/s]

In [21]:
import gensim.corpora as corpora
id2word = corpora.Dictionary(nodeid2text["words_clean"].iloc[train_idx])

In [22]:
corpus = [id2word.doc2bow(text) for text in nodeid2text.iloc[train_idx]["words_clean"]]

In [23]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10)

In [24]:
lda_model.print_topics()

[(0,
  '0.008*"time" + 0.007*"algorithm" + 0.007*"problem" + 0.006*"model" + 0.005*"results" + 0.005*"show" + 0.005*"based" + 0.004*"optimal" + 0.004*"algorithms" + 0.004*"paper"'),
 (1,
  '0.010*"data" + 0.007*"algorithm" + 0.006*"based" + 0.006*"performance" + 0.005*"method" + 0.005*"sparse" + 0.004*"time" + 0.004*"problem" + 0.004*"algorithms" + 0.004*"learning"'),
 (2,
  '0.017*"data" + 0.008*"learning" + 0.006*"based" + 0.005*"using" + 0.005*"approach" + 0.004*"model" + 0.004*"new" + 0.004*"show" + 0.004*"paper" + 0.004*"performance"'),
 (3,
  '0.007*"based" + 0.007*"system" + 0.006*"model" + 0.006*"paper" + 0.005*"systems" + 0.004*"show" + 0.004*"using" + 0.004*"data" + 0.004*"logic" + 0.004*"language"'),
 (4,
  '0.008*"codes" + 0.007*"network" + 0.006*"data" + 0.005*"paper" + 0.005*"information" + 0.005*"based" + 0.004*"results" + 0.004*"channel" + 0.004*"linear" + 0.004*"relay"'),
 (5,
  '0.009*"time" + 0.009*"graph" + 0.009*"algorithm" + 0.008*"problem" + 0.006*"graphs" + 0.00

In [25]:
gammas, _ = lda_model.inference([id2word.doc2bow(text) for text in nodeid2text["words_clean"]])
gammas.shape

(169343, 10)

In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
train_idx

array([     0,      1,      2, ..., 169145, 169148, 169251], dtype=int64)

In [29]:
logistic_clf = LogisticRegression(random_state=0).fit(gammas[train_idx], nodeid2text.iloc[train_idx]["label"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
logistic_clf.score(gammas[valid_idx], nodeid2text.iloc[valid_idx]["label"])

0.433538038189201

In [32]:
logistic_clf.score(gammas[test_idx], nodeid2text.iloc[test_idx]["label"])

0.4130814970269325