In [21]:
import numpy as np
import networkx as nx
import torch
from transformers import BertTokenizer, BertModel
from typing import List

import sys
sys.path.append('..')

from tools.BasicUtils import my_read, ugly_normalize, ntopidx

In [2]:
# Load the tokenizer and the encoder from the same pretrained checkpoint so
# that the token ids produced by one match the embedding table of the other.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [111]:
# Build the working set: corpus lines that mention both 'python' and 'language'.
# Step 1 keeps lines matching 'python' (written to the scratch file temp.txt).
!grep 'python' ../data/corpus/small_sent.txt > temp.txt
# Step 2 narrows those to lines that also match 'language'.
!grep 'language' temp.txt > all_occurance.txt
# my_read is a project helper; presumably it returns one entry per line of the
# file -- TODO confirm against tools.BasicUtils.
sent_list = my_read('all_occurance.txt')
# Show how many sentences survived both filters.
len(sent_list)

243

In [100]:
def do_pagerank(sents:List[str], tokenizer:'BertTokenizer', model:torch.nn.Module, threshold:float=0.8, max_length:int=80):
    """Rank sentences by PageRank over a thresholded embedding-similarity graph.

    Each sentence is encoded with ``model`` and represented by the hidden state
    at position 0 of the last layer (the ``[CLS]`` token for BERT-style
    tokenizers). Pairwise similarities below ``threshold`` are dropped, self
    similarities are removed, and the remaining weights form an undirected
    graph on which ``nx.pagerank`` is run.

    Args:
        sents: Sentences to rank.
        tokenizer: Callable tokenizer that accepts a list of strings and returns
            keyword arguments for ``model`` (``return_tensors='pt'``).
        model: Encoder whose output exposes ``last_hidden_state``.
        threshold: Similarities strictly below this value are set to 0
            (no edge). Defaults to the previously hard-coded 0.8.
        max_length: Truncation length passed to the tokenizer. Defaults to the
            previously hard-coded 80.

    Returns:
        A pair ``(ranked_sents, ranked_scores)``: the input sentences and their
        PageRank scores, both ordered by descending score. Two empty lists are
        returned when ``sents`` is empty.
    """
    # Nothing to rank; also avoids handing an empty batch to the tokenizer.
    if len(sents) == 0:
        return [], []

    # Inference only: no autograd graph is needed for the embeddings.
    with torch.no_grad():
        inputs = tokenizer(sents, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        output = model(**inputs)
        sent_emb = output.last_hidden_state[:, 0].numpy()

    # NOTE(review): ugly_normalize is a project helper; presumably it scales each
    # row to unit length so the dot product below is a cosine similarity -- confirm.
    sent_emb = ugly_normalize(sent_emb)
    sim_matrix = np.matmul(sent_emb, sent_emb.T)

    # Drop weak links, then work in float64. The original used `np.float`, an
    # alias removed in NumPy 1.24.
    sim_matrix[sim_matrix < threshold] = 0
    sim_matrix = sim_matrix.astype(np.float64)
    # Remove self-loops so a sentence does not vote for itself.
    np.fill_diagonal(sim_matrix, 0.0)

    g = nx.from_numpy_array(sim_matrix)
    score = nx.pagerank(g)

    # Node ids are the row indices of sim_matrix, i.e. positions in `sents`.
    order = sorted(score, key=score.get, reverse=True)
    return [sents[i] for i in order], [score[i] for i in order]

In [112]:
# Rank the filtered sentences; both returned lists are ordered by descending
# PageRank score, with score[i] belonging to sents[i].
sents, score = do_pagerank(sent_list, tokenizer=tokenizer, model=model)

In [113]:
# Ten highest PageRank scores (the list is sorted in descending order).
score[:10]

[0.0053760639168282595,
 0.005315701429411255,
 0.005296551157213969,
 0.0052756499582051145,
 0.005273998858462546,
 0.005273755070018193,
 0.005267510772467619,
 0.005247983542732968,
 0.00522351554658776,
 0.0052232561468676915]

In [114]:
# Five lowest PageRank scores, for comparison with the top of the ranking.
score[-5:]

[0.0009729968501772575,
 0.000920422077774054,
 0.0009097013698826598,
 0.0008895110644973206,
 0.0006194507536650837]

In [115]:
# The ten sentences with the highest PageRank scores.
sents[:10]

['furthermore, no appropriate som package is available with respect to machine learning standards and in the widely used programming language python.',
 'we also release implementations of the methods in most major programming languages, wolfram language , matlab, r, perl, python, pascal, c++, and haskell, and a free online algorithmic complexity calculator.',
 'netlogo, being the language of choice for a majority of agent - based modeling driven research projects, requires an integration to python for researchers looking to perform statistical analyses of agent - based model output using these libraries.',
 'these abstractions can be expressed concisely and clearly by leveraging the dynamism of the underlying python language and the flexibility of the pytorch library.',
 'the use of python for parallelization is motivated by the fact that the language is well suited for reusing existing serial codes programmed in other languages.',
 'by embedding a domain - specific language within py