In [56]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import pdfbox
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Preprocessing

In [57]:
# transform pdf to txt
pdfname = "Automated Phrase Mining from Massive Text Corpora.pdf".replace("'", "")
# extract text using PDFBox
temp_txt = pdfname.replace('.pdf', '.txt')
temp_txt = temp_txt.replace(' ', '_')
p = pdfbox.PDFBox()
p.extract_text(pdfname, temp_txt)

textname = pdfname.replace('.pdf', '_converted.txt')
textname = textname.replace(' ', '_')
output_txt = open(textname, 'w')
# concatenate split lines
with open(temp_txt, 'rb') as f:
    for line in f:
        line = line.decode()
        if len(line) >= 2 and line[-2] == '-':
            output_txt.write(line[:-2])
        else:
            output_txt.write(line[:-1] + ' ')
output_txt.close()

In [58]:
# turn input paper to a 1-d array
test_doc = np.array(np.loadtxt("Automated_Phrase_Mining_from_Massive_Text_Corpora_converted.txt", dtype = str, delimiter = "\n"), ndmin = 1)
test_doc[0]

'Automated Phrase Mining from Massive Text Corpora Jingbo Shang , Jialu Liu, Meng Jiang, Xiang Ren, Clare R. Voss, and Jiawei Han, Fellow, IEEE Abstract—As one of the fundamental tasks in text analysis, phrase mining aims at extracting quality phrases from a text corpus and has various downstream applications including information extraction/retrieval, taxonomy construction, and topic modeling. Most existing methods rely on complex, trained linguistic analyzers, and thus likely have unsatisfactory performance on text corpora of new domains and genres without extra but expensive adaption. None of the state-of-the-art models, even data-driven models, is fully automated because they require human experts for designing rules or labeling phrases. In this paper, we propose a novel framework for automated phrase mining, AutoPhrase, which supports any language as long as a general knowledge base (e.g., Wikipedia) in that language is available, while benefiting from, but not requiring, a POS ta

In [59]:
# read knowledge base
cs_docs = list(np.loadtxt("cs.txt", dtype = str, delimiter = "\n"))
cs_docs, len(cs_docs)

(['2D Path Solutions from a Single Layer Excitable CNN Model.',
  'An easily implementable path solution algorithm for 2D spatial problems,',
  'based on excitable/programmable characteristics of a specific cellular',
  'nonlinear network (CNN) model is presented and numerically investigated. The',
  'network is a single layer bioinspired model which was also implemented in CMOS',
  'technology. It exhibits excitable characteristics with regionally bistable',
  'cells. The related response realizes propagations of trigger autowaves, where',
  'the excitable mode can be globally preset and reset. It is shown that, obstacle',
  'distributions in 2D space can also be directly mapped onto the coupled cell',
  'array in the network. Combining these two features, the network model can serve',
  'as the main block in a 2D path computing processor. The related algorithm and',
  'configurations are numerically experimented with circuit level parameters and',
  'performance estimations are also 

In [60]:
# combine input paper and knowledge base
cs_docs.append(test_doc[0])
docs = cs_docs[::-1]
len(docs)

7110512

## TF-IDF on Input Paper

See the idf values of words

In [61]:
#instantiate CountVectorizer() 
cv = CountVectorizer() 
 
# this steps generates word counts for the words in your docs 
word_count_vector = cv.fit_transform(docs)

In [62]:
tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [63]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=["idf_weights"]) 
 
# sort ascending 
idf = df_idf.sort_values(by=['idf_weights'], ascending = False)
idf.head(30)

Unnamed: 0,idf_weights
½0,16.083938
nonrational,16.083938
nonrandomly,16.083938
nonradiation,16.083938
nonquantized,16.083938
nonquantitative,16.083938
nonquantified,16.083938
nonpublic,16.083938
nonproj,16.083938
nonproductive,16.083938


In [64]:
# count matrix 
count_vector=cv.transform(docs) 
 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [65]:
feature_names = cv.get_feature_names() 
 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[0] 
 
#print the scores 
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df = df.sort_values(by=["tfidf"],ascending=False)
df

Unnamed: 0,tfidf
phrase,0.457768
phrases,0.299412
autophrase,0.216276
the,0.208348
quality,0.207410
...,...
faceqnet,0.000000
facereader,0.000000
facerec,0.000000
facereenactor,0.000000


In [66]:
df.iloc[1:30]

Unnamed: 0,tfidf
phrases,0.299412
autophrase,0.216276
the,0.208348
quality,0.20741
in,0.155732
to,0.155617
phrasal,0.148709
and,0.13973
of,0.139447
pos,0.127983


## TF-IDF on search results

This paper is the first recommendation result using AutoPhrase keywords to search on Semantic Scholar.

Keywords: knowledge bases, high quality, domain experts, 

Field of Study: Computer Science

In [67]:
# transform pdf to txt
pdfname = "Constructing large scale.pdf".replace("'", "")
# extract text using PDFBox
temp_txt = pdfname.replace('.pdf', '.txt')
temp_txt = temp_txt.replace(' ', '_')
p = pdfbox.PDFBox()
p.extract_text(pdfname, temp_txt)

textname = pdfname.replace('.pdf', '_converted.txt')
textname = textname.replace(' ', '_')
output_txt = open(textname, 'w')
# concatenate split lines
with open(temp_txt, 'rb') as f:
    for line in f:
        line = line.decode()
        if len(line) >= 2 and line[-2] == '-':
            output_txt.write(line[:-2])
        else:
            output_txt.write(line[:-1] + ' ')
output_txt.close()

In [68]:
# turn input paper to a 1-d array
test_doc = np.array(np.loadtxt("Constructing_large_scale_converted.txt", dtype = str, delimiter = "\n"), ndmin = 1)
test_doc

array(['Proceedings of the BioNLP 2019 workshop, pages 142–151 Florence, Italy, August 1, 2019. c©2019 Association for Computational Linguistics 142 Constructing large scale biomedical knowledge bases from scratch with rapid annotation of interpretable patterns Julien Fauqueur∗ BenevolentAI 4-8 Maple St, London W1T 5HD julien@benevolent.ai Ashok Thillaisundaram∗ BenevolentAI 4-8 Maple St, London W1T 5HD ashok@benevolent.ai Theodosia Togia∗ BenevolentAI 4-8 Maple St, London W1T 5HD sia@benevolent.ai Abstract Knowledge base construction is crucial for summarising, understanding and inferring relationships between biomedical entities. However, for many practical applications such as drug discovery, the scarcity of relevant facts (e.g. gene X is therapeutic target for disease Y) severely limits a domain expert’s ability to create a usable knowledge base, either directly or by training a relation extraction model. In this paper, we present a simple and effective method of extracting new fac

In [73]:
# read knowledge base
cs_docs = list(np.loadtxt("cs.txt", dtype = str, delimiter = "\n"))
cs_docs, len(cs_docs)

(['2D Path Solutions from a Single Layer Excitable CNN Model.',
  'An easily implementable path solution algorithm for 2D spatial problems,',
  'based on excitable/programmable characteristics of a specific cellular',
  'nonlinear network (CNN) model is presented and numerically investigated. The',
  'network is a single layer bioinspired model which was also implemented in CMOS',
  'technology. It exhibits excitable characteristics with regionally bistable',
  'cells. The related response realizes propagations of trigger autowaves, where',
  'the excitable mode can be globally preset and reset. It is shown that, obstacle',
  'distributions in 2D space can also be directly mapped onto the coupled cell',
  'array in the network. Combining these two features, the network model can serve',
  'as the main block in a 2D path computing processor. The related algorithm and',
  'configurations are numerically experimented with circuit level parameters and',
  'performance estimations are also 

In [74]:
# combine input paper and knowledge base
cs_docs.append(test_doc[0])
docs = cs_docs[::-1]
len(docs)

7110512

In [75]:
#instantiate CountVectorizer() 
cv = CountVectorizer() 
 
# this steps generates word counts for the words in your docs 
word_count_vector = cv.fit_transform(docs)

In [76]:
tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [77]:
# count matrix 
count_vector=cv.transform(docs) 
 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [78]:
feature_names = cv.get_feature_names() 
 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[0] 
 
#print the scores 
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df = df.sort_values(by=["tfidf"],ascending=False)
df

Unnamed: 0,tfidf
the,0.307823
pairs,0.272334
of,0.203970
and,0.192947
gene,0.187793
...,...
fadel,0.000000
fademl,0.000000
faden,0.000000
fadernets,0.000000


In [80]:
df.iloc[1:50]

Unnamed: 0,tfidf
pairs,0.272334
of,0.20397
and,0.192947
gene,0.187793
simplifications,0.173287
in,0.173263
patterns,0.163008
disease,0.161871
for,0.160146
expert,0.146263
