In [1]:
import os
import glob
import pickle
from tqdm import tqdm
import random
import pickle

from collections import Counter
import pandas as pd

import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from scipy.sparse import coo_matrix

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from common import Paper

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set()

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
def get_papers():
    concat = []

    for f in tqdm(glob.glob('./papers/*.pkl')):
        _, field, year, month = os.path.basename(f).split('.')[0].split('-')
        raw = pickle.load(open(f, 'rb'))
        concat += [x.__dict__ for x in raw]

    return pd.DataFrame(concat)

## Global definitions

In [3]:
wnl = WordNetLemmatizer()
ps = PorterStemmer()

stop_words = set(stopwords.words("english"))
stop_words = stop_words.union(["using", "show", "result", "large", 
                               "also", "iv", "one", "two", "new", 
                               "previously", "shown", "cite", "work", "other"
                               "however", "thus", "therefore", "while", "whilst", "continues"])

papers = get_papers()

100%|██████████| 540/540 [00:00<00:00, 807.75it/s]


## Clean dataset and construct corpus

#### Ensure $\geq$ N words in each abstract

In [4]:
MIN_WORDS = 20

papers['word_count'] = papers.loc[:, 'abstract'].apply(lambda x: len(str(x).split(" ")))
papers = papers[papers['word_count'] >= MIN_WORDS]

papers.word_count.describe()

count    110909.000000
mean        158.822999
std          50.537725
min          20.000000
25%         123.000000
50%         155.000000
75%         191.000000
max         559.000000
Name: word_count, dtype: float64

In [5]:
uncommon = pd.Series(' '.join(papers['abstract']).split()).value_counts()[-20:]
# uncommon

In [6]:
def construct_corpus(dataset):
    corpus_map = {}
    
    for i in tqdm(range(len(dataset))):
        d = dataset.iloc[i]
        
        text = f'{d.title} {d.abstract}'
        text = re.sub('[^a-zA-Z-]', ' ', text)
        text = text.lower()
        text = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ",text)

        text = ' '.join([wnl.lemmatize(w) for w in text.split() if not w in stop_words])
        
        corpus_map[d.pdf] = text
        
    return list(corpus_map.values()), corpus_map

corpus, corpus_map = construct_corpus(papers)

100%|██████████| 110909/110909 [00:57<00:00, 1937.43it/s]


#### Wordcloud

In [7]:
# %%time

# wordcloud = WordCloud(background_color='white',
#                       stopwords=stop_words,
#                       max_words=100,
#                       max_font_size=50).generate(str(corpus))

# fig = plt.figure(figsize=(3, 2), dpi=250)
# plt.imshow(wordcloud, interpolation='lanczos')
# plt.axis('off')
# pass

#### Vectorizers

In [8]:
%%time

cv = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=10_000, ngram_range=(1,2))
X = cv.fit_transform(corpus)

tf_idf = TfidfTransformer(smooth_idf=True, use_idf=True)
tf_idf.fit(X)

features = cv.get_feature_names()

pickle.dump([cv, tf_idf, features], open('vec/cv_tfidf_feat.pkl', 'wb'))
pickle.dump(corpus_map, open('vec/corpus_map.pkl', 'wb'))

CPU times: user 23.3 s, sys: 608 ms, total: 23.9 s
Wall time: 24 s


In [9]:
def get_keywords(doc, n=5, only_words=False):
    tf_idf_vec = tf_idf.transform(cv.transform([doc]))
    
    coo = tf_idf_vec.tocoo()
    sorted_items = sorted(zip(coo.col, coo.data), key=lambda x: (x[1], x[0]), reverse=True)
    
    if only_words:
        return [features[idx] for (idx, _) in sorted_items[:n]]
    
    return {features[idx]: round(score, 3) for (idx, score) in sorted_items[:n]}

## Testing

In [10]:
k = random.choice(list(corpus_map.keys()))
keywords = get_keywords(corpus_map[k], n=10)

p = papers[papers.pdf == k]

print("\nTitle:")
print(str(p.title.values[0]))
print("\nAbstract:")
print(str(p.abstract.values[0]))
print("\nKeywords:")
for kw, score in keywords.items():
    print(f'{kw:>32s}: {score}')


Title:
An Overview of Hierarchical Task Network Planning

Abstract:
Hierarchies are the most common structure used to understand the world better. In galaxies, for instance, multiple-star systems are organised in a hierarchical system. Then, governmental and company organisations are structured using a hierarchy, while the Internet, which is used on a daily basis, has a space of domain names arranged hierarchically. Since Artificial Intelligence (AI) planning portrays information about the world and reasons to solve some of world\'s problems, Hierarchical Task Network (HTN) planning has been introduced almost 40 years ago to represent and deal with hierarchies. Its requirement for rich domain knowledge to characterise the world enables HTN planning to be very useful, but also to perform well. However, the history of almost 40 years obfuscates the current understanding of HTN planning in terms of accomplishments, planning models, similarities and differences among hierarchical planners