In [None]:
import os
import glob
import pickle
from tqdm import tqdm
import random
import pickle

from collections import Counter
import pandas as pd

import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from scipy.sparse import coo_matrix

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from common import Paper

# Render matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# NOTE(review): no `import seaborn as sns` is visible among the imports of this
# cell, so this call raises NameError on a fresh kernel unless `sns` is
# provided some other way — confirm and add the import if it is missing.
sns.set()

# Disable pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
def get_papers():
    """Load every pickled paper batch under ``./papers`` into one DataFrame.

    Each ``./papers/*.pkl`` file is expected to unpickle to an iterable of
    objects; the ``__dict__`` of every object becomes one row.

    Returns:
        pd.DataFrame: one row per unpickled object, in sorted file-path
        order. Empty when no file matches.
    """
    records = []

    # Sort the paths so row order does not depend on directory listing order.
    for path in tqdm(sorted(glob.glob('./papers/*.pkl'))):
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The context manager closes the handle even if unpickling fails.
        with open(path, 'rb') as fh:
            batch = pickle.load(fh)
        records.extend(paper.__dict__ for paper in batch)

    return pd.DataFrame(records)

## Global definitions

In [None]:
# Shared NLP helpers, created once for the whole notebook.
wnl = WordNetLemmatizer()
ps = PorterStemmer()

# NLTK's English stop words plus extra filler words added for this corpus.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated into a single word.
stop_words = set(stopwords.words("english"))
stop_words = stop_words.union([
    "using", "show", "result", "large",
    "also", "iv", "one", "two", "new",
    "previously", "shown", "cite", "work", "other",
    "however", "thus", "therefore", "while", "whilst", "continues",
])

# Load all pickled papers into a single DataFrame.
papers = get_papers()

## Clean dataset and construct corpus

#### Keep only abstracts with $\geq$ `MIN_WORDS` words

In [None]:
# Abstracts with fewer words than this are dropped.
MIN_WORDS = 20

# Count whitespace-delimited tokens. split() with no argument treats runs of
# spaces, tabs and newlines as one separator, so line-wrapped abstracts are
# counted correctly. str(x) keeps non-string values from raising.
papers['word_count'] = papers['abstract'].apply(lambda x: len(str(x).split()))
# .copy() makes the filtered result an independent frame, so later column
# assignments (including re-running this cell) do not write into a slice.
papers = papers[papers['word_count'] >= MIN_WORDS].copy()

papers.word_count.describe()

In [None]:
# Pool all abstracts into one token stream, count each token, and keep the
# last 20 entries of the count table (the least frequent tokens).
abstract_tokens = ' '.join(papers['abstract']).split()
uncommon = pd.Series(abstract_tokens).value_counts().iloc[-20:]
# uncommon

In [None]:
def construct_corpus(dataset):
    """Build a cleaned, lemmatized text for every paper in ``dataset``.

    For each row the title and abstract are joined, every character other
    than an ASCII letter or ``-`` is replaced by a space, the text is
    lower-cased, tokens found in the module-level ``stop_words`` are dropped
    and the remaining tokens are lemmatized with the module-level ``wnl``.

    Args:
        dataset: DataFrame with ``pdf``, ``title`` and ``abstract`` columns.

    Returns:
        tuple: ``(corpus, corpus_map)``. ``corpus_map`` maps each ``pdf``
        value to its cleaned text (a repeated ``pdf`` keeps the text of the
        last such row) and ``corpus`` is ``list(corpus_map.values())``.
    """
    corpus_map = {}

    # An empty frame may have no columns at all; there is nothing to clean.
    if len(dataset) == 0:
        return [], corpus_map

    # Iterate the three needed columns directly instead of materialising a
    # row Series per paper.
    rows = zip(dataset['pdf'], dataset['title'], dataset['abstract'])
    for pdf, title, abstract in tqdm(rows, total=len(dataset)):
        text = f'{title} {abstract}'
        # Blank out everything except ASCII letters and hyphens.
        # NOTE(review): this also removes `<`, `>`, `&` and `;`, so any
        # markup-stripping regex placed after this line can never match.
        # If tag names should not leak into the corpus as tokens, strip the
        # markup before this substitution.
        text = re.sub('[^a-zA-Z-]', ' ', text).lower()

        # Stop words are matched on the raw token, before lemmatization.
        corpus_map[pdf] = ' '.join(
            wnl.lemmatize(w) for w in text.split() if w not in stop_words
        )

    return list(corpus_map.values()), corpus_map

# Clean every paper in `papers`: `corpus_map` maps each paper's `pdf` value to
# its cleaned text, and `corpus` is the list of those cleaned texts.
corpus, corpus_map = construct_corpus(papers)

#### Wordcloud

In [None]:
# %%time

# wordcloud = WordCloud(background_color='white',
#                       stopwords=stop_words,
#                       max_words=100,
#                       max_font_size=50).generate(str(corpus))

# fig = plt.figure(figsize=(3, 2), dpi=250)
# plt.imshow(wordcloud, interpolation='lanczos')
# plt.axis('off')
# pass

#### Vectorizers

In [None]:
%%time

cv = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=10_000, ngram_range=(1,2))
X = cv.fit_transform(corpus)

tf_idf = TfidfTransformer(smooth_idf=True, use_idf=True)
tf_idf.fit(X)

features = cv.get_feature_names()

pickle.dump([cv, tf_idf, features], open('vec/cv_tfidf_feat.pkl', 'wb'))
pickle.dump(corpus_map, open('vec/corpus_map.pkl', 'wb'))

In [None]:
def get_keywords(doc, n=5, only_words=False):
    """Return the ``n`` highest-weighted TF-IDF terms of ``doc``.

    The document is vectorized with the module-level ``cv`` and weighted with
    the module-level ``tf_idf``; column indices are translated to terms via
    the module-level ``features``.

    Args:
        doc: text to extract keywords from.
        n: maximum number of terms to return.
        only_words: when true, return just the terms.

    Returns:
        list of terms when ``only_words`` is true, otherwise a dict mapping
        each term to its weight rounded to 3 decimals. Both are ordered by
        descending weight, ties broken by descending column index.
    """
    weights = tf_idf.transform(cv.transform([doc])).tocoo()

    # Pair every non-zero column with its weight and rank the pairs.
    ranked = sorted(
        zip(weights.col, weights.data),
        key=lambda pair: (pair[1], pair[0]),
        reverse=True,
    )
    top = ranked[:n]

    if not only_words:
        return {features[col]: round(weight, 3) for col, weight in top}

    return [features[col] for col, _ in top]

## Testing

In [None]:
# Pick one paper at random and extract its ten strongest keywords.
k = random.choice(list(corpus_map))
keywords = get_keywords(corpus_map[k], n=10)

# Look the paper up again to show its original title and abstract.
p = papers.loc[papers['pdf'] == k]

print("\nTitle:")
print(str(p['title'].values[0]))
print("\nAbstract:")
print(str(p['abstract'].values[0]))
print("\nKeywords:")
for term in keywords:
    print(f'{term:>32s}: {keywords[term]}')