<a href="https://colab.research.google.com/github/aaubs/ds-master/blob/main/notebooks/M2-topicmodel-openalex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installing Gensim and PyLDAvis
!pip install -qq -U gensim
!pip install -qq pyLDAvis
!pip install -qq --upgrade numpy

In [3]:
# standard library (used by the OpenAlex download cell)
import json
import math

# HTTP client (used by the OpenAlex download cell)
import requests

import pandas as pd
import numpy as np
import tqdm #progress bar

import spacy #spacy for quick language prepro
nlp = spacy.load('en_core_web_sm') #instantiating English module

# sampling, splitting
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split


# loading ML libraries
from sklearn.pipeline import make_pipeline #pipeline creation
from sklearn.feature_extraction.text import TfidfVectorizer #transforms text to sparse matrix
from sklearn.linear_model import LogisticRegression #Logit model
from sklearn.metrics import classification_report #that's self explanatory
from sklearn.decomposition import TruncatedSVD #dimensionality reduction
from xgboost import XGBClassifier

import altair as alt #viz

# topic modeling

from gensim.corpora.dictionary import Dictionary # Import the dictionary builder
from gensim.models import LdaMulticore # we'll use the faster multicore version of LDA


# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Switch pyLDAvis into notebook mode (presumably so prepared visualisations render in output cells).
pyLDAvis.enable_notebook()

  from collections import Iterable
  from collections import Mapping


## Data Loading

In [4]:
# Read the pre-built OpenAlex export (1072 publication records on NLP research)
# directly from its raw GitHub location into a DataFrame.
NLP_OPENALEX_CSV_URL = 'https://raw.githubusercontent.com/AI-Growth-Lab/SciNerTopic/main/data/nlp_openalex.csv'
data = pd.read_csv(NLP_OPENALEX_CSV_URL)

In [None]:
#@title Load Data from OpenAlex

#@markdown You can check out the list of concepts with their IDs [here](https://docs.google.com/spreadsheets/d/1LBFHjPt4rj_9r0t0TTAlT68NwOtNH8Z21lBMsJDMoZg/edit#gid=575855905), e.g., NLP c204321447
# Form parameters for the OpenAlex query that is assembled further down in this cell.
# specify endpoint
endpoint = 'works'
# NOTE(review): the value embeds literal single quotes, which are interpolated as-is
# into the `concepts.id:` filter -- confirm that OpenAlex expects them.
concept = "'c204321447'" #@param {type:"string"}
# Open-access flag; converted to a filter string by OA() and used in the `is_oa:` filter.
oa = True #@param {type:"boolean"}
# Requested number of documents; used to limit how many result pages are fetched.
nDocs = 200 #@param {type:"slider", min:200, max:3000, step:1}
# Lower bound for the `from_publication_date:` filter.
from_pub_date = "2017-01-01" #@param {type:"date"}
#@markdown Enter your email for API call to OpenAlex. It is not stored but just used for the API call to OpenAlex.
email = 'test@test.com'#@param {type:"string"} 


def OA(oa):
  """Translate the open-access flag into the literal used in the API filter.

  Args:
    oa: Truthy to request open-access works only, falsy otherwise.

  Returns:
    The string 'true' when `oa` is truthy, otherwise 'false'.
  """
  # The flag itself must be tested; testing the constant True would make the
  # 'false' branch unreachable and silently ignore the caller's choice.
  return 'true' if oa else 'false'



# Query OpenAlex with cursor paging and collect every work that has an abstract.
# Uses the form parameters (endpoint, concept, oa, nDocs, from_pub_date, email)
# and OA() defined earlier in this cell.

PER_PAGE = 100  # number of works requested per API page


def _rebuild_abstract(inverted_index):
  """Reconstruct the abstract text from an `abstract_inverted_index` mapping.

  The mapping goes from each word to the list of positions at which the word
  occurs. Joining only the keys would keep each word once and therefore lose
  the term frequencies the topic model relies on, so every (position, word)
  pair is expanded and the words are emitted in position order.

  Args:
    inverted_index: dict mapping word -> list of integer positions.

  Returns:
    The abstract as a single space-separated string.
  """
  positioned = [(position, word)
                for word, positions in inverted_index.items()
                for position in positions]
  return ' '.join(word for _, word in sorted(positioned))


oa_str = OA(oa)

# build the 'filter' parameter
filters = ",".join((
    f'concepts.id:{concept}',
    'is_paratext:false',
    f'from_publication_date:{from_pub_date}',
    f'is_oa:{oa_str}'
))

# put the URL together
filtered_works_url = f'https://api.openalex.org/{endpoint}?mailto={email}&filter={filters}'
print(f'complete URL with filters:\n{filtered_works_url}')

# nDocs always bounds the download: fetch just enough pages to cover it.
max_pages = math.ceil(nDocs / PER_PAGE)

df_input = []
next_cursor = '*'  # '*' asks the API to start a new cursor-paged result set
pages_fetched = 0

# Stop when enough pages were read, or when the API reports no further cursor.
while next_cursor and pages_fetched < max_pages:
  response = requests.get(
      filtered_works_url,
      params={'per-page': PER_PAGE, 'cursor': next_cursor},
      timeout=30,
  )
  # Fail loudly on HTTP errors instead of trying to parse an error body.
  response.raise_for_status()
  payload = response.json()
  pages_fetched += 1

  meta = payload['meta']
  results_alx = payload['results']
  if not results_alx:
    break  # an empty page means the result set is exhausted

  for result in results_alx:
    inverted_index = result.get('abstract_inverted_index')
    if inverted_index:  # skip works without an abstract
      abstract = _rebuild_abstract(inverted_index)
      df_input.append((result['id'], result['doi'], result['title'],
                       result['publication_year'], abstract))

  next_cursor = meta['next_cursor']


# Whole pages are fetched, so trim any surplus beyond the requested amount.
data = pd.DataFrame(df_input[:nDocs],
                    columns=['id','doi','title','publication_year','abstract'])

print(f'Downloaded {len(data)} documents')

data.head()

In [5]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

(1072, 5)

In [6]:
# List the column labels of the loaded frame.
data.columns

Index(['id', 'doi', 'title', 'publication_year', 'abstract'], dtype='object')

In [8]:
# Display the frame (the rendered output is truncated to the first and last rows).
data

Unnamed: 0,id,doi,title,publication_year,abstract
0,https://openalex.org/W2962739339,https://doi.org/10.18653/v1/n18-1202,Deep Contextualized Word Representations,2018,We introduce a new type of deep contextualized...
1,https://openalex.org/W2965373594,https://doi.org/10.48550/arxiv.1907.11692,RoBERTa: A Robustly Optimized BERT Pretraining...,2019,Language model pretraining has led to signific...
2,https://openalex.org/W2493916176,https://doi.org/10.1162/tacl_a_00051,Enriching Word Vectors with Subword Information,2017,"Continuous word representations, trained on la..."
3,https://openalex.org/W1902237438,https://doi.org/10.18653/v1/d15-1166,Effective Approaches to Attention-based Neural...,2015,An attentional mechanism has lately been used ...
4,https://openalex.org/W2962784628,https://doi.org/10.18653/v1/p16-1162,Neural Machine Translation of Rare Words with ...,2016,Neural machine translation (NMT) models typica...
...,...,...,...,...,...
1067,https://openalex.org/W2963924212,https://doi.org/10.18653/v1/d17-1314,DOC: Deep Open Classification of Text Documents,2017,Traditional supervised learning makes the clos...
1068,https://openalex.org/W2105637130,https://doi.org/10.1093/jamia/ocv034,Toward high-throughput phenotyping: unbiased a...,2015,Analysis of narrative (text) data from electro...
1069,https://openalex.org/W2295584157,https://doi.org/10.3115/v1/n15-1028,Deep Multilingual Correlation for Improved Wor...,2015,Word embeddings have been found useful for man...
1070,https://openalex.org/W2951211142,https://doi.org/10.18653/v1/p19-1213,Ranking Generated Summaries by Correctness: An...,2019,While recent progress on abstractive summariza...


In [9]:
# Build the modelling text as "<title>. <abstract>".
# Missing titles/abstracts are replaced by '' first: concatenating a string with
# a missing value would otherwise turn the whole `text` entry into NaN, which
# is not a string and cannot be processed by the spaCy pipeline.
data['text'] = data['title'].fillna('') + '. ' + data['abstract'].fillna('')

In [11]:
# Clean up the texts with spaCy (heavy pipeline components disabled) while
# showing a progress bar. For every document the lower-cased lemmas of the
# alphabetic, non-stop-word tokens are kept and joined back into one string.
# NOTE(review): with "tagger" disabled the lemmatizer of newer spaCy pipelines
# may have no POS information to work with -- confirm lemma quality against
# the installed spaCy version.

clean_text = []

texts = data['text']
docs = nlp.pipe(texts, disable=["tagger", "parser", "ner"])

# Wrapping the iterator lets tqdm close the bar (and draw its final state)
# once the iterator is exhausted; a manually updated bar that is never closed
# can be left displaying a stale count.
for doc in tqdm.tqdm(docs, total=len(texts), position=0, leave=True):

  # is_alpha already rules out punctuation tokens, so no extra is_punct test is needed
  lemmas = [token.lemma_.lower() for token in doc
            if token.is_alpha
            and not token.is_stop]

  clean_text.append(" ".join(lemmas))

100%|█████████▉| 1070/1072 [00:10<00:00, 201.01it/s]

In [12]:
# Display the frame again, now including the combined `text` column.
data

Unnamed: 0,id,doi,title,publication_year,abstract,text
0,https://openalex.org/W2962739339,https://doi.org/10.18653/v1/n18-1202,Deep Contextualized Word Representations,2018,We introduce a new type of deep contextualized...,Deep Contextualized Word Representations. We i...
1,https://openalex.org/W2965373594,https://doi.org/10.48550/arxiv.1907.11692,RoBERTa: A Robustly Optimized BERT Pretraining...,2019,Language model pretraining has led to signific...,RoBERTa: A Robustly Optimized BERT Pretraining...
2,https://openalex.org/W2493916176,https://doi.org/10.1162/tacl_a_00051,Enriching Word Vectors with Subword Information,2017,"Continuous word representations, trained on la...",Enriching Word Vectors with Subword Informatio...
3,https://openalex.org/W1902237438,https://doi.org/10.18653/v1/d15-1166,Effective Approaches to Attention-based Neural...,2015,An attentional mechanism has lately been used ...,Effective Approaches to Attention-based Neural...
4,https://openalex.org/W2962784628,https://doi.org/10.18653/v1/p16-1162,Neural Machine Translation of Rare Words with ...,2016,Neural machine translation (NMT) models typica...,Neural Machine Translation of Rare Words with ...
...,...,...,...,...,...,...
1067,https://openalex.org/W2963924212,https://doi.org/10.18653/v1/d17-1314,DOC: Deep Open Classification of Text Documents,2017,Traditional supervised learning makes the clos...,DOC: Deep Open Classification of Text Document...
1068,https://openalex.org/W2105637130,https://doi.org/10.1093/jamia/ocv034,Toward high-throughput phenotyping: unbiased a...,2015,Analysis of narrative (text) data from electro...,Toward high-throughput phenotyping: unbiased a...
1069,https://openalex.org/W2295584157,https://doi.org/10.3115/v1/n15-1028,Deep Multilingual Correlation for Improved Wor...,2015,Word embeddings have been found useful for man...,Deep Multilingual Correlation for Improved Wor...
1070,https://openalex.org/W2951211142,https://doi.org/10.18653/v1/p19-1213,Ranking Generated Summaries by Correctness: An...,2019,While recent progress on abstractive summariza...,Ranking Generated Summaries by Correctness: An...


In [13]:
# Store the cleaned strings (one per row, in row order) as the `text_clean` column.
data = data.assign(text_clean=clean_text)

In [14]:
# Turn each cleaned text into a token list for the topic model: run spaCy
# (without NER) and keep the lower-cased lemma of every token whose coarse
# part of speech is listed below and that is neither a stop word nor punctuation.
CONTENT_POS = ['NOUN', 'PROPN', 'ADJ', 'ADV']

tokens = []

for doc in nlp.pipe(data['text_clean'], disable=["ner"]):
  kept = []
  for tok in doc:
    if tok.pos_ not in CONTENT_POS:
      continue
    if tok.is_stop or tok.is_punct:
      continue
    kept.append(tok.lemma_.lower())
  tokens.append(kept)

In [15]:
# Store the per-document token lists as the `tokens` column.
data = data.assign(tokens=tokens)

In [16]:
# Vocabulary: build a gensim Dictionary over the tokenised documents.
token_docs = data['tokens']
dictionary = Dictionary(token_docs)
# Prune rare and very common tokens and cap the vocabulary at 1000 entries.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
# Corpus: one bag-of-words representation per document, based on the pruned vocabulary.
corpus = list(map(dictionary.doc2bow, token_docs))

In [None]:
# Preview the first few bag-of-words documents only; displaying the whole
# corpus would flood the output with one entry list per document.
corpus[:3]

In [40]:
# Training the model
# A fixed random_state seeds the model's random initialisation so that the
# topics, and every number derived from them (e.g. the coherence score), are
# comparable between re-runs of the notebook.
# NOTE(review): training with several workers may still introduce small
# run-to-run differences -- confirm if exact reproducibility is required.
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=10, workers = 4, passes=10, random_state=42)

In [22]:
# Prepare the pyLDAvis data structure for the fitted model, using the
# `gensimvis` alias under which pyLDAvis.gensim_models was imported.
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)

In [23]:
 # Let's Visualize: render the prepared pyLDAvis data in the notebook output
pyLDAvis.display(lda_display)

In [25]:
from gensim.models.coherencemodel import CoherenceModel

In [41]:
# Score the fitted LDA model with the 'c_v' coherence measure, using the
# tokenised documents as reference texts.
cm = CoherenceModel(model=lda_model, texts = data['tokens'], coherence='c_v')
coherence = cm.get_coherence()  # get coherence value

In [42]:
# Show the coherence score computed for the fitted model.
coherence

0.4113904704698316