In [2]:
# Import libraries
import pandas as pd
import gensim
import nltk
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to C:\Users\Yedidia
[nltk_data]     AGNIMO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Yedidia
[nltk_data]     AGNIMO\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Prepare data

- [x] Remove stop words (and words of 3 letters or fewer, e.g. "eat"; whether dropping them makes sense is to be checked later)
- [ ] Lemmatization (reduce words to their root form)

In [3]:
# Load the training CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("../assets/data/train.csv")

In [4]:
# Print the (rows, columns) shape, then show the first rows as the cell output.
print(data.shape)
data.head()

(20972, 9)


Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [21]:
# Lemmatization + stemming helpers
stemmer = SnowballStemmer("english")
# Built once and reused: creating a WordNetLemmatizer for every single token is wasted work.
lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
  """Reduce a single token to its stem.

  The token is first lemmatized as a noun (pos='n') with WordNet, then the
  resulting lemma is stemmed with the English Snowball stemmer.

  Args:
    text: one token (a single word) as a string.

  Returns:
    The stemmed lemma.
  """
  return stemmer.stem(lemmatizer.lemmatize(text, pos='n'))


# Tokenise a text, drop stop words and short tokens, stem what is left
def preprocess(text):
  """Turn one raw text into a list of stemmed tokens.

  The text is tokenised with gensim's `simple_preprocess`. Tokens that are
  gensim stop words or have 3 characters or fewer are discarded; every
  remaining token is passed through `lemmatize_stemming`.

  Args:
    text: the raw document text.

  Returns:
    The list of stemmed tokens, in their original order.
  """
  stop_words = gensim.parsing.preprocessing.STOPWORDS
  return [
    lemmatize_stemming(tok)
    for tok in gensim.utils.simple_preprocess(text)
    if len(tok) > 3 and tok not in stop_words
  ]

# Preprocess each "document" (the abstract of a research paper); only the first 200 abstracts are used
processed_docs = list(map(preprocess, data["ABSTRACT"].iloc[:200]))

In [22]:
# Sanity check: container type and number of preprocessed documents.
print(type(processed_docs), len(processed_docs))

<class 'list'> 200


## Convert data.

* The data is stored in a `gensim` dictionary, then converted into a **Bag of Words**: for each document, a list of (word id, occurrence count) pairs.

In [23]:
# Build the token <-> integer id mapping from the preprocessed documents.
dictionary = gensim.corpora.Dictionary(documents=processed_docs)

# Prune the vocabulary in place:
#   - keep tokens that appear in at least `no_below` documents,
#   - and in at most (`no_above` * nb_documents) documents,
#   - then keep only the `keep_n` most frequent of those.
dictionary.filter_extremes(
    no_below=5,
    no_above=0.5,
    keep_n=50,
)

dictionary

<gensim.corpora.dictionary.Dictionary at 0x1d1027d7c70>

In [24]:
# Summary statistics of the dictionary, printed one "label value" line each.
dictionary_stats = (
    ("Document frequencies:", dictionary.dfs),
    ("Number of documents processed:", dictionary.num_docs),
    ("Number of corpus position (number of processed words):", dictionary.num_pos),
    ("Number of non-zeroes in BOW matrix (sum of unique words per document):", dictionary.num_nnz),
    ("Dictionary", dictionary.token2id),
)
for label, value in dictionary_stats:
    print(label, value)

Document frequencies: {8: 18, 11: 20, 1: 15, 3: 20, 7: 17, 9: 17, 10: 20, 5: 17, 2: 15, 12: 18, 4: 16, 0: 18, 6: 16, 17: 20, 16: 20, 14: 16, 15: 15, 13: 19, 19: 20, 21: 17, 20: 18, 18: 20, 23: 18, 26: 19, 24: 15, 22: 15, 25: 20, 27: 17, 29: 20, 31: 17, 30: 15, 28: 16, 33: 16, 32: 16, 34: 15, 36: 16, 35: 18, 37: 17, 39: 18, 40: 20, 38: 16, 41: 17, 43: 19, 45: 16, 42: 16, 44: 19, 46: 16, 49: 16, 48: 15, 47: 18, 50: 15, 51: 16, 52: 20, 53: 17, 54: 16, 55: 19, 56: 19, 58: 20, 57: 16, 59: 16, 60: 16, 61: 18, 62: 15, 63: 16, 64: 19, 65: 15, 66: 20, 67: 17}
Number of documents processed: 200
Number of corpus position (number of processed words): 15871
Number of non-zeroes in BOW matrix (sum of unique words per document): 11187
Dictionary {'dataset': 0, 'detect': 1, 'exampl': 2, 'exist': 3, 'experiment': 4, 'finit': 5, 'imag': 6, 'improv': 7, 'predict': 8, 'reduc': 9, 'sampl': 10, 'specif': 11, 'type': 12, 'multipl': 13, 'neural': 14, 'posit': 15, 'task': 16, 'valu': 17, 'approxim': 18, 'equat

In [25]:
# Bag of Words: every document becomes a list of (token id, count) pairs
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
# Peek at the first five pairs of the first document
bow_corpus[0][:5]

[(0, 2), (1, 6), (2, 1), (3, 1), (4, 1)]

## LDA

In [33]:
# LDA model. `num_topics` is the number of latent topics fitted over the whole corpus
# (not per document); its value still has to be determined, e.g. by comparing coherence scores.
# `random_state` is fixed because training is stochastic: without a seed every
# Restart & Run All produces different topics and a different coherence score.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = 20, id2word = dictionary, passes = 1000, random_state = 42)

In [34]:
# Fetch the formatted term string of every topic, keep the strings for later
# parsing, and print them one topic per line.
formatted_topics = lda_model.print_topics(-1)
topics = [terms for _topic_id, terms in formatted_topics]
for topic_id, terms in formatted_topics:
    print("Topic: {} -> Words: {}".format(topic_id, terms))

Topic: 0 -> Words: 0.223*"identifi" + 0.169*"implement" + 0.113*"standard" + 0.071*"need" + 0.057*"signific" + 0.055*"oper" + 0.044*"avail" + 0.043*"appli" + 0.039*"rang" + 0.034*"framework"
Topic: 1 -> Words: 0.243*"specif" + 0.146*"multipl" + 0.100*"neural" + 0.079*"valu" + 0.052*"set" + 0.051*"exampl" + 0.049*"type" + 0.045*"posit" + 0.034*"establish" + 0.031*"techniqu"
Topic: 2 -> Words: 0.262*"detect" + 0.163*"system" + 0.062*"task" + 0.041*"signific" + 0.041*"interact" + 0.041*"control" + 0.041*"avail" + 0.041*"oper" + 0.041*"investig" + 0.023*"real"
Topic: 3 -> Words: 0.153*"interact" + 0.122*"point" + 0.108*"finit" + 0.089*"bound" + 0.067*"equat" + 0.057*"lead" + 0.045*"special" + 0.045*"implement" + 0.040*"type" + 0.027*"appli"
Topic: 4 -> Words: 0.316*"class" + 0.126*"posit" + 0.092*"correspond" + 0.072*"set" + 0.065*"random" + 0.055*"group" + 0.052*"finit" + 0.032*"multipl" + 0.027*"exampl" + 0.020*"identifi"
Topic: 5 -> Words: 0.168*"group" + 0.138*"reduc" + 0.080*"possibl"

## Model coherence

In [35]:
# Score the fitted topics against the preprocessed documents.
# 'c_v' is gensim's default coherence measure; it is only spelled out here.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=processed_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.24403910529390135


In [39]:
# Turn each formatted topic string, e.g. '0.223*"identifi" + 0.169*"implement" + ...',
# into one row of (weight, word) pairs, then show the rows as a table.
# The strings are parsed on their separators rather than fixed character offsets,
# all terms of a topic are used however many there are, and the built-in `str`
# is no longer rebound.
all_topic_model = []
for topic in topics:
  topic_model = []
  for term in topic.split(' + '):
    # Left of the first '*' is the weight text, right of it the quoted word.
    weight, _sep, word = term.partition('*')
    topic_model.append((weight, word.strip('"')))
  all_topic_model.append(topic_model)

# One row per topic, one column per term rank; shorter rows are padded by pandas.
df_topic_model = pd.DataFrame(all_topic_model)
mapper = {idx: f"Topic {idx}" for idx in df_topic_model.index}
# rename() returns a relabelled copy, which is displayed as the cell output.
df_topic_model.rename(index = mapper)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic 0,"(0.223, identifi)","(0.169, implement)","(0.113, standard)","(0.071, need)","(0.057, signific)","(0.055, oper)","(0.044, avail)","(0.043, appli)","(0.039, rang)","(0.034, framework)"
Topic 1,"(0.243, specif)","(0.146, multipl)","(0.100, neural)","(0.079, valu)","(0.052, set)","(0.051, exampl)","(0.049, type)","(0.045, posit)","(0.034, establish)","(0.031, techniqu)"
Topic 2,"(0.262, detect)","(0.163, system)","(0.062, task)","(0.041, signific)","(0.041, interact)","(0.041, control)","(0.041, avail)","(0.041, oper)","(0.041, investig)","(0.023, real)"
Topic 3,"(0.153, interact)","(0.122, point)","(0.108, finit)","(0.089, bound)","(0.067, equat)","(0.057, lead)","(0.045, special)","(0.045, implement)","(0.040, type)","(0.027, appli)"
Topic 4,"(0.316, class)","(0.126, posit)","(0.092, correspond)","(0.072, set)","(0.065, random)","(0.055, group)","(0.052, finit)","(0.032, multipl)","(0.027, exampl)","(0.020, identifi)"
Topic 5,"(0.168, group)","(0.138, reduc)","(0.080, possibl)","(0.073, improv)","(0.063, exampl)","(0.057, surfac)","(0.049, neural)","(0.040, requir)","(0.038, object)","(0.037, need)"
Topic 6,"(0.165, object)","(0.122, neural)","(0.115, task)","(0.110, dataset)","(0.078, novel)","(0.073, import)","(0.062, featur)","(0.049, predict)","(0.039, detect)","(0.034, signific)"
Topic 7,"(0.471, power)","(0.085, previous)","(0.068, type)","(0.067, chang)","(0.053, requir)","(0.051, long)","(0.047, standard)","(0.035, identifi)","(0.021, simpl)","(0.020, posit)"
Topic 8,"(0.243, approxim)","(0.141, featur)","(0.095, task)","(0.088, control)","(0.080, improv)","(0.036, signific)","(0.036, standard)","(0.036, strong)","(0.036, lead)","(0.036, need)"
Topic 9,"(0.139, challeng)","(0.114, investig)","(0.094, object)","(0.081, predict)","(0.081, long)","(0.067, oper)","(0.064, real)","(0.052, task)","(0.049, improv)","(0.033, rang)"


In [40]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
     ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
      --------------------------------------- 0.0/2.6 MB 1.9 MB/s eta 0:00:02
     - -------------------------------------- 0.1/2.6 MB 1.6 MB/s eta 0:00:02
     - -------------------------------------- 0.1/2.6 MB 1.6 MB/s eta 0:00:02
     - -------------------------------------- 0.1/2.6 MB 901.1 kB/s eta 0:00:03
     -- ------------------------------------- 0.2/2.6 MB 701.4 kB/s eta 0:00:04
     --- ------------------------------------ 0.2/2.6 MB 885.4 kB/s eta 0:00:03
     ---- ----------------------------------- 0.3/2.6 MB 1.1 MB/s eta 0:00:03
     ------- -------------------------------- 0.5/2.6 MB 1.3 MB/s eta 0:00:02
     -------- ------------------------------- 0.5/2.6 MB 1.4 MB/s eta 0:00:02
     ----------- ---------------------------- 0.7/2.6 MB 1.6 MB/s eta 0:00:02
     ------------ --------------------------- 0.8/2.6 MB 1.7 MB/s et

In [41]:
import pyLDAvis.gensim_models

In [42]:
# Enable pyLDAvis' notebook mode first, then build the visualisation from the
# fitted LDA model, the bag-of-words corpus and the dictionary. As the last
# expression of the cell, the prepared object is what gets displayed.
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)