#### Importing libraries and installing dependencies

In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import matplotlib.pyplot as plt
%matplotlib inline

#avoid all warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
#Define the randomness
np.random.seed(0)

In [None]:
#Import the NLP library and download the corpora used by later cells
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')  # English stop-word list used by the cleaning and LDA cells
# WordNet data is required by nltk.stem.WordNetLemmatizer (Text Cleaning cell);
# without it the first lemmatize() call raises LookupError on a fresh install.
# NOTE(review): some NLTK releases additionally need 'omw-1.4' — confirm for the pinned version.
nltk.download('wordnet')

#### Dataset

In [None]:
#Reading csv file of XML parsed ~6000 DOIs. <path> contains the file
df = pd.read_csv(r'C:\Users\Admin\OneDrive - IIT Delhi\CPCB\PROF. HARI\Jupyter_file\Ex_scibert\Abstract\final_abstract.csv')
df.head()

In [None]:
#44 missing values are dropped. Total of 5901 abstracts are processed
df.isnull().sum() #44 missing null values

In [None]:
df = df.dropna(axis=0)
df.head()

**ChemDataExtractor does not detect catalysts/molecular compounds in lower-cased text, so the text is not lower-cased before extraction. 
Lemmatization, stopword removal and punctuation removal, however, are done prior to catalyst detection.** 

#### Text Cleaning

In [None]:
#Cleaning
def clean(df, column):
    """Strip punctuation and surrounding whitespace from a text column, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; ``df[column]`` is overwritten.
    column : str
        Name of the string column to clean.

    Every character that is neither a word character nor whitespace is
    replaced by a single space; leading and trailing whitespace is then
    removed. Nothing is returned.
    """
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave all punctuation in place.
    # The raw string avoids the invalid-escape warning for '\w' and '\s'.
    df[column] = df[column].str.replace(r'[^\w\s]', ' ', regex=True)
    df[column] = df[column].str.strip()
    
#Stopword Removal
stop = stopwords.words('english')
def stop_remover(dfc, column):
    """Remove stop words from a whitespace-separated text column, in place.

    Parameters
    ----------
    dfc : pandas.DataFrame
        Frame holding the text; ``dfc[column]`` is overwritten.
    column : str
        Name of the string column to filter.

    Tokens are compared with the module-level ``stop`` list exactly as written
    (case-sensitive), and the surviving tokens are re-joined with single spaces.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per token.
    stop_set = set(stop)
    dfc[column] = dfc[column].apply(
        lambda text: " ".join(word for word in text.split() if word not in stop_set)
    )

#Lemmatization
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    """Return the list of lemmas for the whitespace-separated tokens of `text`.

    Each token produced by the module-level ``w_tokenizer`` is passed to the
    module-level ``lemmatizer`` with part-of-speech tag 'v'.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token, 'v'))
    return lemmas
    
# Both text columns are cleaned in place; case is left untouched at this stage.
clean(df, 'Abstract') # Punctuation is replaced by spaces and outer whitespace is trimmed
clean(df, 'Title')

stop_remover(df, 'Abstract') # Stop words are removed (case-sensitive match against `stop`)
stop_remover(df, 'Title')

# Lemmatized copies are stored in new columns; the cleaned 'Abstract'/'Title' columns are kept as they are.
# NOTE(review): lemmatize_text passes POS 'v' to the lemmatizer, so noun plurals such as
# 'alcohol' & 'alcohols' may not be merged by this step — confirm.
df['Abstract_lemm'] = df['Abstract'].apply(lemmatize_text).apply(lambda x: " ".join(x)) # Tokens are lemmatized and re-joined with single spaces
df['Title_lemm'] = df['Title'].apply(lemmatize_text).apply(lambda x: " ".join(x))

df.head()

#### ChemDataExtractor

In [None]:
#library imported
from chemdataextractor import Document
import chemdataextractor

In [None]:
df['Compounds'] = 0

In [None]:
df = df.reset_index(drop=True)
df.head()

In [None]:
#Extract chemical entity mentions (CEMs) from every abstract
def _compound_tokens(text):
    """Return the unique tokens of all chemical entity mentions found in `text`.

    The mentions reported by ChemDataExtractor are joined and split on
    whitespace, so a multi-word mention contributes one token per word.
    Tokens are de-duplicated and sorted so the result is identical on every
    run (iteration order of a plain set of strings is not).
    """
    mention_text = ' '.join(str(cem) for cem in Document(text).cems)
    return ' '.join(sorted(set(mention_text.split())))

# Iterate over the rows that are actually present instead of a hard-coded
# count of 5901, and assign the whole column at once: chained indexing
# (df['Compounds'][i] = ...) is not guaranteed to write back to df.
df['Compounds'] = [_compound_tokens(abstract) for abstract in df['Abstract']]
#Chemical entity mention
df['Compounds']

In [None]:
df.head()

#### Word cloud

In [None]:
#!pip install wordcloud

In [None]:
from wordcloud import WordCloud

# Join the per-abstract 'Compounds' strings of all rows into one comma-separated text
long_string = ','.join([str(i) for i in list(df['Compounds'].values)])

# WordCloud object (white background, font sizes limited to 3-40)
wordcloud = WordCloud(background_color="white", 
                      max_words=100000, 
                      contour_width=5, 
                      contour_color='steelblue',
                      repeat = False,
                      relative_scaling = 0.5,
                      min_font_size=3,
                      max_font_size = 40)
wordcloud.generate(long_string)

# Visualizing: the image is the cell's last expression, so the notebook displays it
wordcloud.to_image()

In [None]:
#Lemmatized abstract is converted to lower case
df['Abstract_lemm'] = df['Abstract_lemm'].str.lower()
df.head()

## LDA

* [Reference 1](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/)
* [Reference 2](https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0)
* [Reference 3](https://towardsdatascience.com/topic-modeling-with-latent-dirichlet-allocation-e7ff75290f8)
* [Reference 4](http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf)

In [None]:
#Tokenize words and further clean-up text
def sent_to_words(sentences):
    """Lazily tokenize each item of `sentences` with gensim's simple_preprocess.

    Every item is converted with ``str()`` first and tokenized with
    ``deacc=True``; one token list is yielded per input item.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)
        
data = df['Abstract_lemm'].tolist()
data_words = list(sent_to_words(data))

In [None]:
# Build the bigram and trigram phrase models from the tokenized abstracts
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is trained on the bigram-transformed corpus
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Phraser wrappers around the trained models; these are what make_bigrams / make_trigrams apply
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize every document and drop the words listed in ``stop_words``.

    Parameters
    ----------
    texts : iterable
        Documents; each one is converted with ``str()`` and tokenized again
        with ``simple_preprocess`` before filtering.

    Returns
    -------
    list of list of str
        One filtered token list per input document.
    """
    # Build the lookup once; a set avoids scanning the stop-word list per token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in `texts`.

    Returns a list with one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in `texts`.

    Returns a list with one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with allowed POS tags.

    See https://spacy.io/api/annotation for the tag set.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents; each is re-joined with spaces and run through
        the module-level spaCy pipeline ``nlp``.
    allowed_postags : iterable of str, optional
        Coarse POS tags (``token.pos_``) to keep. The default is an immutable
        tuple so it cannot be modified between calls.

    Returns
    -------
    list of list of str
        For every document, the lemmas of the tokens whose POS tag is allowed.
    """
    keep = frozenset(allowed_postags)
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in keep])
    return texts_out

In [None]:
import spacy

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams (only make_bigrams is applied here; make_trigrams is not called)
data_words_bigrams = make_bigrams(data_words_nostops)

# Load spaCy's en_core_web_sm pipeline with the parser and NER components disabled (for efficiency).
# `nlp` is the module-level name that lemmatization() reads when it is called below.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])

In [None]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

## Base Model

Code for Base model LDA with default alpha and beta values

In [None]:
# Build the baseline LDA model: 10 topics, fixed random_state for repeatability.
# alpha and eta are not passed, so the library defaults apply.
# per_word_topics=True is set for this model only (the tuning and final models omit it).
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [None]:
from pprint import pprint
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
from gensim.models import CoherenceModel
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

Coherence Score is a measure of the performance of an LDA model. The LDA model is run multiple times in a loop to find the hyperparameters (number of topics, alpha and beta) that maximize the coherence score.

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train one LDA model and return its c_v coherence score.

    Parameters
    ----------
    corpus : iterable
        Bag-of-words corpus the model is trained on.
    dictionary : gensim.corpora.Dictionary
        Id-to-word mapping; used both for training and for the coherence
        computation (previously the global ``id2word`` was used for the
        latter, silently ignoring this argument).
    k : int
        Number of topics.
    a : float or str
        Value passed to the model as ``alpha``.
    b : float or str
        Value passed to the model as ``eta``.
    texts : list of list of str, optional
        Tokenized documents used by the coherence measure. Defaults to the
        module-level ``data_lemmatized``, which is what was always used before.

    Returns
    -------
    float
        The value of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    if texts is None:
        texts = data_lemmatized

    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [None]:
import itertools

import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
# Values are rounded to two decimals so they are written to the CSV as
# '0.01', '0.31', ... ; the analysis cell filters on those exact strings and
# unrounded arange output may carry floating-point noise.
alpha = [round(float(v), 2) for v in np.arange(0.01, 1, 0.3)]
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = [round(float(v), 2) for v in np.arange(0.01, 1, 0.3)]
beta.append('symmetric')

# Validation sets: the first 75% of the documents, and the full corpus
# (25% and 50% subsets were tried in earlier versions and are disabled)
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run: one LDA model is trained per combination.
# The progress-bar total is derived from the grid so it stays correct when
# any of the ranges above is edited.
n_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)
pbar = tqdm.tqdm(total=n_runs)
try:
    # Same nesting order as before: validation corpus, topics, alpha, beta
    for i, k, a, b in itertools.product(range(len(corpus_sets)), topics_range, alpha, beta):
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word,
                                      k=k, a=a, b=b)
        # Save the model results
        model_results['Validation_Set'].append(corpus_title[i])
        model_results['Topics'].append(k)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(b)
        model_results['Coherence'].append(cv)

        pbar.update(1)
finally:
    # Release the progress bar even if a run fails or is interrupted
    pbar.close()

# Written only after the full grid completed, so an interrupted run never
# overwrites an earlier complete results file with partial data.
pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

## Analysis

In [None]:
a1 = pd.read_csv(r'C:\Users\Admin\OneDrive - IIT Delhi\CPCB\PROF. HARI\Jupyter_file\Ex_scibert\Abstract\lda_tuning_results.csv')
a1.head()

In [None]:
# Rows of the tuning results for alpha 0.61, beta 0.31 on the 75% corpus.
# Alpha and Beta are compared as strings, matching how they are filtered on the CSV columns.
dd = a1.query("Alpha == '0.61' and Beta == '0.31' and Validation_Set == '75% Corpus'")
#dd = a1.loc[a1.Topics == 9]
dd.head()

In [None]:
dd.Coherence.nlargest(2)

In [None]:
a1.loc[221]

In [None]:
import matplotlib.pyplot as plt

# Red line: coherence vs. number of topics for the filtered subset `dd`
x = dd['Topics']
y = dd['Coherence']

plt.figure(figsize = (16, 12), linewidth = 20)
plt.plot(x, y, c='r', linewidth=5)
# Scatter: every row of the tuning results `a1`, for context
plt.scatter(a1['Topics'], a1['Coherence'], linewidths=4)
plt.xlabel('Topics', fontweight='bold', fontsize=32)
plt.ylabel('Coherence Score', fontweight='bold', fontsize=32)
plt.xticks(fontweight='bold', fontsize=26)
plt.yticks(fontweight='bold', fontsize=26)
# NOTE(review): the disabled title below says beta=0.61, but `dd` is filtered with Beta == '0.31' — confirm before re-enabling.
#plt.title('LDA: Topics vs Coherence Score (alpha=0.61, beta=0.61)', fontweight='bold', fontsize=28)
# The figure is written to an absolute, machine-specific path
plt.savefig(r'C:\Users\Admin\OneDrive - IIT Delhi\CPCB\PROF. HARI\data_final\coherence.pdf', dpi=5000)
plt.show()


In [None]:
maximum_Coherence = a1.Coherence.max()
maxValueIndex = a1.Coherence.idxmax()
print(maximum_Coherence, maxValueIndex)

In [None]:
a1.loc[222]

## Final Model

In [None]:
import random 
# Seeds Python's `random` module; the model below is given its own random_state=100.
random.seed(0)
# Final model: 9 topics with alpha=0.61 and eta=0.31, trained on the full corpus.
# This rebinds `lda_model`, replacing the baseline model built earlier in the notebook.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=9, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=0.61,
                                           eta=0.31)

In [None]:
pprint(lda_model.print_topics())

In [None]:
#import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

In [None]:
#most dominant topic for each document
doc_lda = lda_model[corpus]

# Number of terms listed per topic; used for both the model query and the
# column labels so the two cannot drift apart.
top_n = 10

# (topic id, weight) pair with the largest weight for every document
corpus_topics = [max(doc_topics, key=lambda record: record[1]) for doc_topics in doc_lda]
# For every topic: its top_n (term, weight) pairs, weights rounded to 3 decimals
topics = [[(term, round(wt, 3)) for term, wt in lda_model.show_topic(n, topn=top_n)] for n in range(0, lda_model.num_topics)]

# Terms as rows (Term1..TermN), topics as columns (Topic 1..Topic K)
topics_df = pd.DataFrame([[term for term, wt in topic] for topic in topics],
                         columns=['Term'+str(i) for i in range(1, top_n+1)],
                         index=['Topic '+str(t) for t in range(1, lda_model.num_topics+1)]).T
topics_df.head()

In [None]:
# column width: None removes the truncation limit (a negative value is no
# longer accepted by current pandas versions)
pd.set_option('display.max_colwidth', None)
# One row per topic holding its terms as a single comma-separated string
topics_df = pd.DataFrame([', '.join([term for term, wt in topic]) for topic in topics], columns = ['Terms per Topic'], index=['Topic'+str(t) for t in range(1, lda_model.num_topics+1)] )
topics_df.head()

In [None]:
topics_df.columns, topics_df['Terms per Topic'][0]

In [None]:
#Visualizing with word clouds
from wordcloud import WordCloud
#wordcloud object
wc = WordCloud(background_color="white", colormap="Dark2", max_font_size=150, random_state=42)

#figure size
plt.rcParams['figure.figsize'] = [20, 15]

#subplots for each topic: the grid is sized from the table instead of a
#hard-coded 9 topics (3 columns, as many rows as needed)
n_topics = len(topics_df)
n_cols = 3
n_rows = -(-n_topics // n_cols)  # ceiling division
for i in range(n_topics):

    # .iloc selects by position; plain [i] on a string-labelled index relies
    # on a deprecated positional fallback
    wc.generate(text=topics_df["Terms per Topic"].iloc[i])

    plt.subplot(n_rows, n_cols, i+1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(topics_df.index[i])

plt.show()

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a table of the dominant topic of every document.

    Parameters
    ----------
    ldamodel : gensim LDA model
        Trained model; indexed with `corpus` to obtain per-document topics.
    corpus : iterable
        Bag-of-words corpus. The default is bound to the module-level
        ``corpus`` at the time this function is defined.
    texts : sequence
        Original documents, appended as the last column. The default is bound
        to the module-level ``data`` at definition time.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to 4
        decimals), ``Topic_Keywords`` (comma-separated terms from
        ``show_topic``) and an unnamed column holding `texts`.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    records = []
    keywords_by_topic = {}  # show_topic() is looked up once per topic, not once per document

    for row_list in ldamodel[corpus]:
        # With per_word_topics the model returns a tuple whose first item is
        # the (topic, probability) list
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep a placeholder so later rows stay aligned with `texts`
            records.append((None, None, None))
            continue

        # Dominant topic = highest probability (first one wins on ties)
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_lemmatized)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(100)

In [None]:
df_dominant_topic.Dominant_Topic.value_counts()

In [None]:
topics_df.columns, topics_df['Terms per Topic'][8]

In [None]:
df_dominant_topic.Dominant_Topic.value_counts().plot(kind='bar', rot=0, figsize = (16, 12), linewidth = 20)
plt.xlabel("Topic Number", labelpad=14, fontweight='bold', fontsize=35)
plt.ylabel("Number of Documents", labelpad=14, fontweight='bold', fontsize=35)
plt.xticks(fontweight='bold', fontsize=26)
plt.yticks(fontweight='bold', fontsize=26)
#plt.savefig(r'C:\Users\Admin\OneDrive - IIT Delhi\CPCB\PROF. HARI\data_final\bar_plot.pdf', dpi=5000)
plt.show()

In [None]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# For every dominant topic keep the single document with the highest
# contribution. The pieces are collected first and concatenated once, rather
# than repeatedly concatenating onto an initially empty frame.
sent_topics_sorteddf_mallet = pd.concat(
    [grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
     for _, grp in sent_topics_outdf_grpd],
    axis=0)

# Reset Index
sent_topics_sorteddf_mallet = sent_topics_sorteddf_mallet.reset_index(drop=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(12)

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim and rebuilds the same table — consider removing one copy.
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# For each dominant topic, keep the one row with the highest Perc_Contribution
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], axis=0)

# Reset Index
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(12)

In [None]:
#most dominant topic for each document
doc_lda = lda_model[corpus]

# Number of terms listed per topic; used for both the model query and the
# column labels so the two cannot drift apart.
top_n = 50

# (topic id, weight) pair with the largest weight for every document
corpus_topics = [max(doc_topics, key=lambda record: record[1]) for doc_topics in doc_lda]
# For every topic: its top_n (term, weight) pairs, weights rounded to 3 decimals
topics = [[(term, round(wt, 3)) for term, wt in lda_model.show_topic(n, topn=top_n)] for n in range(0, lda_model.num_topics)]

# Terms as rows (Term1..TermN), topics as columns (Topic 1..Topic K)
topics_df = pd.DataFrame([[term for term, wt in topic] for topic in topics],
                         columns=['Term'+str(i) for i in range(1, top_n+1)],
                         index=['Topic '+str(t) for t in range(1, lda_model.num_topics+1)]).T
topics_df.head()

In [None]:
# column width: None removes the truncation limit (a negative value is no
# longer accepted by current pandas versions)
pd.set_option('display.max_colwidth', None)
# One row per topic holding its terms as a single comma-separated string
topics_df = pd.DataFrame([', '.join([term for term, wt in topic]) for topic in topics], columns = ['Terms per Topic'], index=['Topic'+str(t) for t in range(1, lda_model.num_topics+1)] )
topics_df

In [None]:
from wordcloud import WordCloud

# NOTE(review): long_string is not used by this cell. It holds the individual
# characters of the terms string at position 3, joined by commas (iterating a
# string yields characters) — probably a leftover; confirm before removing.
long_string = ','.join(topics_df['Terms per Topic'].iloc[3])
# WordCloud object
wordcloud = WordCloud(background_color="white",
                      max_words=100000, 
                      contour_width=5, 
                      contour_color='steelblue',
                      repeat = False,
                      relative_scaling = 0.5,
                      min_font_size=10,
                      max_font_size = 250,
                      width=1600,
                      height=600)
# Cloud for the topic at position 7 (label 'Topic8'), upper-cased.
# .iloc selects by position; plain [7] on a string-labelled index relies on a
# deprecated positional fallback.
wordcloud.generate(text=topics_df["Terms per Topic"].iloc[7].upper())

# Visualizing
wordcloud.to_image()

In [None]:
from wordcloud import WordCloud

# NOTE(review): long_string is not used by this cell. It holds the individual
# characters of the terms string at position 4, joined by commas (iterating a
# string yields characters) — probably a leftover; confirm before removing.
long_string = ','.join(topics_df['Terms per Topic'].iloc[4])
# WordCloud object
wordcloud = WordCloud(background_color="white",
                      max_words=100000, 
                      contour_width=5, 
                      contour_color='steelblue',
                      repeat = False,
                      relative_scaling = 0.5,
                      min_font_size=10,
                      max_font_size = 250,
                      width=1600,
                      height=600)
# Cloud for the topic at position 1 (label 'Topic2'), upper-cased.
# .iloc selects by position; plain [1] on a string-labelled index relies on a
# deprecated positional fallback.
wordcloud.generate(text=topics_df["Terms per Topic"].iloc[1].upper())

# Visualizing
wordcloud.to_image()

In [None]:
#Visualizing with word clouds
from wordcloud import WordCloud
#wordcloud object
wc = WordCloud(background_color="white", colormap="Dark2", max_font_size=150, random_state=42)

#figure size
plt.rcParams['figure.figsize'] = [20, 15]

#subplots for each topic: the grid is sized from the table instead of a
#hard-coded 9 topics (3 columns, as many rows as needed)
n_topics = len(topics_df)
n_cols = 3
n_rows = -(-n_topics // n_cols)  # ceiling division
for i in range(n_topics):

    # .iloc selects by position; plain [i] on a string-labelled index relies
    # on a deprecated positional fallback
    wc.generate(text=topics_df["Terms per Topic"].iloc[i])

    plt.subplot(n_rows, n_cols, i+1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(topics_df.index[i])

plt.show()

In [None]:
# Get topic weights and dominant topics
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
import matplotlib.colors as mcolors

from matplotlib import rc

# Build one weight vector per document: position i holds the probability of
# topic i, and topics the model did not report for a document stay at 0.
topic_weights = []
for row_list in lda_model[corpus]:
    # With per_word_topics the model returns a tuple whose first item is the
    # (topic, probability) list
    doc_topics = row_list[0] if lda_model.per_word_topics else row_list
    tmp = np.zeros(lda_model.num_topics)
    for i, w in doc_topics:
        tmp[i] = w
    # Append once per document. Appending inside the inner loop added the
    # same vector once for every topic of the document, duplicating rows.
    topic_weights.append(tmp)

# Array of topic weights, shape (documents, topics)
arr = pd.DataFrame(topic_weights).fillna(0).values

# Keep the well separated points (optional)
arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

In [None]:
topic_num_ = pd.Series(topic_num)
# NOTE(review): this rebinds `df`, which earlier cells use for the abstracts
# table; re-running those cells afterwards needs the data to be reloaded.
df = pd.DataFrame(tsne_lda)
df.columns = ['tsne_0', 'tsne_1']
df['label'] = topic_num_ 

# Sorted so colours and legend entries appear in a fixed order
unique = sorted(set(df['label']))

# Spread the topics over the jet colormap; the denominator is kept at >= 1 so
# a result containing only topic 0 does not divide by zero
color_scale = max(max(unique), 1)
colors = [plt.cm.jet(float(i)/color_scale) for i in unique]

plt.figure(figsize=(13,13))
for i, u in enumerate(unique):
    # Boolean mask instead of per-element Python indexing
    in_topic = df['label'] == u
    # color= (not c=) because a single RGBA tuple passed as c= is ambiguous
    # with a sequence of four scalar values
    plt.scatter(df.loc[in_topic, 'tsne_0'], df.loc[in_topic, 'tsne_1'],
                color=colors[i], label='Topic_'+str(u))

plt.xlabel('t_SNE_1',fontweight='bold', fontsize= 24)
plt.ylabel('t_SNE_2',fontweight='bold', fontsize=24)

plt.rcParams['axes.linewidth'] = 2

plt.tick_params(axis="x", direction="in",width=2)
plt.tick_params(axis="y", direction="in", width=2)

rc('font', weight='bold')

plt.tick_params(bottom=True, top=True, left=True, right=True)
plt.tick_params(labelbottom=True, labeltop=False, labelleft=True, labelright=False)

plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

plt.legend(frameon=False,prop={'weight':'bold',"size":15})

# The figure is written to an absolute, machine-specific path
plt.savefig(r'C:\Users\Admin\OneDrive - IIT Delhi\CPCB\PROF. HARI\paper\pipeline\tsne.pdf', dpi=5000)
plt.show()