In [1]:
#Reading the dataset

import pandas as pd
med = pd.read_csv(r"Pubmed5k.csv")
med.head()

Unnamed: 0,ArticleID,Title,Abstract
0,34153941,Stable Coordination Variability in Overground ...,Coordination variability (CV) is commonly anal...
1,34153942,Weak Hip Strength Increases Dynamic Knee Valgu...,Clinical Scenario: Dynamic knee valgus (DKV) i...
2,34153964,Current and Future Projections of Amyotrophic ...,Various methodologies have been reported to as...
3,34153968,Disparities between Asian and Non-Asian Thromb...,As outcomes for acute ischemic stroke (AIS) va...
4,34153978,Maternal Factors Predicting Loss to Follow-Up ...,Because hearing loss in children can result in...


In [2]:
#Importing the ntlk package to apply the preprocessing

import nltk
# nltk.download()
from nltk.corpus import stopwords  #stopwords
from nltk.stem import WordNetLemmatizer  
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
stop_words=set(nltk.corpus.stopwords.words('english'))

In [3]:
#preprocessing function (lemmitization >> Tokenization >> filtering stopwords)

def clean_text(headline):
      le=WordNetLemmatizer()
      word_tokens=word_tokenize(headline)
      tokens=[le.lemmatize(w) for w in word_tokens if w not in stop_words and len(w)>3]
      cleaned_text=" ".join(tokens)
      return cleaned_text
med['cleaned_text']=med['Abstract'].apply(clean_text)

In [4]:
#Vectorizing the preprocessed texts

vect =TfidfVectorizer(stop_words=stop_words,max_features=1000)
vect_text=vect.fit_transform(med['cleaned_text'])
#print(vect_text)

In [5]:
# Parameters tuning using Grid Search

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE
grid_params = {'n_components' : list(range(5,10))}

# LDA model

lda = LatentDirichletAllocation(learning_method='online', random_state=42,max_iter=1)
lda_model = GridSearchCV(lda,param_grid=grid_params)
lda_model.fit(vect_text)

# Estimators for LDA model

lda_model1 = lda_model.best_estimator_
print("Best LDA model's params" , lda_model.best_params_)

# Model Evaluation

print("Best log likelihood Score for the LDA model",lda_model.best_score_)
print("Best log likelihood Score for the LDA model",lda_model1.score(vect_text))
print("LDA model Perplexity on train data", lda_model1.perplexity(vect_text))

Best LDA model's params {'n_components': 5}
Best log likelihood Score for the LDA model -44296.76733357087
Best log likelihood Score for the LDA model -202110.24496523768
LDA model Perplexity on train data 1526.4809764816114


In [6]:
for i,topic in enumerate(lda_model1.components_):
    print(f'Top 10 words for topic #{i}:')
    print([vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['composition', 'face', 'plant', 'concentration', 'specie', 'community', 'diet', 'black', 'water', 'dental']


Top 10 words for topic #1:
['health', 'score', 'year', 'intervention', 'risk', 'group', 'covid', '19', 'study', 'patient']


Top 10 words for topic #2:
['healthcare', 'research', 'pandemic', 'service', 'mental', 'professional', 'student', 'social', 'care', 'health']


Top 10 words for topic #3:
['bone', 'surgery', 'surgical', 'clinical', 'treatment', 'present', 'implant', 'review', 'case', 'patient']


Top 10 words for topic #4:
['based', 'model', 'expression', 'effect', 'method', 'pathway', 'protein', 'gene', 'cell', 'specie']




In [7]:
#Visualising the terms in each Topic

import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_model1, vect_text,vect, mds="tsne")

  default_term_info = default_term_info.sort_values(


In [8]:
#Transforming the fitted values to get propabilities for each topic

topic_values = lda_model1.transform(vect_text)
topic_values.shape
print(topic_values)

[[0.03596092 0.85399673 0.03646735 0.03694185 0.03663315]
 [0.03109643 0.87264039 0.03164967 0.03257952 0.03203399]
 [0.02819497 0.88501672 0.02908225 0.02877492 0.02893115]
 ...
 [0.03455543 0.74646614 0.14891569 0.03489741 0.03516533]
 [0.03194137 0.87040682 0.03275368 0.03235134 0.03254679]
 [0.03378691 0.84055471 0.05665863 0.03478244 0.03421731]]


In [9]:
# Getting the highest 3 topic probabilities for each document as requested

def sort_index(lst):
    index = range(len(lst))
    s = sorted(index, reverse=True, key=lambda i: lst[i])
    return s
Top_3_Matches = pd.DataFrame(sort_index(c)[:3] for c in topic_values)
Top_3_Matches.columns=["1st_match", "2nd_match","3rd_match"]
Top_3_Matches

Unnamed: 0,1st_match,2nd_match,3rd_match
0,1,3,4
1,1,3,4
2,1,2,4
3,1,3,2
4,1,2,3
...,...,...,...
4994,4,2,3
4995,2,3,1
4996,1,2,4
4997,1,2,4


In [10]:
#Adding the results to the main dataset

Topics_Extracted= pd.concat([med,Top_3_Matches],axis=1)
Topics_Extracted.to_excel("Topics_Extracted.xlsx")

Remarks:

a- For further optimisation we may use Genism.

b- To get the best no. of topics: 

1- elbow : coherence value vs no. of topics
2- perplexity: as implemented
3- create LDA models across different topic numbers, then check the Jaccard similarity and coherence for each, The ideal number of topics will maximize coherence and minimize the topic overlap based on Jaccard similarity.

c- Appling gridsearch on the topics number will automatically lead to the lowest no. as it has the lowest perplexity and high score due to the bugs.