# Topic Modelling / Clinicaltrials.gov
## Import

In [1]:
#!pip install pyldavis

In [2]:
import string
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
pd.options.display.max_columns = 50
from tqdm.auto import tqdm
tqdm.pandas()

import textblob

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
nlp = spacy.load('en_core_web_lg')

import pyLDAvis.sklearn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import warnings
warnings.filterwarnings('ignore')

## Create DF

In [3]:
#articles_df["Abstract"]=articles_df["Abstract"].astype(str)
url = 'https://raw.githubusercontent.com/benjaminschwetz/library_hackathon_topic/master/biofilm2019_scopus.csv'
df = pd.read_csv(url,index_col=0,parse_dates=[0])

In [4]:
# Creating a vectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
NZ_STOP_WORDS = ['biofilm', 'biofilms']
STOP_WORDS = list(ENGLISH_STOP_WORDS) + NZ_STOP_WORDS


#stop_words = {'english' + 'biofilm' + 'biofilms'}
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words=STOP_WORDS, lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(df['Abstract'])

In [5]:
NUM_TOPICS = 10

# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=50, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)
print(data_lda)

iteration: 1 of max_iter: 50
iteration: 2 of max_iter: 50
iteration: 3 of max_iter: 50
iteration: 4 of max_iter: 50
iteration: 5 of max_iter: 50
iteration: 6 of max_iter: 50
iteration: 7 of max_iter: 50
iteration: 8 of max_iter: 50
iteration: 9 of max_iter: 50
iteration: 10 of max_iter: 50
iteration: 11 of max_iter: 50
iteration: 12 of max_iter: 50
iteration: 13 of max_iter: 50
iteration: 14 of max_iter: 50
iteration: 15 of max_iter: 50
iteration: 16 of max_iter: 50
iteration: 17 of max_iter: 50
iteration: 18 of max_iter: 50
iteration: 19 of max_iter: 50
iteration: 20 of max_iter: 50
iteration: 21 of max_iter: 50
iteration: 22 of max_iter: 50
iteration: 23 of max_iter: 50
iteration: 24 of max_iter: 50
iteration: 25 of max_iter: 50
iteration: 26 of max_iter: 50
iteration: 27 of max_iter: 50
iteration: 28 of max_iter: 50
iteration: 29 of max_iter: 50
iteration: 30 of max_iter: 50
iteration: 31 of max_iter: 50
iteration: 32 of max_iter: 50
iteration: 33 of max_iter: 50
iteration: 34 of ma

In [6]:
# Function for printing keywords for each topic
test = {}

def selected_topics(model, vectorizer, top_n=20):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
                  

selected_topics(lda, vectorizer)


# from pprint import pprint
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]
vectorizer.get_feature_names()

Topic 0:
[('isolates', 732.6736724388818), ('strains', 417.5651614362668), ('resistance', 256.8613342366352), ('study', 202.7574453053388), ('genes', 194.5397885431567), ('gene', 164.87560319708152), ('samples', 154.84141599582472), ('patients', 141.35209763857029), ('associated', 138.69387063793621), ('isolated', 135.87480436254063), ('community', 126.60369719768525), ('showed', 122.70461612832534), ('sequencing', 117.58832623744026), ('water', 117.21568885474369), ('resistant', 116.86750548843574), ('results', 115.69278967733709), ('clinical', 115.52582013026334), ('aureus', 105.00800894330786), ('higher', 104.57980129491548), ('high', 103.78072437594957)]
Topic 1:
[('aeruginosa', 322.42082626627587), ('infections', 318.6097788687576), ('resistance', 298.5748574713453), ('antibiotic', 275.57247415835064), ('antibiotics', 224.0569036778009), ('infection', 204.61076997147612), ('bacterial', 204.4225719753158), ('bacteria', 195.54122281224662), ('treatment', 165.8980017370086), ('chroni

['-based',
 '-day',
 '-dependent',
 '-dimethylthiazol-',
 '-diphenyltetrazolium',
 '-fold',
 '-hour',
 '-hsl',
 '-lactam',
 '-lactamase',
 '-log',
 '-month',
 '-phosphate',
 '-week',
 '-well',
 '-year',
 '-year-old',
 '-yl',
 'aap',
 'abc',
 'abilities',
 'ability',
 'abiotic',
 'able',
 'abolished',
 'absence',
 'absent',
 'absolute',
 'absorbance',
 'absorbed',
 'absorption',
 'abstract',
 'abundance',
 'abundances',
 'abundant',
 'academic',
 'academy',
 'accelerate',
 'accelerated',
 'accepted',
 'acceptor',
 'access',
 'accessible',
 'accession',
 'accessory',
 'accompanied',
 'accordance',
 'according',
 'accordingly',
 'account',
 'accounted',
 'accounting',
 'accumulate',
 'accumulated',
 'accumulation',
 'accuracy',
 'accurate',
 'accurately',
 'acetate',
 'acetic',
 'achieve',
 'achieved',
 'achieving',
 'acid',
 'acidic',
 'acidithiobacillus',
 'acidity',
 'acidobacteria',
 'acidogenicity',
 'acids',
 'acinetobacter',
 'acnes',
 'acquired',
 'acquisition',
 'acrylic',
 'act'

In [None]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

In [None]:
# with open(LDAvis_data_filepath, 'w') as f:
#         pickle.dump(LDAvis_prepared, f)
        
# # load the pre-prepared pyLDAvis data from disk
# with open(LDAvis_data_filepath) as f:
#     LDAvis_prepared = pickle.load(f)
# pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(number_topics) +'.html')

In [None]:
dash.token_table
#include the article of origin?