# Getting data from ClinicalTrials.gov

In [None]:
# First get the clinical trial data into a dataframe
# We use pytrials, a Python wrapper around the clinicaltrials.gov API.
from pytrials.client import ClinicalTrials
import pandas as pd

# You create an instance of the ClinicalTrials() object 
ct = ClinicalTrials()

# Here is a list of the fields that would be returned in your results
fields = ["NCTId",
          "PrimaryCompletionDate",
          "BriefSummary", 
          "BriefTitle",
          "ResponsiblePartyInvestigatorAffiliation",
          "ResponsiblePartyInvestigatorFullName",
          "PrimaryOutcomeDescription",
          "Phase",
          "Condition",
          "ConditionMeshTerm",
          "LocationCountry"
            ]

# Note, the maximum rows that can be returned is only 1000!
data = ct.get_study_fields("Coronavirus+COVID", fields, max_studies=1000, fmt='csv')

# Convert to a dataframe and save as a csv file
clinical_trials = pd.DataFrame.from_records(data[1:], columns=data[0])
clinical_trials.to_csv("clinical_trials_nlp.csv")

# Text preprocessing

In [1]:
# see: https://towardsdatascience.com/building-a-topic-modeling-pipeline-with-spacy-and-gensim-c5dc03ffc619
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

# Gensim libraries
import gensim
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import re
import pandas as pd
import string  # For punctuation removal

# Add custom stop words
sw_nltk = stopwords.words('english')
sw_nltk.extend(['trial', 
                "covid", 
                "covid-19", 
                "sars-cov-2",
                "coronavirus",
                "study",
                "infection",
                "s"])

# Create a lemmatizer object
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Preprocess text in dataframe column (stopword removal, lowercase, punctuation removal[Note: no removal of / or -])"""
    # First split the text based on whitespace
    words = [word for word in text.split()]
     # Setup custom removal of punctuation (i.e. remove all but keep hyphens and backlash as these are important in scientific studies)
    remove = string.punctuation
    remove = remove.replace("-", "")  # don't remove hyphens
    remove = remove.replace("/", "")  # don't remove backslash
    pattern = r"[{}]".format(remove)  # create the pattern    
    words = [re.sub(pattern, "", w) for w in words]
    #lemmatize words and remove stopwords
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in sw_nltk]
    return words


# Read clinical trial data
df = pd.read_csv('clinical_trials_nlp.csv')

    
# Apply the preprocess function to the 'BriefSummary' column and return as a new column
df["Summary_tokens"] = df['BriefTitle'].apply(lambda x: preprocess(x))
# Create a list of documents from the column for preprocessing
doc_list = df['Summary_tokens'].to_list()

# Map word IDs to words.
words = gensim.corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

print(words)

Dictionary(2014 unique tokens: ['collection', 'new', 'outbreak', 'sample', 'south']...)


# Topic modelling

In [2]:
# LDA model tips here: https://towardsdatascience.com/6-tips-to-optimize-an-nlp-topic-model-for-interpretability-20742f3047e2
# also here: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# Print out the topics and their scores from the LDA model
for i, v in lda_model.print_topics(num_words=10):
    print("topic {}:".format(i))
    print(v)
    
 

topic 0:
0.043*"disease" + 0.036*"2019" + 0.025*"patient" + 0.016*"therapy" + 0.010*"effect" + 0.010*"treatment" + 0.010*"safety" + 0.010*"inhalation" + 0.010*"cell" + 0.010*"evaluation"
topic 1:
0.039*"treatment" + 0.031*"convalescent" + 0.030*"plasma" + 0.025*"disease" + 0.019*"patient" + 0.018*"care" + 0.013*"safety" + 0.013*"2019" + 0.012*"clinical" + 0.012*"efficacy"
topic 2:
0.048*"patient" + 0.047*"disease" + 0.035*"efficacy" + 0.032*"safety" + 0.032*"evaluate" + 0.030*"2019" + 0.022*"severe" + 0.021*"hospitalized" + 0.012*"treatment" + 0.012*"participant"
topic 3:
0.028*"patient" + 0.015*"virus" + 0.014*"corona" + 0.014*"healthcare" + 0.013*"pneumonia" + 0.010*"response" + 0.010*"disease" + 0.009*"severe" + 0.008*"-" + 0.008*"worker"
topic 4:
0.061*"patient" + 0.017*"adult" + 0.016*"therapy" + 0.013*"vaccine" + 0.012*"impact" + 0.011*"disease" + 0.011*"safety" + 0.011*"clinical" + 0.010*"novel" + 0.009*"hospitalized"
topic 5:
0.055*"patient" + 0.022*"treatment" + 0.017*"clinica

In [3]:
# Apart from the bag of words model, lets also perform tf-idf vectorization and use the results for LDA modelling for comparrison
tfidf = gensim.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# Print out the topics and their scores from the LDA model
for i, v in lda_model.print_topics(num_words=10):
    print("topic {}:".format(i))
    print(v)

topic 0:
0.010*"disease" + 0.010*"2019" + 0.009*"cancer" + 0.008*"patient" + 0.006*"therapy" + 0.006*"inhalation" + 0.005*"effect" + 0.005*"evaluation" + 0.005*"vaccination" + 0.005*"treatment"
topic 1:
0.009*"stem" + 0.007*"cell" + 0.007*"safety" + 0.007*"mesenchymal" + 0.007*"patient" + 0.007*"care" + 0.007*"covid19" + 0.007*"treatment" + 0.006*"monitoring" + 0.006*"efficacy"
topic 2:
0.015*"convalescent" + 0.014*"plasma" + 0.011*"disease" + 0.011*"treatment" + 0.008*"patient" + 0.008*"efficacy" + 0.008*"safety" + 0.008*"outpatient" + 0.008*"severe" + 0.008*"evaluate"
topic 3:
0.006*"patient" + 0.005*"safety" + 0.005*"telerehabilitation" + 0.005*"icu" + 0.004*"severity" + 0.004*"cohort" + 0.004*"corona" + 0.004*"pneumonia" + 0.004*"protect" + 0.004*"people"
topic 4:
0.010*"patient" + 0.008*"therapy" + 0.006*"hospitalized" + 0.005*"hydroxychloroquine" + 0.005*"disease" + 0.005*"impact" + 0.004*"efficacy" + 0.004*"diabetes" + 0.004*"covid19" + 0.004*"2"
topic 5:
0.009*"patient" + 0.009

# pyLDAvis

In [4]:
# pyLDavis to visualize the topics
# pyLDavis is a wrapper aroud
import pyLDAvis.gensim_models as gensimvis

#import pyLDAvis
#import pyLDAvis.gensim_models
from IPython.core.display import HTML

vis = pyLDAvis.gensim.prepare(topic_model=lda_model, 
                              corpus=corpus_tfidf, 
                              dictionary=words)


pyLDAvis.enable_notebook()
pyLDAvis.display(vis)


ModuleNotFoundError: No module named 'pyLDAvis'

In [None]:
!python -m pip install -U pyLDAvis

In [6]:
conda install -c conda-forge pyLDAvis

ValueError: The python kernel does not appear to be a conda environment.  Please use ``%pip install`` instead.

In [7]:
!conda info


     active environment : NLP
    active env location : C:\Users\Andrew\Anaconda3\envs\NLP
            shell level : 2
       user config file : C:\Users\Andrew\.condarc
 populated config files : C:\Users\Andrew\.condarc
          conda version : 4.9.2
    conda-build version : 3.18.10
         python version : 3.6.9.final.0
       virtual packages : __cuda=10.2=0
                          __win=0=0
                          __archspec=1=x86_64
       base environment : C:\Users\Andrew\Anaconda3  (writable)
           channel URLs : https://repo.anaconda.com/pkgs/main/win-64
                          https://repo.anaconda.com/pkgs/main/noarch
                          https://repo.anaconda.com/pkgs/r/win-64
                          https://repo.anaconda.com/pkgs/r/noarch
                          https://repo.anaconda.com/pkgs/msys2/win-64
                          https://repo.anaconda.com/pkgs/msys2/noarch
          package cache : C:\Users\Andrew\Anaconda3\pkgs
                   