In [4]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import os

import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from nltk.util import ngrams
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


from wordcloud import WordCloud
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
tqdm.pandas()

# Topic Modelling of Clinical Trials titles

In this notebook we approach finding characteristic topics present in titles of clinical trials. 

**Methodology**:
- Most frequent words, bi-grams and tri-grams
- TF-IDF vectorisation


**Limitations:**



## Loading Data

In [5]:
articles_path = '/Users/kuba/Desktop/PBL/EDA/articles/articles_01-01-2024_14h39m35s.csv'
trials_path = '/Users/kuba/Desktop/PBL/EDA/trials/trials_01-01-2024_14h39m35s.csv'

articles = pd.read_csv(articles_path, index_col=0)
trials = pd.read_csv(trials_path, index_col=0)

## Utilities

In [7]:
# Function to create n-grams and their frequencies
def n_gram_frequency(df, text_column, ngram_range=(2, 2), col_name='n-gram'):
    # Instantiate CountVectorizer for n-grams with binary=True
    vectorizer = CountVectorizer(ngram_range=ngram_range, binary=True)
    
    # Apply the vectorizer to the text documents
    X = vectorizer.fit_transform(df[text_column])
    
    # Sum up the occurrences of each n-gram across all documents
    frequencies = X.sum(axis=0)
    
    # Create a DataFrame with n-grams and their frequencies
    n_gram_df = pd.DataFrame(frequencies.T, index=vectorizer.get_feature_names_out(), columns=['frequency'])
    n_gram_df = n_gram_df.reset_index().rename(columns={'index': col_name})
    
    return n_gram_df

In [17]:
def preprocess_lemma(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    if isinstance(text, list):
        words = [word for sublist in text for word in word_tokenize(sublist.lower())]
    else:
        text = text.translate(str.maketrans('', '', string.punctuation))
        words = word_tokenize(text.lower())

    return ' '.join([lemmatizer.lemmatize(word) for word in words if word not in stop_words and word.isalpha()])

In [115]:
class TfidfTopFeatures:
    def __init__(self, ngram_range=(1,1), top_n=3):
        self.ngram_range = ngram_range
        self.top_n = top_n
        self.tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range)

    def fit_transform(self, documents):
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
        self.tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=self.tfidf_vectorizer.get_feature_names_out())

    def get_top_features_per_document(self):
        top_features_df = self.tfidf_df.apply(lambda x: pd.Series(x.nlargest(self.top_n).index), axis=1)
        column_names = [f'top{i}_feature' for i in range(1, self.top_n + 1)]
        top_features_df.columns = column_names
        return top_features_df

    def get_top_features_counts(self):
        top_features_df = self.get_top_features_per_document()
        top_features_counts = top_features_df.apply(lambda x: pd.Series(x.dropna().values)).stack().value_counts()
        return top_features_counts

In [185]:
class LDAModel:
    def __init__(self, n_components=10, ngram_range=(1,1), n_top_words=10):
        self.n_components = n_components
        self.ngram_range = ngram_range
        self.n_top_words = n_top_words
        self.vectorizer = CountVectorizer(ngram_range=self.ngram_range)
        self.lda = LatentDirichletAllocation(n_components=self.n_components, random_state=42)

    def fit_transform(self, documents, titles):
        self.X = self.vectorizer.fit_transform(documents)
        self.titles = titles
        self.lda.fit(self.X)
        self.feature_names = self.vectorizer.get_feature_names_out()

    def describe_topics(self):
        topics = {}
        for topic_idx, topic in enumerate(self.lda.components_):
            topics["Topic" + str(topic_idx)] = [self.feature_names[i]
                            for i in topic.argsort()[:-self.n_top_words - 1:-1]]
        return topics
    
    def display_topics(self):
        for topic_idx, topic in enumerate(self.lda.components_):
            print("Topic %d:" % (topic_idx))
            print("|".join([self.feature_names[i]
                            for i in topic.argsort()[:-self.n_top_words - 1:-1]]))

    def get_document_topic_df(self):
        lda_output = self.lda.transform(self.X)
        topicnames = ["Topic" + str(i) for i in range(self.lda.n_components)]
        df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=self.titles)
        dominant_topic = np.argmax(df_document_topic.values, axis=1)
        df_document_topic['dominant_topic'] = dominant_topic
        return df_document_topic

## Preprocessing

In [10]:
trials.head()

Unnamed: 0,trial_id,discovery_date,title,summary,link,published_date,source,relevant,Sources__source_id,Sources__name,Sources__link
0,549142,2023-08-02 12:40:50.000000,"Development, Reliability and Validity of the T...",<b>Condition</b>: Neurological Disease<br /...,http://classic.clinicaltrials.gov/ct2/show/NCT...,2023-08-02 16:00:00.000000,12,,12,Clinical Trials.gov,https://clinicaltrials.gov/ct2/results/rss.xml...
1,548852,2023-07-31 16:20:47.742577,Impact of the Cionic Neural Sleeve on Mobility...,<b>Conditions</b>: Multiple Sclerosis; Mu...,https://clinicaltrials.gov/ct2/show/NCT05964829,2023-07-18 00:00:00.000000,17,,17,WHO XML import,https://trialsearch.who.int/
2,548149,2023-07-27 13:50:48.271892,Investigation The Effect of Conventional Vs. I...,<b>Conditions</b>: Multiple Sclerosis; Ne...,https://clinicaltrials.gov/ct2/show/NCT05962281,2023-02-06 00:00:00.000000,17,,17,WHO XML import,https://trialsearch.who.int/
3,548148,2023-07-27 13:50:48.231152,Cladribine vs Placebo for Non-active Progressi...,"<b>Conditions</b>: Multiple Sclerosis, Seco...",https://clinicaltrials.gov/ct2/show/NCT05961644,2023-07-18 00:00:00.000000,17,,17,WHO XML import,https://trialsearch.who.int/
4,548147,2023-07-27 13:50:48.188463,Montpellier PROspective Cohort in Relapsing Re...,<b>Condition</b>: Multiple Sclerosis<br /><...,https://clinicaltrials.gov/ct2/show/NCT05962177,2023-06-15 00:00:00.000000,17,,17,WHO XML import,https://trialsearch.who.int/


In [21]:
trials.dropna(subset=['title'], inplace=True)

In [23]:
documents = trials.title.apply(preprocess_lemma)

In [29]:
trials['documents_processed'] = documents.copy()

## Term Frequencies

### Single Words

In [30]:
n_gram_frequency(trials, 'documents_processed', ngram_range=(1, 1), col_name='unigram').sort_values(by='frequency', ascending=False).head(20)

Unnamed: 0,unigram,frequency
3015,multiple,3002
4289,sclerosis,2970
3486,patient,1485
4625,study,1348
1385,effect,788
4243,safety,692
1395,efficacy,544
4968,treatment,480
4042,relapsing,403
1568,evaluate,298


### Bi-grams

In [39]:
n_gram_frequency(trials, 'documents_processed', ngram_range=(2, 2), col_name='bigram').sort_values(by='frequency', ascending=False).head(20)

Unnamed: 0,bigram,frequency
10167,multiple sclerosis,2954
11638,patient multiple,462
14660,sclerosis patient,289
5209,efficacy safety,273
15765,study evaluate,246
13570,relapsing multiple,183
11861,people multiple,172
13579,relapsingremitting multiple,170
12818,progressive multiple,169
14291,safety tolerability,165


### Tri-Grams

In [34]:
n_gram_frequency(trials, 'documents_processed', ngram_range=(3, 3), col_name='trigram').sort_values(by='frequency', ascending=False).head(20)

Unnamed: 0,trigram,frequency
14572,patient multiple sclerosis,460
12622,multiple sclerosis patient,288
17024,relapsing multiple sclerosis,178
14878,people multiple sclerosis,172
16110,progressive multiple sclerosis,167
17040,relapsingremitting multiple sclerosis,167
17171,remitting multiple sclerosis,132
17032,relapsing remitting multiple,131
19913,study evaluate efficacy,86
6861,evaluate efficacy safety,84


In [35]:
# removing terms 'multiple' and 'sclerosis' from documents_processed column (text) of trials dataframe
trials['documents_processed_filtered'] = trials['documents_processed'].apply(lambda x: x.replace('multiple sclerosis', ''))

In [36]:
# single word frequency on new column
n_gram_frequency(trials, 'documents_processed_filtered', ngram_range=(1, 1), col_name='unigram').sort_values(by='frequency', ascending=False).head(20)

Unnamed: 0,unigram,frequency
3487,patient,1485
4619,study,1348
1385,effect,788
4244,safety,692
1395,efficacy,544
4962,treatment,480
4043,relapsing,403
1568,evaluate,298
4836,therapy,286
3930,randomized,258


In [37]:
# bigram frequency on new column
n_gram_frequency(trials, 'documents_processed_filtered', ngram_range=(2, 2), col_name='bigram').sort_values(by='frequency', ascending=False).head(20)

Unnamed: 0,bigram,frequency
5170,efficacy safety,273
15374,study evaluate,247
14350,safety tolerability,165
13576,relapsing remitting,141
13015,quality life,131
14257,safety efficacy,123
11618,patient relapsing,121
13102,randomized doubleblind,101
5503,evaluate efficacy,94
15285,study ass,81


In [38]:
# trigram frequency on new column
n_gram_frequency(trials, 'documents_processed_filtered', ngram_range=(3, 3), col_name='trigram').sort_values(by='frequency', ascending=False).head(20)

Unnamed: 0,trigram,frequency
18179,study evaluate efficacy,86
6482,evaluate efficacy safety,84
18211,study evaluate safety,58
6055,efficacy safety tolerability,52
15248,quality life patient,43
15385,randomized doubleblind placebocontrolled,42
11540,multicenter randomized doubleblind,42
13497,patient relapsing remitting,40
18162,study efficacy safety,39
15899,relapsing remitting rrms,33


## TF-IDF Vectorisation - tokens with highest value per document

### Single token TF-IDF

In [116]:
tfidf_top_features = TfidfTopFeatures(ngram_range=(1, 1), top_n=3)
tfidf_top_features.fit_transform(trials['documents_processed'])

In [117]:
top_features_per_document = tfidf_top_features.get_top_features_per_document()
print(top_features_per_document.top1_feature.value_counts().head(20))

top1_feature
exercise              11
simvastatin           10
ofatumumab            10
ocrelizumab           10
de                    10
direct                 9
rituximab              9
pediatric              9
balance                8
extension              8
higher                 8
remibrutinib           8
mg                     8
fingolimod             8
fenebrutinib           7
telerehabilitation     7
sexual                 7
supplementation        7
progressive            7
mesenchymal            6
Name: count, dtype: int64


In [118]:
# top 3 features cumulated
top_features_counts = tfidf_top_features.get_top_features_counts()
print(top_features_counts.head(20))

sclerosis             138
effect                 82
study                  69
patient                65
treatment              58
exercise               57
fatigue                49
aac                    48
efficacy               45
training               44
progressive            43
natalizumab            37
extension              35
relapsing              34
cognitive              33
rehabilitation         33
ocrelizumab            33
relapsingremitting     32
person                 32
ofatumumab             32
Name: count, dtype: int64


### Bi-Gram TF-IDF

In [121]:
tfidf_top_bigrams = TfidfTopFeatures(ngram_range=(2, 2), top_n=3)
tfidf_top_bigrams.fit_transform(trials['documents_processed_filtered'])

In [122]:
# top 1 features per document
top_features_per_document = tfidf_top_bigrams.get_top_features_per_document()
print(top_features_per_document.top1_feature.value_counts().head(20))

top1_feature
aac device                    32
safety patient                 7
tolerability patient           6
ofatumumab compared            5
dose ocrelizumab               5
doubleblind noninferiority     4
progressive perseus            4
efficacy patient               4
arm multicenter                3
patient walking                3
adaptive design                3
safety fingolimod              3
ocrelizumab treatment          3
fingolimod capsule             3
remibrutinib versus            3
familial amyot                 3
doubleblinded phase            3
comparing teriflunomide        3
followed open                  3
study patient                  3
Name: count, dtype: int64


In [124]:
# top 3 feature cumulated
top_features_counts = tfidf_top_bigrams.get_top_features_counts()
print(top_features_counts.head(20))

aac device                    530
aan glatirameer               180
aanvalsgewijze multipele       32
study patient                  13
patient relapsing              11
stem cell                      10
patient relapsingremitting      9
safety efficacy                 9
therapy patient                 9
safety patient                  9
dose ocrelizumab                8
fenebrutinib compared           8
fatigue patient                 8
efficacy patient                8
tolerability patient            8
mesenchymal stem                8
cognitive impairment            8
training patient                8
extension study                 7
efficacy safety                 7
Name: count, dtype: int64


### Tri-gram TF-IDF

In [125]:
tfidf_top_trigrams = TfidfTopFeatures(ngram_range=(3, 3), top_n=3)
tfidf_top_trigrams.fit_transform(trials['documents_processed_filtered'])

In [126]:
# top 1 features per document
top_features_per_document = tfidf_top_trigrams.get_top_features_per_document()
print(top_features_per_document.top1_feature.value_counts().head(20))

top1_feature
aac device intervention                      180
compared teriflunomide patient                 5
placebo participant primary                    4
doubleblind noninferiority study               4
safety patient relapsing                       4
arm multicenter extension                      3
participant relapsing followed                 3
adaptive design study                          3
tolerability patient progressive               3
control disease ponesimod                      3
effectiveness patientreported outcomespro      3
participant nonrelapsing secondary             3
ocrelizumab treatment patient                  3
follow study look                              3
comparing teriflunomide participant            3
controlled study evaluating                    3
mesenchymal stem cell                          3
doubleblinded phase study                      3
compared subject relapsing                     3
extension rollover study                       3
Name: c

In [127]:
# top 3 feature cumulated
top_features_counts = tfidf_top_trigrams.get_top_features_counts()
print(top_features_counts.head(20))

aac device intervention              993
aan glatirameer acetaat              530
aanvalsgewijze multipele sclerose    180
mesenchymal stem cell                  8
safety tolerability patient            8
dose ocrelizumab adult                 6
safety efficacy patient                5
higher dose ocrelizumab                5
efficacy safety patient                5
safety ofatumumab compared             5
compared teriflunomide patient         5
ofatumumab compared teriflunomide      5
primary progressive perseus            4
stem cell transplantation              4
placebo participant primary            4
ocrelizumab adult relapsing            4
comparing placebo participant          4
participant relapsing form             4
safety remibrutinib versus             4
study evobrutinib participant          4
Name: count, dtype: int64


## LDA Topic Modeling

In [186]:
lda_model = LDAModel(n_components=10, ngram_range=(2,3), n_top_words=5)
lda_model.fit_transform(trials['documents_processed_filtered'], trials['title'])
df_document_topic = lda_model.get_document_topic_df()
topics = lda_model.describe_topics()

In [187]:
lda_model.display_topics()

Topic 0:
relapsing remitting|study evaluate|efficacy safety|patient relapsing|ocrelizumab adult
Topic 1:
efficacy safety|relapsing remitting|quality life|randomized doubleblind|safety efficacy
Topic 2:
relapsing remitting|efficacy safety|quality life|controlled trial|randomized controlled
Topic 3:
efficacy safety|study evaluate|safety tolerability|evaluate efficacy|evaluate efficacy safety
Topic 4:
relapsing remitting|study ass|patient relapsing|efficacy safety|quality life
Topic 5:
efficacy safety|study evaluate|direct current|direct current stimulation|current stimulation
Topic 6:
quality life|physical activity|pilot study|open label|observational study
Topic 7:
study evaluate|evaluate safety|study evaluate safety|safety efficacy|doubleblind placebocontrolled
Topic 8:
study evaluate|relapsing remitting|safety tolerability|efficacy safety|quality life
Topic 9:
quality life|patient relapsing|relapsing form|btk inhibitor|safety tolerability


In [182]:
df_document_topic

Unnamed: 0_level_0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Development, Reliability and Validity of the Telerehabilitation Usability Questionnaire- TrUQ",0.92,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0
Impact of the Cionic Neural Sleeve on Mobility in Multiple Sclerosis,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.89,0.01,0.01,7
Investigation The Effect of Conventional Vs. Individualized tDCS Intensity to Achieve Uniform E-Fields,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.94,0.01,0.01,7
Cladribine vs Placebo for Non-active Progressive Multiple Sclerosis (CLASP-MS).,0.01,0.01,0.01,0.01,0.89,0.01,0.01,0.01,0.01,0.01,4
Montpellier PROspective Cohort in Relapsing Remitting Multiple Sclerosis Using Imaging and Serologic,0.01,0.01,0.01,0.01,0.01,0.94,0.01,0.01,0.01,0.01,5
...,...,...,...,...,...,...,...,...,...,...,...
"A 2-year randomized, 3-arm, double-blind, non-inferiority study comparing the efficacy and safety of ofatumumab and siponimod versus fingolimod in pediatric patients with multiple sclerosis followe...",0.00,0.97,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1
Antibody Response to COVID-19 Vaccines in Liver Disease Patients,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.91,0.01,8
"Study to Evaluate the Safety, Tolerability, and Immunogenicity of SARS CoV-2 RNA Vaccine Candidate (BNT162b2) Against COVID-19 in Healthy Pregnant Women 18 Years of Age and Older",0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.97,0.00,0.00,7
National Cohort Study of Effectiveness and Safety of SARS-CoV-2&#x2F;COVID-19 Vaccines (ENFORCE),0.01,0.01,0.01,0.92,0.01,0.01,0.01,0.01,0.01,0.01,3
