In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import os

import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from nltk.util import ngrams
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from wordcloud import WordCloud
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
tqdm.pandas()

# Relative n-grams frequencies

In this notebook we identify words that appear especially often in one class and only rarely in the other. To compute relative frequencies, we first count, for each class, the number of documents in which an n-gram appears, and express that count as a fraction of all documents of that class.

After that, the absolute difference between the appearance ratios of the two classes is calculated, and the n-grams with the highest difference scores are printed out.

This allows us to see the specific words / bi-grams / tri-grams that appear mostly or only in one of the classes.



**Limitations:**
- The current low number of labeled samples may make the results unrepresentative, given the large amount of unlabeled data
- Imbalance between classes -- the majority of labeled observations belong to the "relevant" class. The "non-relevant" class is underrepresented; therefore, we cannot be sure about the results of this analysis. The risk is that the calculated word frequencies are not representative of the population.


## Loading Data

In [3]:
# Folder that contains the exported CSV snapshots. It defaults to the original
# author's local folder; set the PBL_EDA_DIR environment variable to run the
# notebook on another machine without editing this cell.
EDA_DIR = os.environ.get('PBL_EDA_DIR', '/Users/kuba/Desktop/PBL/EDA')

articles_path = os.path.join(EDA_DIR, 'articles', 'articles_01-01-2024_14h39m35s.csv')
trials_path = os.path.join(EDA_DIR, 'trials', 'trials_01-01-2024_14h39m35s.csv')

# index_col=0: use the first CSV column as the row index of each frame
articles = pd.read_csv(articles_path, index_col=0)
trials = pd.read_csv(trials_path, index_col=0)

## Utilities

In [4]:
def n_gram_frequency(df, text_column, ngram_range=(2, 2), col_name='n-gram'):
    """Count, for every n-gram, the number of documents that contain it.

    Parameters
    ----------
    df : DataFrame holding one document per row.
    text_column : name of the column of `df` that holds the document text.
    ngram_range : (min_n, max_n) tuple passed straight to CountVectorizer.
    col_name : name given to the n-gram column of the returned frame.

    Returns
    -------
    DataFrame with a default integer index and two columns: `col_name`
    (the n-gram) and 'frequency' (number of documents in which it occurs).
    """
    # binary=True caps every per-document count at 1, so the column sums
    # below are document counts rather than total occurrence counts.
    counter = CountVectorizer(ngram_range=ngram_range, binary=True)
    doc_term = counter.fit_transform(df[text_column])

    # The column sums come back as a single-row matrix; flatten it to 1-D.
    doc_counts = np.asarray(doc_term.sum(axis=0)).ravel()

    return pd.DataFrame({
        col_name: counter.get_feature_names_out(),
        'frequency': doc_counts,
    })

## Preprocessing

In [5]:
# keep only the articles that have both a summary and a relevance label
articles.dropna(subset=['summary', 'relevant'], inplace=True)

In [6]:
# For summaries that contain the marker word "abstract" (in any casing), keep
# only the lower-cased text that follows its FIRST occurrence; all other
# summaries are left untouched.
# maxsplit=1 matters: an unbounded split would return only the piece between
# the first and the second "abstract" and silently drop the rest of the text.
articles['summary'] = articles['summary'].apply(
    lambda x: x.lower().split('abstract', 1)[1] if 'abstract' in x.lower() else x
)

In [7]:
def _lemma_resources():
    """Return the (English stop-word set, lemmatizer) pair, built on first use."""
    cached = getattr(_lemma_resources, '_cached', None)
    if cached is None:
        cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _lemma_resources._cached = cached
    return cached


def preprocess_lemma(text):
    """Normalize text into a space-separated string of lemmatized words.

    Parameters
    ----------
    text : str or list of str
        A single document, or a list of strings whose tokens are concatenated.

    Returns
    -------
    str
        The lower-cased tokens that are purely alphabetic and are not English
        stop words, each lemmatized with WordNetLemmatizer, joined by single
        spaces.

    Notes
    -----
    * For a string input, punctuation characters are deleted (not replaced by
      a space) before tokenizing, so "blood-brain" becomes "bloodbrain".
    * For a list input, punctuation is not stripped beforehand; tokens that
      are not purely alphabetic are simply dropped by the `isalpha` filter.
    * The stop-word test is applied to the raw token, before lemmatization.
    """
    # The stop-word list and the lemmatizer are the same for every document,
    # so they are built once and reused instead of being rebuilt on each call
    # (this function is applied to every row of the data).
    stop_words, lemmatizer = _lemma_resources()

    if isinstance(text, list):
        words = [word for sublist in text for word in word_tokenize(sublist.lower())]
    else:
        text = text.translate(str.maketrans('', '', string.punctuation))
        words = word_tokenize(text.lower())

    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stop_words and word.isalpha()
    )

In [8]:
# replace every summary with its cleaned, lemmatized form
articles['summary'] = articles['summary'].apply(preprocess_lemma)

## Single word relative frequency analysis

In [9]:
# Split the labelled articles once; the class sizes are the denominators below.
# NOTE(review): `== True` / `== False` are kept on purpose -- the dtype of
# `relevant` is not visible here; confirm it is boolean before simplifying.
relevant_articles = articles[articles.relevant == True]
non_relevant_articles = articles[articles.relevant == False]

# number of documents containing each word: overall and per class
word_freq_df = n_gram_frequency(articles, 'summary', ngram_range=(1, 1), col_name='word')
word_freq_rel_df = n_gram_frequency(relevant_articles, 'summary', ngram_range=(1, 1), col_name='word')
word_freq_nrel_df = n_gram_frequency(non_relevant_articles, 'summary', ngram_range=(1, 1), col_name='word')

# attach the per-class counts to the overall table
word_freq_df = word_freq_df.merge(
    word_freq_rel_df.rename(columns={"frequency": "freq_relevant"}),
    on='word',
    how='left',
)
word_freq_df = word_freq_df.merge(
    word_freq_nrel_df.rename(columns={"frequency": "freq_non_relevant"}),
    on='word',
    how='left',
)

# a word that is absent from one class has a count of 0 there
word_freq_df.fillna(0, inplace=True)

# share of each class's documents that contain the word
word_freq_df['freq_relevant_ratio'] = word_freq_df['freq_relevant'] / relevant_articles.shape[0]
word_freq_df['freq_non_relevant_ratio'] = word_freq_df['freq_non_relevant'] / non_relevant_articles.shape[0]

# size of the gap between the two shares, and which class has the larger one
# (ties are labelled False)
word_freq_df['diff'] = (word_freq_df['freq_relevant_ratio'] - word_freq_df['freq_non_relevant_ratio']).abs()
word_freq_df['relevant_class'] = word_freq_df['freq_relevant_ratio'] > word_freq_df['freq_non_relevant_ratio']


In [10]:
# words over-represented in the relevant class, largest ratio gap first
word_freq_df.loc[word_freq_df['relevant_class']].sort_values('diff', ascending=False).head(20)

Unnamed: 0,word,frequency,freq_relevant,freq_non_relevant,freq_relevant_ratio,freq_non_relevant_ratio,diff,relevant_class
1688,cell,450,419.0,31.0,0.488915,0.143519,0.345396,True
7186,mouse,279,271.0,8.0,0.316219,0.037037,0.279182,True
3497,effect,432,391.0,41.0,0.456243,0.189815,0.266428,True
10378,sclerosis,783,669.0,114.0,0.78063,0.527778,0.252852,True
4051,experimental,231,228.0,3.0,0.266044,0.013889,0.252155,True
949,autoimmune,297,280.0,17.0,0.326721,0.078704,0.248017,True
7324,multiple,799,680.0,119.0,0.793466,0.550926,0.24254,True
7553,nervous,257,245.0,12.0,0.285881,0.055556,0.230325,True
2877,demyelination,210,207.0,3.0,0.24154,0.013889,0.227651,True
1727,central,268,253.0,15.0,0.295216,0.069444,0.225771,True


In [12]:
# words over-represented in the non-relevant class, largest ratio gap first
word_freq_df.loc[~word_freq_df['relevant_class']].sort_values('diff', ascending=False).head(20)

Unnamed: 0,word,frequency,freq_relevant,freq_non_relevant,freq_relevant_ratio,freq_non_relevant_ratio,diff,relevant_class
1017,background,208,136.0,72.0,0.158693,0.333333,0.17464,False
8441,patient,382,275.0,107.0,0.320887,0.49537,0.174484,False
1619,case,69,33.0,36.0,0.038506,0.166667,0.12816,False
281,age,93,54.0,39.0,0.063011,0.180556,0.117545,False
5497,included,96,57.0,39.0,0.066511,0.180556,0.114044,False
3023,diagnosis,48,21.0,27.0,0.024504,0.125,0.100496,False
5581,infection,52,25.0,27.0,0.029172,0.125,0.095828,False
450,among,84,51.0,33.0,0.05951,0.152778,0.093268,False
1596,care,30,8.0,22.0,0.009335,0.101852,0.092517,False
5022,higher,102,66.0,36.0,0.077013,0.166667,0.089654,False


### Which words appear very often in only one class?

In [16]:
# relevant class: words never seen in a non-relevant article
word_freq_df.loc[
    word_freq_df['relevant_class'] & word_freq_df['freq_non_relevant'].eq(0)
].sort_values('diff', ascending=False).head(20)

Unnamed: 0,word,frequency,freq_relevant,freq_non_relevant,freq_relevant_ratio,freq_non_relevant_ratio,diff,relevant_class
5596,infiltration,66,66.0,0.0,0.077013,0.0,0.077013,True
1001,axon,64,64.0,0.0,0.074679,0.0,0.074679,True
2611,cuprizone,56,56.0,0.0,0.065344,0.0,0.065344,True
9275,promotes,43,43.0,0.0,0.050175,0.0,0.050175,True
10629,sheath,37,37.0,0.0,0.043174,0.0,0.043174,True
7606,neuroinflammatory,37,37.0,0.0,0.043174,0.0,0.043174,True
365,alleviated,36,36.0,0.0,0.042007,0.0,0.042007,True
1322,bloodbrain,34,34.0,0.0,0.039673,0.0,0.039673,True
9224,progenitor,33,33.0,0.0,0.038506,0.0,0.038506,True
1062,balance,33,33.0,0.0,0.038506,0.0,0.038506,True


In [17]:
# non-relevant class: words never seen in a relevant article
word_freq_df.loc[
    ~word_freq_df['relevant_class'] & word_freq_df['freq_relevant'].eq(0)
].sort_values('diff', ascending=False).head(20)

Unnamed: 0,word,frequency,freq_relevant,freq_non_relevant,freq_relevant_ratio,freq_non_relevant_ratio,diff,relevant_class
8198,osteoarthritis,6,0.0,6.0,0.0,0.027778,0.027778,False
9210,professional,6,0.0,6.0,0.0,0.027778,0.027778,False
12328,vaccinated,5,0.0,5.0,0.0,0.023148,0.023148,False
8328,palliative,5,0.0,5.0,0.0,0.023148,0.023148,False
7766,nodule,5,0.0,5.0,0.0,0.023148,0.023148,False
895,attending,4,0.0,4.0,0.0,0.018519,0.018519,False
1371,bony,4,0.0,4.0,0.0,0.018519,0.018519,False
8620,personal,4,0.0,4.0,0.0,0.018519,0.018519,False
11068,stay,4,0.0,4.0,0.0,0.018519,0.018519,False
9743,recurrence,4,0.0,4.0,0.0,0.018519,0.018519,False


### Most frequent words in each class (absolute frequency)

In [39]:
# relevant class: words found in the largest number of relevant articles
(
    word_freq_df.loc[word_freq_df['relevant_class'], ['word', 'freq_relevant']]
    .sort_values('freq_relevant', ascending=False)
    .head(20)
    .reset_index(drop=True)
)

Unnamed: 0,word,freq_relevant
0,multiple,680.0
1,sclerosis,669.0
2,disease,493.0
3,study,467.0
4,cell,419.0
5,treatment,392.0
6,effect,391.0
7,model,308.0
8,system,293.0
9,therapeutic,284.0


In [40]:
# non-relevant class: words found in the largest number of non-relevant articles
(
    word_freq_df.loc[~word_freq_df['relevant_class'], ['word', 'freq_non_relevant']]
    .sort_values('freq_non_relevant', ascending=False)
    .head(20)
    .reset_index(drop=True)
)

Unnamed: 0,word,freq_non_relevant
0,patient,107.0
1,background,72.0
2,analysis,52.0
3,may,50.0
4,associated,49.0
5,significant,45.0
6,conclusion,44.0
7,data,43.0
8,compared,43.0
9,including,42.0


## Bi-grams relative frequency analysis

In [18]:
# number of articles (both classes together) that contain each bi-gram
bi_gram_df = n_gram_frequency(articles, 'summary', ngram_range=(2, 2), col_name='bi-gram')

In [19]:
# number of relevant articles that contain each bi-gram
bi_gram_rel_df = n_gram_frequency(
    articles.loc[articles.relevant == True], 'summary', ngram_range=(2, 2), col_name='bi-gram'
)

In [20]:
# number of non-relevant articles that contain each bi-gram
bi_gram_nrel_df = n_gram_frequency(
    articles.loc[articles.relevant == False], 'summary', ngram_range=(2, 2), col_name='bi-gram'
)

In [21]:
# attach the per-class document counts to the overall bi-gram table
bi_gram_df = bi_gram_df.merge(
    bi_gram_rel_df.rename(columns={"frequency": "freq_relevant"}),
    on='bi-gram',
    how='left',
)
bi_gram_df = bi_gram_df.merge(
    bi_gram_nrel_df.rename(columns={"frequency": "freq_non_relevant"}),
    on='bi-gram',
    how='left',
)

# a bi-gram that is absent from one class has a count of 0 there
bi_gram_df.fillna(0, inplace=True)

# class sizes are the denominators of the ratios
n_relevant_articles = articles[articles.relevant == True].shape[0]
n_non_relevant_articles = articles[articles.relevant == False].shape[0]

# share of each class's documents that contain the bi-gram
bi_gram_df['freq_relevant_ratio'] = bi_gram_df['freq_relevant'] / n_relevant_articles
bi_gram_df['freq_non_relevant_ratio'] = bi_gram_df['freq_non_relevant'] / n_non_relevant_articles

# size of the gap between the two shares, and which class has the larger one
# (ties are labelled False)
bi_gram_df['diff'] = (bi_gram_df['freq_relevant_ratio'] - bi_gram_df['freq_non_relevant_ratio']).abs()
bi_gram_df['relevant_class'] = bi_gram_df['freq_relevant_ratio'] > bi_gram_df['freq_non_relevant_ratio']


In [22]:
# bi-grams over-represented in the relevant class, largest ratio gap first
bi_gram_df.loc[bi_gram_df['relevant_class']].sort_values('diff', ascending=False).head(20)

Unnamed: 0,bi-gram,frequency,freq_relevant,freq_non_relevant,freq_relevant_ratio,freq_non_relevant_ratio,diff,relevant_class
50317,multiple sclerosis,752,656.0,96.0,0.765461,0.444444,0.321016,True
11220,central nervous,243,233.0,10.0,0.271879,0.046296,0.225582,True
51451,nervous system,251,239.0,12.0,0.27888,0.055556,0.223324,True
27914,experimental autoimmune,188,186.0,2.0,0.217036,0.009259,0.207777,True
6749,autoimmune encephalomyelitis,179,178.0,1.0,0.207701,0.00463,0.203072,True
25665,encephalomyelitis eae,149,148.0,1.0,0.172695,0.00463,0.168066,True
78011,system cns,128,125.0,3.0,0.145858,0.013889,0.131969,True
23864,eae mouse,96,95.0,1.0,0.110852,0.00463,0.106222,True
74445,spinal cord,125,117.0,8.0,0.136523,0.037037,0.099486,True
4205,animal model,95,91.0,4.0,0.106184,0.018519,0.087666,True


In [23]:
# bi-grams over-represented in the non-relevant class, largest ratio gap first
bi_gram_df.loc[~bi_gram_df['relevant_class']].sort_values('diff', ascending=False).head(20)

Unnamed: 0,bi-gram,frequency,freq_relevant,freq_non_relevant,freq_relevant_ratio,freq_non_relevant_ratio,diff,relevant_class
70744,sclerosis pwms,22,8.0,14.0,0.009335,0.064815,0.05548,False
57364,people multiple,28,14.0,14.0,0.016336,0.064815,0.048479,False
43731,logistic regression,14,4.0,10.0,0.004667,0.046296,0.041629,False
33605,healthy control,29,16.0,13.0,0.01867,0.060185,0.041515,False
78198,systematic review,21,10.0,11.0,0.011669,0.050926,0.039257,False
10054,case presentation,8,0.0,8.0,0.0,0.037037,0.037037,False
67823,respiratory syndrome,13,4.0,9.0,0.004667,0.041667,0.036999,False
1392,acute respiratory,15,6.0,9.0,0.007001,0.041667,0.034665,False
16047,coronavirus disease,11,3.0,8.0,0.003501,0.037037,0.033536,False
72112,severe acute,12,4.0,8.0,0.004667,0.037037,0.03237,False


### Which bi-grams appear only in one class?

In [24]:
# relevant class: bi-grams never seen in a non-relevant article
bi_gram_df.loc[
    bi_gram_df['relevant_class'] & bi_gram_df['freq_non_relevant'].eq(0)
].sort_values('diff', ascending=False).head(20)

Unnamed: 0,bi-gram,frequency,freq_relevant,freq_non_relevant,freq_relevant_ratio,freq_non_relevant_ratio,diff,relevant_class
23858,eae model,41,41.0,0.0,0.047841,0.0,0.047841,True
46600,mesenchymal stem,36,36.0,0.0,0.042007,0.0,0.042007,True
50643,myelin sheath,36,36.0,0.0,0.042007,0.0,0.042007,True
10672,cell infiltration,35,35.0,0.0,0.04084,0.0,0.04084,True
8748,bloodbrain barrier,34,34.0,0.0,0.039673,0.0,0.039673,True
50606,myelin oligodendrocyte,33,33.0,0.0,0.038506,0.0,0.038506,True
70425,sclerosis autoimmune,30,30.0,0.0,0.035006,0.0,0.035006,True
10759,cell msc,30,30.0,0.0,0.035006,0.0,0.035006,True
50634,myelin repair,29,29.0,0.0,0.033839,0.0,0.033839,True
10801,cell opcs,28,28.0,0.0,0.032672,0.0,0.032672,True


In [25]:
# non-relevant class: bi-grams never seen in a relevant article
bi_gram_df.loc[
    ~bi_gram_df['relevant_class'] & bi_gram_df['freq_relevant'].eq(0)
].sort_values('diff', ascending=False).head(20)

Unnamed: 0,bi-gram,frequency,freq_relevant,freq_non_relevant,freq_relevant_ratio,freq_non_relevant_ratio,diff,relevant_class
10054,case presentation,8,0.0,8.0,0.0,0.037037,0.037037,False
34096,higher risk,6,0.0,6.0,0.0,0.027778,0.027778,False
55571,palliative care,5,0.0,5.0,0.0,0.023148,0.023148,False
14347,computed tomography,4,0.0,4.0,0.0,0.018519,0.018519,False
62357,prospective cohort,4,0.0,4.0,0.0,0.018519,0.018519,False
82656,tuberous sclerosis,4,0.0,4.0,0.0,0.018519,0.018519,False
84600,vaccine dose,3,0.0,3.0,0.0,0.013889,0.013889,False
31173,fully vaccinated,3,0.0,3.0,0.0,0.013889,0.013889,False
79928,third vaccine,3,0.0,3.0,0.0,0.013889,0.013889,False
12479,clinical diagnosis,3,0.0,3.0,0.0,0.013889,0.013889,False


### Most frequent bi-grams in each class (absolute frequency)

In [41]:
# relevant class: bi-grams found in the largest number of relevant articles
(
    bi_gram_df.loc[bi_gram_df['relevant_class'], ['bi-gram', 'freq_relevant']]
    .sort_values('freq_relevant', ascending=False)
    .head(20)
    .reset_index(drop=True)
)

Unnamed: 0,bi-gram,freq_relevant
0,multiple sclerosis,656.0
1,nervous system,239.0
2,central nervous,233.0
3,experimental autoimmune,186.0
4,autoimmune encephalomyelitis,178.0
5,encephalomyelitis eae,148.0
6,system cns,125.0
7,spinal cord,117.0
8,eae mouse,95.0
9,autoimmune disease,92.0


In [42]:
# non-relevant class: bi-grams found in the largest number of non-relevant articles
(
    bi_gram_df.loc[~bi_gram_df['relevant_class'], ['bi-gram', 'freq_non_relevant']]
    .sort_values('freq_non_relevant', ascending=False)
    .head(20)
    .reset_index(drop=True)
)

Unnamed: 0,bi-gram,freq_non_relevant
0,people multiple,14.0
1,sclerosis pwms,14.0
2,quality life,13.0
3,healthy control,13.0
4,significant difference,12.0
5,systematic review,11.0
6,adverse event,10.0
7,logistic regression,10.0
8,age year,10.0
9,disability status,9.0


## Tri-Gram relative frequencies

In [27]:
# Split the labelled articles once and count, per subset, the number of
# documents that contain each tri-gram.
tri_relevant_articles = articles[articles.relevant == True]
tri_non_relevant_articles = articles[articles.relevant == False]

tri_gram_df = n_gram_frequency(articles, 'summary', ngram_range=(3, 3), col_name='tri-gram')
tri_gram_rel_df = n_gram_frequency(tri_relevant_articles, 'summary', ngram_range=(3, 3), col_name='tri-gram')
tri_gram_nrel_df = n_gram_frequency(tri_non_relevant_articles, 'summary', ngram_range=(3, 3), col_name='tri-gram')

In [28]:
# attach the per-class document counts to the overall tri-gram table
tri_gram_df = tri_gram_df.merge(
    tri_gram_rel_df.rename(columns={"frequency": "freq_relevant"}),
    on='tri-gram',
    how='left',
)
tri_gram_df = tri_gram_df.merge(
    tri_gram_nrel_df.rename(columns={"frequency": "freq_non_relevant"}),
    on='tri-gram',
    how='left',
)

# a tri-gram that is absent from one class has a count of 0 there
tri_gram_df.fillna(0, inplace=True)

# class sizes are the denominators of the ratios
n_relevant_docs = articles[articles.relevant == True].shape[0]
n_non_relevant_docs = articles[articles.relevant == False].shape[0]

# share of each class's documents that contain the tri-gram
tri_gram_df['freq_relevant_ratio'] = tri_gram_df['freq_relevant'] / n_relevant_docs
tri_gram_df['freq_non_relevant_ratio'] = tri_gram_df['freq_non_relevant'] / n_non_relevant_docs

# size of the gap between the two shares, and which class has the larger one
# (ties are labelled False)
tri_gram_df['diff'] = (tri_gram_df['freq_relevant_ratio'] - tri_gram_df['freq_non_relevant_ratio']).abs()
tri_gram_df['relevant_class'] = tri_gram_df['freq_relevant_ratio'] > tri_gram_df['freq_non_relevant_ratio']


In [29]:
# tri-grams over-represented in the relevant class, largest ratio gap first
tri_gram_df.loc[tri_gram_df['relevant_class']].sort_values('diff', ascending=False).head(20)

Unnamed: 0,tri-gram,frequency,freq_relevant,freq_non_relevant,freq_relevant_ratio,freq_non_relevant_ratio,diff,relevant_class
14595,central nervous system,238,228.0,10.0,0.266044,0.046296,0.219748,True
36427,experimental autoimmune encephalomyelitis,176,175.0,1.0,0.204201,0.00463,0.199571,True
8461,autoimmune encephalomyelitis eae,148,147.0,1.0,0.171529,0.00463,0.166899,True
66949,nervous system cns,128,125.0,3.0,0.145858,0.013889,0.131969,True
28168,disease central nervous,61,59.0,2.0,0.068845,0.009259,0.059586,True
62163,model multiple sclerosis,46,46.0,0.0,0.053676,0.0,0.053676,True
59985,mesenchymal stem cell,34,34.0,0.0,0.039673,0.0,0.039673,True
33700,encephalomyelitis eae mouse,30,30.0,0.0,0.035006,0.0,0.035006,True
64924,multiple sclerosis autoimmune,29,29.0,0.0,0.033839,0.0,0.033839,True
28541,disease multiple sclerosis,65,57.0,8.0,0.066511,0.037037,0.029474,True


In [30]:
# tri-grams over-represented in the non-relevant class, largest ratio gap first
tri_gram_df.loc[~tri_gram_df['relevant_class']].sort_values('diff', ascending=False).head(20)

Unnamed: 0,tri-gram,frequency,freq_relevant,freq_non_relevant,freq_relevant_ratio,freq_non_relevant_ratio,diff,relevant_class
65230,multiple sclerosis pwms,22,8.0,14.0,0.009335,0.064815,0.05548,False
75029,people multiple sclerosis,27,13.0,14.0,0.015169,0.064815,0.049646,False
1623,acute respiratory syndrome,12,4.0,8.0,0.004667,0.037037,0.03237,False
93582,severe acute respiratory,12,4.0,8.0,0.004667,0.037037,0.03237,False
87938,respiratory syndrome coronavirus,9,2.0,7.0,0.002334,0.032407,0.030074,False
11191,body mass index,5,1.0,4.0,0.001167,0.018519,0.017352,False
96632,spectrum disorder nmosd,5,1.0,4.0,0.001167,0.018519,0.017352,False
82609,quality life qol,7,3.0,4.0,0.003501,0.018519,0.015018,False
104131,third vaccine dose,3,0.0,3.0,0.0,0.013889,0.013889,False
56371,logistic regression analysis,3,0.0,3.0,0.0,0.013889,0.013889,False


### Which tri-grams appear only in one class?

In [31]:
# relevant class: tri-grams never seen in a non-relevant article
tri_gram_df.loc[
    tri_gram_df['relevant_class'] & tri_gram_df['freq_non_relevant'].eq(0)
].sort_values('diff', ascending=False).head(20)

Unnamed: 0,tri-gram,frequency,freq_relevant,freq_non_relevant,freq_relevant_ratio,freq_non_relevant_ratio,diff,relevant_class
62163,model multiple sclerosis,46,46.0,0.0,0.053676,0.0,0.053676,True
59985,mesenchymal stem cell,34,34.0,0.0,0.039673,0.0,0.039673,True
33700,encephalomyelitis eae mouse,30,30.0,0.0,0.035006,0.0,0.035006,True
64924,multiple sclerosis autoimmune,29,29.0,0.0,0.033839,0.0,0.033839,True
65755,myelin oligodendrocyte glycoprotein,24,24.0,0.0,0.028005,0.0,0.028005,True
70714,oligodendrocyte progenitor cell,24,24.0,0.0,0.028005,0.0,0.028005,True
97613,stem cell msc,23,23.0,0.0,0.026838,0.0,0.026838,True
62044,model experimental autoimmune,23,23.0,0.0,0.026838,0.0,0.026838,True
76716,play important role,22,22.0,0.0,0.025671,0.0,0.025671,True
33698,encephalomyelitis eae model,22,22.0,0.0,0.025671,0.0,0.025671,True


In [32]:
# non-relevant class: tri-grams never seen in a relevant article
tri_gram_df.loc[
    ~tri_gram_df['relevant_class'] & tri_gram_df['freq_relevant'].eq(0)
].sort_values('diff', ascending=False).head(20)

Unnamed: 0,tri-gram,frequency,freq_relevant,freq_non_relevant,freq_relevant_ratio,freq_non_relevant_ratio,diff,relevant_class
56371,logistic regression analysis,3,0.0,3.0,0.0,0.013889,0.013889,False
63068,month second dose,3,0.0,3.0,0.0,0.013889,0.013889,False
104131,third vaccine dose,3,0.0,3.0,0.0,0.013889,0.013889,False
2069,adjusted age sex,3,0.0,3.0,0.0,0.013889,0.013889,False
101277,syndrome coronavirus vaccination,3,0.0,3.0,0.0,0.013889,0.013889,False
36744,expressed gene degs,3,0.0,3.0,0.0,0.013889,0.013889,False
101796,systematic review literature,3,0.0,3.0,0.0,0.013889,0.013889,False
53433,joanna briggs institute,2,0.0,2.0,0.0,0.009259,0.009259,False
88722,result study may,2,0.0,2.0,0.0,0.009259,0.009259,False
29894,dos mrna vaccine,2,0.0,2.0,0.0,0.009259,0.009259,False


### Most frequent tri-grams in each class (absolute frequency)

In [43]:
# relevant class: tri-grams found in the largest number of relevant articles
(
    tri_gram_df.loc[tri_gram_df['relevant_class'], ['tri-gram', 'freq_relevant']]
    .sort_values('freq_relevant', ascending=False)
    .head(20)
    .reset_index(drop=True)
)

Unnamed: 0,tri-gram,freq_relevant
0,central nervous system,228.0
1,experimental autoimmune encephalomyelitis,175.0
2,autoimmune encephalomyelitis eae,147.0
3,nervous system cns,125.0
4,disease central nervous,59.0
5,disease multiple sclerosis,57.0
6,model multiple sclerosis,46.0
7,patient multiple sclerosis,41.0
8,multiple sclerosis chronic,38.0
9,mesenchymal stem cell,34.0


In [44]:
# non-relevant class: tri-grams found in the largest number of non-relevant articles
(
    tri_gram_df.loc[~tri_gram_df['relevant_class'], ['tri-gram', 'freq_non_relevant']]
    .sort_values('freq_non_relevant', ascending=False)
    .head(20)
    .reset_index(drop=True)
)

Unnamed: 0,tri-gram,freq_non_relevant
0,multiple sclerosis pwms,14.0
1,people multiple sclerosis,14.0
2,severe acute respiratory,8.0
3,relapsingremitting multiple sclerosis,8.0
4,expanded disability status,8.0
5,acute respiratory syndrome,8.0
6,respiratory syndrome coronavirus,7.0
7,multiple sclerosis patient,5.0
8,showed significant difference,4.0
9,scale eds score,4.0
